From 907f315158343e70ae847344355c88bd40d8808e Mon Sep 17 00:00:00 2001 From: GustavoA1604 Date: Wed, 20 May 2026 14:36:33 -0300 Subject: [PATCH] tts-cpp: Add dynamic backend selection for android --- tts-cpp/CMakeLists.txt | 100 +++-- tts-cpp/include/tts-cpp/chatterbox/engine.h | 49 ++- tts-cpp/include/tts-cpp/supertonic/engine.h | 27 ++ tts-cpp/src/backend_selection.cpp | 394 ++++++++++++++++++++ tts-cpp/src/backend_selection.h | 90 +++++ tts-cpp/src/backend_util.h | 53 +++ tts-cpp/src/campplus.cpp | 1 + tts-cpp/src/campplus_forward.inc | 4 +- tts-cpp/src/chatterbox_cli.cpp | 26 +- tts-cpp/src/chatterbox_engine.cpp | 16 + tts-cpp/src/chatterbox_tts.cpp | 83 ++--- tts-cpp/src/main.cpp | 93 ++--- tts-cpp/src/mel_extract_stft.cpp | 8 +- tts-cpp/src/s3tokenizer.cpp | 13 +- tts-cpp/src/supertonic_engine.cpp | 13 + tts-cpp/src/supertonic_gguf.cpp | 68 ++-- tts-cpp/src/t3_mtl.cpp | 43 ++- tts-cpp/src/voice_encoder.cpp | 11 +- 18 files changed, 856 insertions(+), 236 deletions(-) create mode 100644 tts-cpp/src/backend_selection.cpp create mode 100644 tts-cpp/src/backend_selection.h create mode 100644 tts-cpp/src/backend_util.h diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt index 225897554ea..21cadbc8ae6 100644 --- a/tts-cpp/CMakeLists.txt +++ b/tts-cpp/CMakeLists.txt @@ -70,6 +70,55 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "Android") endif() endif() +# Android default backend stack: dynamic loading of Vulkan + OpenCL + +# per-arch CPU variants. Mirrors parakeet-cpp's same-repo sibling and +# the qvac llm-llamacpp Android config (see +# qvac-registry-vcpkg/ports/llama-cpp/portfile.cmake) so the tts-cpp +# Android prebuilds drop into the same `qvac__tts-ggml/` folder shape +# as the parakeet / llamacpp ones: a `.bare` module + sibling +# `libggml-{vulkan,opencl,cpu-android_armv*_*}.so` files that +# `ggml_backend_load_all_from_path()` discovers at runtime. 
+# +# Selection at runtime is centralised in +# `tts_cpp::detail::init_gpu_backend()` (src/backend_selection.cpp): +# OpenCL when an Adreno 700+ device is present, Vulkan for every +# other GPU (non-Adreno, Adreno < 700, Mali, Xclipse, ...). No +# static GPU backend entry points are linked anywhere in libtts-cpp; +# the registry walk reaches the right backend in both +# GGML_BACKEND_DL=ON (Android prebuild) and GGML_BACKEND_DL=OFF +# (desktop dev) modes. +# +# Callers that have specific reasons to deviate (e.g. a desktop +# bring-up build that wants Vulkan only) can still override any of +# these at the cmake command line; we only set defaults that haven't +# already been provided. +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + if (NOT DEFINED CACHE{GGML_BACKEND_DL}) + set(GGML_BACKEND_DL ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_CPU_ALL_VARIANTS}) + set(GGML_CPU_ALL_VARIANTS ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_CPU_REPACK}) + set(GGML_CPU_REPACK ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_VULKAN}) + set(GGML_VULKAN ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_OPENCL}) + set(GGML_OPENCL ON CACHE BOOL "" FORCE) + endif() + # ggml-vulkan's coopmat / coopmat2 shader compile pulls in + # extensions that most Android Vulkan drivers don't expose; the + # upstream llama Android build disables both for the same reason. + if (NOT DEFINED CACHE{GGML_VULKAN_DISABLE_COOPMAT}) + set(GGML_VULKAN_DISABLE_COOPMAT ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_VULKAN_DISABLE_COOPMAT2}) + set(GGML_VULKAN_DISABLE_COOPMAT2 ON CACHE BOOL "" FORCE) + endif() +endif() + # Two related workarounds for clang-cl / MSVC builds on Windows. Both # come from msys2 sneaking GCC-flavoured libraries onto CMake's search # paths and being mismatched against MSVC-compiled translation units. 
@@ -161,33 +210,28 @@ if (MSVC) add_compile_definitions(_USE_MATH_DEFINES _CRT_SECURE_NO_WARNINGS) endif() -# INTERFACE library that holds the GGML_USE_ compile defines -# every TU that includes ggml.h needs to dispatch correctly on the -# enabled backend. The tts-cpp library AND any test executable that -# recompiles src/chatterbox_tts.cpp / src/main.cpp from source (i.e. -# bypasses the tts-cpp link) must link against this; otherwise the -# #ifdef GGML_USE_ branches inside those TUs evaluate as -# undefined and the GPU code paths get silently compiled out of the -# test executable, even when the parent build did enable the backend. -# Mirrors parakeet-cpp's parakeet-backend-defs INTERFACE lib. +# Legacy interface library kept for export-set compatibility (it is +# still part of `install(EXPORT tts-cppTargets)` below and downstream +# `find_package(tts-cpp)` consumers list it as a link dep). Body +# intentionally empty: tts-cpp now routes every backend decision +# through the ggml-backend registry +# (`ggml_backend_load_all` + `ggml_backend_dev_*`, see +# `init_gpu_backend()` / `init_cpu_backend()` / `init_blas_backend()` +# in src/backend_selection.cpp) and does NOT call any +# `ggml_backend__init` / `ggml_backend_is_` entry +# point directly. The `GGML_USE_VULKAN` / `GGML_USE_OPENCL` / +# `GGML_USE_METAL` / `GGML_USE_CUDA` / `GGML_USE_BLAS` compile defines +# that used to live here were only consumed by `#ifdef` cascades that +# called those static entry points; with the registry-only design +# they're dead, and shipping them would falsely advertise a static +# backend dependency that the GGML_BACKEND_DL=ON Android/Linux builds +# explicitly do not have (their backends live in separately-loadable +# `.so` files that are dlopen()'d by `ggml_backend_load_all_from_path` +# at runtime). Mirrors parakeet-cpp's `parakeet-backend-defs`. 
add_library(tts-cpp-backend-defs INTERFACE) -if (GGML_CUDA) - target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_CUDA) -endif() -if (GGML_METAL) - target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_METAL) -endif() -if (GGML_VULKAN) - target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_VULKAN) -endif() -if (GGML_BLAS) - target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_BLAS) -endif() -if (GGML_OPENCL) - target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_OPENCL) -endif() set(TTS_CPP_LIB_SOURCES + src/backend_selection.cpp src/main.cpp src/chatterbox_cli.cpp src/gpt2_bpe.cpp @@ -594,7 +638,15 @@ if (TTS_CPP_BUILD_TESTS) tts_cpp_apply_ccache(test-metal-ops) # Metal-only kernel parity check. Useful only when built with # -DGGML_METAL=ON; skipped on CI fleets without Metal via `ctest -LE gpu`. + # GGML_USE_METAL is supplied locally here (rather than via + # tts-cpp-backend-defs) because the library itself no longer + # consumes the macro -- every #ifdef GGML_USE_ in src/ was + # removed alongside the registry-only refactor. The test still + # uses the macro to gate its direct ggml_backend_metal_init() + # call site (it's exercising the Metal-backend implementation + # directly, not going through tts-cpp's backend selection). if (GGML_METAL) + target_compile_definitions(test-metal-ops PRIVATE GGML_USE_METAL) tts_cpp_register_test(test-metal-ops LABEL "gpu") endif() diff --git a/tts-cpp/include/tts-cpp/chatterbox/engine.h b/tts-cpp/include/tts-cpp/chatterbox/engine.h index e60ef1db3b8..daef5e97c9e 100644 --- a/tts-cpp/include/tts-cpp/chatterbox/engine.h +++ b/tts-cpp/include/tts-cpp/chatterbox/engine.h @@ -75,12 +75,57 @@ struct EngineOptions { std::string voice_dir; // Backend selection. n_gpu_layers > 0 enables the first available - // GPU backend (CUDA → Metal → Vulkan → OpenCL in build order), falling - // back to the CPU backend when none is compiled in or initialisation fails. 
+ // GPU backend via the Adreno-tier policy: Adreno 700+ -> OpenCL, + // every other GPU (Vulkan on non-Adreno Android, Metal on Apple, + // CUDA on Linux/Windows desktop, Mali iGPU via Vulkan, ...) -> the + // non-OpenCL preference. Adreno 6xx OpenCL is force-skipped (broken + // kernels) unless `TTS_CPP_ALLOW_ADRENO_6XX=1` is set in the env. + // Falls back to the CPU backend when no GPU was requested, none is + // registered, or every candidate refused init. // The exact per-layer split is not used today; any positive value // moves the whole model to the GPU. int n_gpu_layers = 0; + // Directory to scan for dynamically-loaded ggml backends + // (`libspeech-ggml-vulkan.so`, `libspeech-ggml-opencl.so`, + // `libspeech-ggml-cpu-android_armv8.2_1.so`, ...). Forwarded to + // `ggml_backend_load_all_from_path()` on the first Engine + // construction in the process; subsequent constructions reuse the + // already-populated registry. + // + // Leave empty to fall back to ggml's default search path + // (`ggml_backend_load_all()`), which walks compile-time defaults + // (`$EXE_DIR`, `LD_LIBRARY_PATH`, ...). Embedded host applications + // built with `GGML_BACKEND_DL=ON` (the Android / Linux non-Apple + // default; see CMakeLists.txt) should pass an explicit dir + // because the .so files ship next to the host's binary in a + // platform-specific subfolder rather than on the system loader's + // path. + // + // No-op on builds where ggml is statically linked + // (`GGML_BACKEND_DL=OFF`, e.g. desktop dev cmake builds and the + // Apple xcframework). On those, every backend is registered at + // constructor time from inside libggml and no filesystem scan + // takes place. + std::string backends_dir; + + // Sets `$GGML_OPENCL_CACHE_DIR` before the first backend init so + // ggml-opencl persists `clCreateProgramWithBinary` blobs across + // process restarts (see the program-binary-cache patch on + // qvac-ext-ggml@speech). 
Strongly recommended on Android where + // the cold `clBuildProgram` cost dominates first-utterance + // latency; pass a writable per-app directory (typically the + // app's `cacheDir` from the host platform). + // + // Honoured only on `__ANDROID__` builds; ignored elsewhere + // (desktop OpenCL platforms don't ship the binary-cache patch + // and would otherwise pollute the user's tmpdir). + // + // Leave empty to keep the existing `$GGML_OPENCL_CACHE_DIR` env + // value (or no cache at all). Wrapper scripts that already + // export the env take precedence. + std::string opencl_cache_dir; + // 0 = std::thread::hardware_concurrency() (capped at 4 by default). int n_threads = 0; diff --git a/tts-cpp/include/tts-cpp/supertonic/engine.h b/tts-cpp/include/tts-cpp/supertonic/engine.h index b32e51fefc5..76bd692e516 100644 --- a/tts-cpp/include/tts-cpp/supertonic/engine.h +++ b/tts-cpp/include/tts-cpp/supertonic/engine.h @@ -56,6 +56,33 @@ struct EngineOptions { int n_threads = 0; int n_gpu_layers = 0; + // Directory to scan for dynamically-loaded ggml backends + // (`libspeech-ggml-vulkan.so`, `libspeech-ggml-opencl.so`, + // `libspeech-ggml-cpu-android_armv8.2_1.so`, ...). Forwarded to + // `ggml_backend_load_all_from_path()` on the first Engine + // construction in the process; subsequent constructions reuse the + // already-populated registry. + // + // Leave empty to fall back to ggml's default search path + // (`ggml_backend_load_all()`). Embedded host applications built + // with `GGML_BACKEND_DL=ON` (the Android / Linux non-Apple + // default; see CMakeLists.txt) should pass an explicit dir so the + // .so files ship next to the host's binary in a per-module + // folder rather than relying on `LD_LIBRARY_PATH` / `dlopen()` + // heuristics. No-op on `GGML_BACKEND_DL=OFF` (static-link) + // builds. 
+ std::string backends_dir; + + // Sets `$GGML_OPENCL_CACHE_DIR` before the first backend init so + // ggml-opencl persists `clCreateProgramWithBinary` blobs across + // process restarts. Strongly recommended on Android where the + // cold `clBuildProgram` cost dominates first-utterance latency; + // pass a writable per-app directory (typically the app's + // `cacheDir` from the host platform). + // + // Honoured only on `__ANDROID__` builds; ignored elsewhere. + std::string opencl_cache_dir; + // Optional path to a .npy file containing the initial noise tensor of // shape [1, latent_channels, latent_len] (float32). When provided, // latent_len is taken from the npy file (overriding the duration- diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp new file mode 100644 index 00000000000..2c36287827c --- /dev/null +++ b/tts-cpp/src/backend_selection.cpp @@ -0,0 +1,394 @@ +#include "backend_selection.h" + +#include "ggml-backend.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tts_cpp::detail { +namespace { + +// Backends-dir / OpenCL-cache-dir override + warning state. The +// setters are intended to be called by the first Engine +// construction; both are consumed once and then frozen for the rest +// of the process lifetime (the ggml-backend registry and +// $GGML_OPENCL_CACHE_DIR are both process-singleton state). +// +// `g_backends_loaded` is the canonical "registry already populated" +// flag, set inside `ensure_backends_loaded()` *before* the load-all +// call returns AND under the mutex so concurrent `set_*` calls +// either land their write (and have it picked up by the in-flight +// load) or atomically observe the flag and warn. 
We track it +// separately from `g_recorded_backends_dir` because the first +// Engine may have legitimately constructed with an empty +// `backends_dir` (default ggml search path), in which case +// `g_recorded_backends_dir` stays empty and is no longer a reliable +// "have we loaded?" sentinel -- a subsequent setter would otherwise +// silently write to `g_backends_dir`, never get re-scanned, and +// surface zero diagnostic to the caller. +// +// Mirrors parakeet-cpp/src/parakeet_ctc.cpp 1:1 (same Engine ctor / +// process-singleton-registry interaction). Kept in a tts-cpp-local +// anon namespace so the two libraries can be vendored side-by-side +// without ODR collisions on the static state. +std::mutex g_backends_dir_mutex; +std::string g_backends_dir; +std::string g_recorded_backends_dir; +std::string g_recorded_opencl_cache_dir; +std::atomic g_backends_loaded{false}; +std::atomic g_backends_dir_warned{false}; +std::atomic g_opencl_cache_dir_warned{false}; + +const char * dev_reg_name(ggml_backend_dev_t dev) { + if (!dev) return ""; + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + return reg ? ggml_backend_reg_name(reg) : ""; +} + +} // namespace + +void set_backends_directory(const std::string & dir) { + std::lock_guard lock(g_backends_dir_mutex); + if (g_backends_loaded.load(std::memory_order_acquire)) { + // Registry already populated for this process. We can't + // re-scan a different directory mid-flight (ggml's registry + // is a process-wide singleton), so log the conflict at most + // once and otherwise stay silent on subsequent identical + // sets (the common case when a host instantiates several + // Engines back-to-back from the same backends folder, or + // when the second value happens to match the recorded one). 
+ if (dir != g_recorded_backends_dir && + !g_backends_dir_warned.exchange(true)) { + if (g_recorded_backends_dir.empty()) { + // First Engine constructed without an explicit + // backends_dir, so ggml's compile-time default + // search path was used. The current caller wanted + // a specific dir but missed the window. + fprintf(stderr, + "tts-cpp: set_backends_directory('%s') ignored -- the " + "ggml-backend registry was already populated against " + "ggml's default search path (no explicit backends_dir on " + "the first Engine). Call set_backends_directory() (or " + "construct an Engine with backends_dir set) before the " + "first Engine to influence which directory is scanned.\n", + dir.c_str()); + } else { + fprintf(stderr, + "tts-cpp: set_backends_directory('%s') ignored -- backends " + "already loaded from '%s' earlier in this process.\n", + dir.c_str(), g_recorded_backends_dir.c_str()); + } + } + return; + } + g_backends_dir = dir; +} + +void set_opencl_cache_dir(const std::string & dir) { +#if defined(__ANDROID__) + // Same "first Engine wins" contract as set_backends_directory: + // ggml-opencl reads $GGML_OPENCL_CACHE_DIR once per process at + // backend init (before the first kernel build), so a setenv + // after init is effectively a no-op on the cache binding. Gate + // on the shared g_backends_loaded flag because the OpenCL + // backend is registered at the same `ggml_backend_load_all*` + // call that flips the flag -- conservative because it might + // still take effect when the host hasn't yet instantiated a + // GPU device, but matches what the engine-ctor documentation + // promises and avoids the same silent-failure mode as + // set_backends_directory's previous gate. 
+ std::lock_guard lock(g_backends_dir_mutex); + if (g_backends_loaded.load(std::memory_order_acquire)) { + if (!dir.empty() && dir != g_recorded_opencl_cache_dir && + !g_opencl_cache_dir_warned.exchange(true)) { + if (g_recorded_opencl_cache_dir.empty()) { + fprintf(stderr, + "tts-cpp: set_opencl_cache_dir('%s') ignored -- backends " + "were already loaded with no explicit OpenCL cache dir " + "earlier in this process ($GGML_OPENCL_CACHE_DIR either " + "unset or set by another consumer). Call " + "set_opencl_cache_dir() before the first Engine to take " + "effect.\n", + dir.c_str()); + } else { + fprintf(stderr, + "tts-cpp: set_opencl_cache_dir('%s') ignored -- " + "$GGML_OPENCL_CACHE_DIR already pinned to '%s' earlier in " + "this process.\n", + dir.c_str(), g_recorded_opencl_cache_dir.c_str()); + } + } + return; + } + if (dir.empty()) return; + // ggml-opencl's program-binary-cache patch reads this once per + // process at backend init (before the first kernel build). We + // only get here while the backends are still unloaded, so the + // last non-empty value passed in is what the env ends up holding. + ::setenv("GGML_OPENCL_CACHE_DIR", dir.c_str(), /*overwrite=*/1); + g_recorded_opencl_cache_dir = dir; +#else + (void) dir; +#endif +} + +// Trigger one-time discovery + load of every available ggml backend. +// Idempotent: repeated calls inside the same process are no-ops once +// the registry is populated. Routed through a static guard so we +// don't pay the directory-walk cost on every model load. +// +// Why this instead of the per-backend ggml_backend__init() entry +// points the cascade used to call directly: with GGML_BACKEND_DL=ON +// (the dynamic-loader mode embedded host applications typically +// ship with) the CUDA / Metal / Vulkan / OpenCL / BLAS / ggml-cpu +// backends live in separate shared libraries that are dlopened at +// runtime; their concrete init symbols are not linkable from +// libtts-cpp, and the only supported entry point is the registry. 
+// With GGML_BACKEND_DL=OFF the backends are statically linked into +// libggml, registered at constructor time, and +// ggml_backend_load_all() is a cheap no-op. Both modes therefore +// reach the same registry walk below, matching the convention used +// by llama.cpp / parakeet-cpp / other ggml-based libraries. +// +// The optional backends dir comes from `set_backends_directory()` +// (typically wired from `EngineOptions::backends_dir`). When set and +// non-empty, the loader walks that single directory instead of the +// compile-time defaults so embedded host apps can ship the +// `libggml-{vulkan,opencl,cpu-*}.so` files in their own +// per-module folder rather than relying on `LD_LIBRARY_PATH` / +// `dlopen()` heuristics. +void ensure_backends_loaded() { + static const bool loaded = []() { + std::string dir; + { + std::lock_guard lock(g_backends_dir_mutex); + dir = g_backends_dir; + g_recorded_backends_dir = g_backends_dir; + // Flip the loaded sentinel under the mutex (and *before* + // we release it for the load-all call below) so any + // concurrent setter that's about to acquire the mutex + // sees the registry as already-claimed and falls into + // its warn-once branch. Without this, a setter racing + // a first Engine construction would land its value + // *after* we already captured `dir` into the local -- + // the registry would scan against the wrong directory + // (or the default), and the second Engine would have + // no idea its override was lost. + g_backends_loaded.store(true, std::memory_order_release); + } + if (!dir.empty()) { + ggml_backend_load_all_from_path(dir.c_str()); + } else { + ggml_backend_load_all(); + } + return true; + }(); + (void) loaded; +} + +// Parse the Adreno generation number from a device name / +// description string. 
Returns: +// - a 3-or-4-digit generation number ("Adreno (TM) 750" -> 750, +// "Adreno 830" -> 830, "Adreno 660" -> 660) +// - a synthetic 800 for the "Adreno X" naming used by +// Snapdragon X Elite parts (X1-85 / X1-45 etc.). These are +// 7xx/8xx-tier silicon that ggml-opencl supports and on which +// its kernels outperform Vulkan. Mapped to 800 here so they take +// the OpenCL branch in the tier policy. +// - -1 when no Adreno marker is present (Mali, desktop GPUs, ...) +// +// Used to drive the OpenCL vs Vulkan tier policy below: Adreno +// 7xx/8xx/X ship OpenCL kernels that outperform Vulkan on those +// parts, while Adreno 6xx ggml-opencl is known broken (incorrect +// results). Mirrors parakeet-cpp's `parse_adreno_version` and the +// equivalent helper in llm-llamacpp's +// BackendSelection.cpp::parseAdrenoVersion so the three stacks +// reach the same decision on the same hardware. +int parse_adreno_version(const char * s) { + if (!s) return -1; + const char * p = std::strstr(s, "Adreno"); + if (!p) p = std::strstr(s, "adreno"); + if (!p) return -1; + p += 6; // strlen("Adreno") == strlen("adreno") == 6 + while (*p && !(*p >= '0' && *p <= '9') && *p != 'X' && *p != 'x') ++p; + if (!*p) return -1; + if (*p == 'X' || *p == 'x') { + ++p; + if (*p < '0' || *p > '9') return -1; // "Xclipse" etc. 
is not Adreno-X + return 800; + } + int v = 0; + while (*p >= '0' && *p <= '9') { + v = v * 10 + (*p - '0'); + ++p; + if (v > 100000) return -1; + } + return v; +} + +bool is_adreno_6xx(const char * s) { + const int v = parse_adreno_version(s); + return v >= 600 && v < 700; +} + +bool is_adreno_700plus(const char * s) { + const int v = parse_adreno_version(s); + return v >= 700; +} + +// Pick a GPU backend using the same tier policy as parakeet-cpp's +// `init_gpu_backend` / llm-llamacpp's BackendSelection: ggml-opencl +// is only used when an Adreno 700+ device is present (where its +// kernels are validated and faster than Vulkan); every other GPU +// (Vulkan, Metal, CUDA, Mali, Intel iGPU, ...) goes through the +// non-OpenCL preference. Adreno 6xx OpenCL is known broken +// (incorrect outputs) and is force-skipped unless the caller opts +// in via `TTS_CPP_ALLOW_ADRENO_6XX=1`. +// +// Routed exclusively through the ggml-backend registry +// (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls +// to `ggml_backend_vulkan_init` / `ggml_backend_opencl_init` / +// `ggml_backend_metal_init` are made anywhere in tts-cpp -- under +// the GGML_BACKEND_DL=ON build mode embedded host applications ship +// with, those entry points live in separate shared libraries that +// are dlopen()'d at runtime and are not linkable from libtts-cpp. +// The registry walk reaches the same backends in both modes. +ggml_backend_t init_gpu_backend(int n_gpu_layers, + bool verbose, + const char * log_prefix) { + if (n_gpu_layers <= 0) return nullptr; + if (!log_prefix) log_prefix = "tts-cpp"; + + ensure_backends_loaded(); + + struct Cand { + ggml_backend_dev_t dev; + const char * name; + const char * desc; + const char * reg_name; + }; + std::vector opencl_adreno_700plus; + std::vector other_gpu; // Vulkan / Metal / CUDA / Mali / Intel / ... + std::vector opencl_other; // Non-Adreno OpenCL (e.g. 
desktop) + int max_adreno_version = -1; + + const size_t n_dev = ggml_backend_dev_count(); + for (size_t i = 0; i < n_dev; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (!dev) continue; + const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev); + if (type != GGML_BACKEND_DEVICE_TYPE_GPU && + type != GGML_BACKEND_DEVICE_TYPE_IGPU) { + continue; + } + const char * name = ggml_backend_dev_name(dev); + const char * desc = ggml_backend_dev_description(dev); + const char * reg_name = dev_reg_name(dev); + const bool is_opencl = reg_name && std::strcmp(reg_name, "OpenCL") == 0; + + const int adreno_v = std::max(parse_adreno_version(name), + parse_adreno_version(desc)); + if (adreno_v > max_adreno_version) max_adreno_version = adreno_v; + + if (is_opencl) { + if (adreno_v >= 700) { + opencl_adreno_700plus.push_back({dev, name, desc, reg_name}); + } else if (adreno_v >= 600 && adreno_v < 700) { + const char * reported = name ? name : (desc ? desc : "unknown"); + const char * override_env = std::getenv("TTS_CPP_ALLOW_ADRENO_6XX"); + if (!override_env || override_env[0] != '1') { + if (verbose) { + fprintf(stderr, + "%s: OpenCL device '%s' is Adreno 6xx; " + "skipping (7xx/8xx/X1E supported, set " + "TTS_CPP_ALLOW_ADRENO_6XX=1 to override)\n", + log_prefix, reported); + } + continue; + } + if (verbose) { + fprintf(stderr, + "%s: TTS_CPP_ALLOW_ADRENO_6XX=1 set; " + "keeping OpenCL backend on '%s' anyway\n", + log_prefix, reported); + } + opencl_other.push_back({dev, name, desc, reg_name}); + } else { + opencl_other.push_back({dev, name, desc, reg_name}); + } + } else { + other_gpu.push_back({dev, name, desc, reg_name}); + } + } + + // Tier policy: + // 1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan + // on Snapdragon 8 Gen 2/3/4 etc.). + // 2. Anything else with a non-OpenCL GPU: prefer that + // (Vulkan on all non-Adreno Android, Metal on Apple, CUDA + // on Linux/Windows desktop, Mali iGPU via Vulkan, ...). + // 3. 
Last resort: any other OpenCL device (e.g. desktop OpenCL + // or non-Adreno mobile when no Vulkan is registered). + auto try_init = [&](const std::vector & bucket) -> ggml_backend_t { + for (const Cand & c : bucket) { + ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr); + if (!b) continue; + if (verbose) { + fprintf(stderr, + "%s: using %s backend (%s)\n", + log_prefix, + c.reg_name && *c.reg_name ? c.reg_name : "GPU", + c.name ? c.name : (c.desc ? c.desc : "unknown")); + } + return b; + } + return nullptr; + }; + + if (!opencl_adreno_700plus.empty()) { + if (ggml_backend_t b = try_init(opencl_adreno_700plus)) return b; + } + if (ggml_backend_t b = try_init(other_gpu)) return b; + if (ggml_backend_t b = try_init(opencl_other)) return b; + + if (verbose) { + if (max_adreno_version >= 600 && max_adreno_version < 700) { + fprintf(stderr, + "%s: only Adreno 6xx OpenCL detected (broken); " + "falling back to CPU\n", + log_prefix); + } else { + fprintf(stderr, + "%s: no GPU backend available, falling back to CPU\n", + log_prefix); + } + } + return nullptr; +} + +ggml_backend_t init_cpu_backend() { + ensure_backends_loaded(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); +} + +ggml_backend_t init_blas_backend() { + ensure_backends_loaded(); + const size_t n_dev = ggml_backend_dev_count(); + for (size_t i = 0; i < n_dev; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (!dev) continue; + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_ACCEL) continue; + const char * reg_name = dev_reg_name(dev); + if (!reg_name || std::strcmp(reg_name, "BLAS") != 0) continue; + return ggml_backend_dev_init(dev, nullptr); + } + return nullptr; +} + +} // namespace tts_cpp::detail diff --git a/tts-cpp/src/backend_selection.h b/tts-cpp/src/backend_selection.h new file mode 100644 index 00000000000..60c99104e9f --- /dev/null +++ b/tts-cpp/src/backend_selection.h @@ -0,0 +1,90 @@ +#pragma once + +// Registry-only GPU backend 
selection for tts-cpp. +// +// Replaces the three legacy `init_backend` / `s3gen_init_backend` / +// `init_supertonic_backend` `#ifdef GGML_USE_` ladders that called +// `ggml_backend_{cuda,metal,vk,opencl}_init` directly. Under the +// dynamic-loader build mode embedded host applications ship with +// (`GGML_BACKEND_DL=ON`) those static entry points live in separate +// `.so` files that are dlopen()'d at runtime and are not linkable +// from libtts-cpp; the ggml-backend registry walk reaches the same +// backends in both `GGML_BACKEND_DL=ON` and `=OFF` modes, mirroring +// parakeet-cpp's design. +// +// Selection follows the same Adreno tier policy as parakeet-cpp's +// `init_gpu_backend` and the qvac llm-llamacpp `BackendSelection.cpp:: +// chooseBackend`: Adreno 700+ devices take the OpenCL branch +// (validated, faster than Vulkan on Snapdragon 8 Gen 2/3/4 and on the +// Snapdragon X Elite parts that report as `Adreno X`); every other +// GPU (Vulkan on all non-Adreno Android, Metal on Apple, CUDA on +// Linux/Windows desktop, Mali iGPU via Vulkan, ...) goes through the +// non-OpenCL preference. Adreno 6xx OpenCL is force-skipped (known +// broken kernels) unless the caller opts in via +// `TTS_CPP_ALLOW_ADRENO_6XX=1`. + +#include "ggml-backend.h" + +#include + +namespace tts_cpp::detail { + +// First-Engine-wins override for the directory `ggml_backend_load_all*()` +// scans on the first `ensure_backends_loaded()` call. Call before +// constructing the first Engine; later calls log a one-shot warn and +// are ignored (the ggml-backend registry is a process-wide singleton). +void set_backends_directory(const std::string & dir); + +// First-Engine-wins override for `$GGML_OPENCL_CACHE_DIR`. Honoured +// only on `__ANDROID__` builds; ignored elsewhere (desktop OpenCL +// platforms don't ship the program-binary-cache patch that reads this +// env var). Call before constructing the first Engine. 
+void set_opencl_cache_dir(const std::string & dir); + +// Idempotent process-wide load of every registered ggml backend. +// Routed through a function-static guard so callers can invoke it +// from every init helper without paying the directory walk cost +// more than once. +void ensure_backends_loaded(); + +// Pick a GPU backend using the Adreno tier policy described above. +// Returns nullptr when no GPU was requested (`n_gpu_layers <= 0`), +// when no GPU device is registered, or when every candidate device +// refused `ggml_backend_dev_init`. `log_prefix` controls the +// per-call log line tag (e.g. "s3gen", "supertonic", "chatterbox") +// so the existing user-visible logs in the three init sites stay +// distinguishable; verbose=false silences every log line this +// helper emits. +ggml_backend_t init_gpu_backend(int n_gpu_layers, + bool verbose, + const char * log_prefix); + +// Convenience wrapper that picks up the registered CPU device and +// returns its init handle. Mirrors parakeet-cpp's +// `init_cpu_backend()`. Never throws; returns nullptr when the +// ggml-cpu backend isn't available (no .so on disk and not +// statically linked). +ggml_backend_t init_cpu_backend(); + +// Returns the first registered BLAS accel backend (if any) or +// nullptr. Mirrors parakeet-cpp's `init_blas_backend()`. Today no +// tts-cpp call site uses this but it is exposed for parity with +// the parakeet helper API so callers that want to mirror parakeet's +// (cpu + blas accel + gpu) cascade can. +ggml_backend_t init_blas_backend(); + +// Adreno-generation parser. Returns: +// - a 3-or-4-digit generation number ("Adreno (TM) 750" -> 750, +// "Adreno 830" -> 830, "Adreno 660" -> 660) +// - a synthetic 800 for the "Adreno X" naming used by +// Snapdragon X Elite parts (X1-85 / X1-45 etc.) +// - -1 when no Adreno marker is present (Mali, desktop GPUs, ...) +// +// Exposed for the tier-policy implementation; safe to call on +// nullptr / empty strings. 
+int parse_adreno_version(const char * s); + +bool is_adreno_6xx(const char * s); +bool is_adreno_700plus(const char * s); + +} // namespace tts_cpp::detail diff --git a/tts-cpp/src/backend_util.h b/tts-cpp/src/backend_util.h new file mode 100644 index 00000000000..2eb8a966ac3 --- /dev/null +++ b/tts-cpp/src/backend_util.h @@ -0,0 +1,53 @@ +#pragma once + +// Backend-introspection helpers that work uniformly under both +// GGML_BACKEND_DL=ON and GGML_BACKEND_DL=OFF. The legacy +// ggml_backend_is_cpu / ggml_backend_is_metal entry points live in +// the per-backend shared libraries (libggml-cpu.* / libggml-metal.*), +// so they are unlinkable from libtts-cpp under the dynamic-loader +// build mode embedded host applications typically ship with. Routing +// through the registry (ggml_backend_get_device + ggml_backend_dev_*) +// reaches the same answer in both modes. +// +// Mirrors parakeet-cpp/src/backend_util.h 1:1 (same QVAC speech-stack +// pattern); kept in a tts-cpp namespace so the two libraries can be +// vendored side-by-side without ODR collisions on the helpers. + +#include "ggml-backend.h" + +#include + +namespace tts_cpp::detail { + +inline const char * backend_reg_name(ggml_backend_t b) { + if (!b) return ""; + ggml_backend_dev_t dev = ggml_backend_get_device(b); + if (!dev) return ""; + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (!reg) return ""; + const char * n = ggml_backend_reg_name(reg); + return n ? 
n : ""; +} + +inline bool backend_is_cpu(ggml_backend_t b) { + if (!b) return false; + ggml_backend_dev_t dev = ggml_backend_get_device(b); + return dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU; +} + +inline bool backend_is_metal(ggml_backend_t b) { + return std::strcmp(backend_reg_name(b), "Metal") == 0; +} + +inline void backend_set_n_threads(ggml_backend_t b, int n_threads) { + if (!b || n_threads <= 0) return; + ggml_backend_dev_t dev = ggml_backend_get_device(b); + if (!dev) return; + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (!reg) return; + auto fn = (ggml_backend_set_n_threads_t) + ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (fn) fn(b, n_threads); +} + +} // namespace tts_cpp::detail diff --git a/tts-cpp/src/campplus.cpp b/tts-cpp/src/campplus.cpp index d52ca6d709d..8d051047812 100644 --- a/tts-cpp/src/campplus.cpp +++ b/tts-cpp/src/campplus.cpp @@ -1,4 +1,5 @@ #include "campplus.h" +#include "backend_selection.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" diff --git a/tts-cpp/src/campplus_forward.inc b/tts-cpp/src/campplus_forward.inc index 09d247fdb86..fafacecd324 100644 --- a/tts-cpp/src/campplus_forward.inc +++ b/tts-cpp/src/campplus_forward.inc @@ -703,7 +703,9 @@ static bool campplus_embed_ggml(const std::vector & fbank_t_by_c, int T, campplus_gctx g; g.backend = backend; if (!g.backend) { - g.backend = ggml_backend_cpu_init(); + // Registry-routed CPU init (works under GGML_BACKEND_DL=ON and OFF). + // See voice_encoder.cpp for the longer rationale. 
+ g.backend = ::tts_cpp::detail::init_cpu_backend(); g.owns_backend = true; if (!g.backend) { fprintf(stderr, "campplus_ggml: cpu init failed\n"); return false; } } diff --git a/tts-cpp/src/chatterbox_cli.cpp b/tts-cpp/src/chatterbox_cli.cpp index 78f3ec8e6b8..d112adcc8a4 100644 --- a/tts-cpp/src/chatterbox_cli.cpp +++ b/tts-cpp/src/chatterbox_cli.cpp @@ -21,17 +21,12 @@ #include "ggml-backend.h" #include "gguf.h" -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif +// Per-backend `#include "ggml-{cuda,metal,vulkan}.h"` blocks used +// to sit here gated on `GGML_USE_` so callers could reach the +// static `ggml_backend__init` entry points directly. Removed +// alongside the cascade in `main.cpp::init_backend`: every backend +// decision now routes through the ggml-backend registry +// (`backend_selection.{h,cpp}`). #include #include @@ -907,9 +902,12 @@ int tts_cpp_cli_main(int argc, char ** argv) { } } - // Voice-cloning preprocessing shares a backend: on Mac we pick - // Metal, on Linux + NVIDIA we pick CUDA / Vulkan. Falls back to - // the ggml-cpu NEON/AVX kernels when n_gpu_layers == 0. + // Voice-cloning preprocessing shares a backend with T3: the + // backend_selection registry walk reaches Metal on Apple, + // CUDA/Vulkan on Linux/Windows desktop, OpenCL on Adreno 700+ + // and Vulkan on every other Android GPU. Falls back to the + // ggml-cpu NEON/AVX kernels when n_gpu_layers == 0 or no GPU + // device is registered. 
ggml_backend_t vc_backend = init_backend(params.n_gpu_layers); // (1) speaker_emb via VoiceEncoder (3-layer LSTM + proj + L2-norm diff --git a/tts-cpp/src/chatterbox_engine.cpp b/tts-cpp/src/chatterbox_engine.cpp index 21f47832b6f..e34cf244db8 100644 --- a/tts-cpp/src/chatterbox_engine.cpp +++ b/tts-cpp/src/chatterbox_engine.cpp @@ -9,6 +9,7 @@ #include #include +#include "backend_selection.h" #include "chatterbox_t3_internal.h" #include "gpt2_bpe.h" #include "mtl_tokenizer.h" @@ -89,6 +90,21 @@ struct Engine::Impl { ggml_time_init(); g_log_verbose = opts.verbose ? 1 : 0; + + // Wire backends_dir + opencl_cache_dir BEFORE any backend init. + // The ggml-backend registry is a process-singleton: only the + // first Engine construction's `set_backends_directory` / + // `set_opencl_cache_dir` actually take effect (second + later + // Engines log a one-shot warn and reuse the already-loaded + // registry; see backend_selection.cpp for the contract). Mirrors + // parakeet-cpp's Engine ctor. + if (!opts.backends_dir.empty()) { + ::tts_cpp::detail::set_backends_directory(opts.backends_dir); + } + if (!opts.opencl_cache_dir.empty()) { + ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir); + } + // Note: we deliberately do NOT call ggml_log_set here. 
The // process-global sink is owned by the host application via // tts_cpp_log_set (see ); installing one diff --git a/tts-cpp/src/chatterbox_tts.cpp b/tts-cpp/src/chatterbox_tts.cpp index 603cbc74f3f..24c43b5ecf9 100644 --- a/tts-cpp/src/chatterbox_tts.cpp +++ b/tts-cpp/src/chatterbox_tts.cpp @@ -21,6 +21,8 @@ // chatterbox_tts --s3gen-gguf MODEL.gguf --ref-dir DIR \ // --tokens-file TOKENS.txt --out OUT.wav +#include "backend_selection.h" +#include "backend_util.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -29,18 +31,12 @@ #include "npy.h" #include "chatterbox_tts_test_hooks.h" -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif -#ifdef GGML_USE_OPENCL -#include "ggml-opencl.h" -#endif +// The per-backend `#include "ggml-{cuda,metal,vulkan,opencl}.h"` +// blocks gated on `GGML_USE_` that used to live here are gone: +// `s3gen_init_backend` below forwards to `backend_selection`'s +// registry walk, which reaches every backend through +// `ggml_backend_dev_*` without linking the per-backend static init +// symbols. Same shape as parakeet-cpp. #include #include @@ -71,7 +67,10 @@ static double now_ms() { } static void compute(ggml_backend_t backend, ggml_cgraph * gf) { - if (ggml_backend_is_cpu(backend)) ggml_backend_cpu_set_n_threads(backend, g_n_threads); + // Registry-routed n_threads (no-op on non-CPU backends); see + // src/t3_mtl.cpp for the GGML_BACKEND_DL=ON unresolvable-symbol + // rationale. 
+ ::tts_cpp::detail::backend_set_n_threads(backend, g_n_threads); ggml_backend_graph_compute(backend, gf); } struct scoped_timer { @@ -103,55 +102,17 @@ struct model_ctx { }; static ggml_backend_t s3gen_init_backend(int n_gpu_layers, bool verbose) { -#ifdef GGML_USE_CUDA - if (n_gpu_layers > 0) { - auto * b = ggml_backend_cuda_init(0); - if (b) { if (verbose) fprintf(stderr, "s3gen: using CUDA backend\n"); return b; } - } -#endif -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - auto * b = ggml_backend_metal_init(); - if (b) { if (verbose) fprintf(stderr, "s3gen: using Metal backend\n"); return b; } - } -#endif -#ifdef GGML_USE_VULKAN - if (n_gpu_layers > 0) { - auto * b = ggml_backend_vk_init(0); - if (b) { - if (verbose) { - char desc[256] = {0}; - ggml_backend_vk_get_device_description(0, desc, sizeof(desc)); - fprintf(stderr, "s3gen: using Vulkan backend (device 0: %s)\n", desc); - } - return b; - } + // GPU cascade is centralised in backend_selection.cpp's + // `init_gpu_backend` (Adreno 700+ -> OpenCL, every other GPU -> + // Vulkan/Metal/CUDA/Mali, with Adreno 6xx OpenCL force-skipped). 
+ if (ggml_backend_t b = ::tts_cpp::detail::init_gpu_backend(n_gpu_layers, verbose, "s3gen")) { + return b; } -#endif -#if defined(GGML_USE_OPENCL) - if (n_gpu_layers > 0) { - ggml_backend_reg_t ocl_reg = ggml_backend_opencl_reg(); - if (ocl_reg && ggml_backend_reg_dev_count(ocl_reg) > 0) { - auto * b = ggml_backend_opencl_init(); - if (b) { - if (verbose) { - fprintf(stderr, "s3gen: using OpenCL backend\n"); - } - return b; - } - } else if (verbose && ocl_reg) { - if (ggml_backend_reg_dev_count(ocl_reg) == 0) { - fprintf(stderr, "s3gen: no OpenCL device; using CPU\n"); - } else { - fprintf(stderr, "s3gen: OpenCL init failed; using CPU\n"); - } - } + if (ggml_backend_t b = ::tts_cpp::detail::init_cpu_backend()) { + if (verbose) fprintf(stderr, "s3gen: using CPU backend\n"); + return b; } -#endif - auto * b = ggml_backend_cpu_init(); - if (!b) throw std::runtime_error("ggml_backend_cpu_init() failed"); - if (verbose) fprintf(stderr, "s3gen: using CPU backend\n"); - return b; + throw std::runtime_error("s3gen_init_backend: no CPU device registered"); } // Process-wide cache of the loaded S3Gen GGUF so repeated calls (streaming @@ -2661,7 +2622,7 @@ int s3gen_synthesize_to_wav( // throughput (measured +11% S3Gen wall time on M4 CPU), so we keep the // two-call path there. Meanflow has no CFG to begin with. const bool use_b2 = (!meanflow) && (cfg_rate != 0.0f) && - !ggml_backend_is_cpu(m.backend); + !::tts_cpp::detail::backend_is_cpu(m.backend); // Persistent CFM estimator graph cache. 
Re-used across synth // calls when T matches — multi-synth chunks 2..N skip the graph diff --git a/tts-cpp/src/main.cpp b/tts-cpp/src/main.cpp index aa87cc73fa3..38b1e722111 100644 --- a/tts-cpp/src/main.cpp +++ b/tts-cpp/src/main.cpp @@ -1,3 +1,5 @@ +#include "backend_selection.h" +#include "backend_util.h" #include "gpt2_bpe.h" #include "mtl_tokenizer.h" #include "ggml.h" @@ -6,21 +8,15 @@ #include "ggml-backend.h" #include "gguf.h" -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef GGML_USE_OPENCL -#include "ggml-opencl.h" -#endif +// Per-backend `#include "ggml-{cuda,metal,vulkan,opencl}.h"` blocks +// used to sit here gated on `GGML_USE_<BACKEND>` so the legacy +// `init_backend` cascade below could call the static +// `ggml_backend_<backend>_init` entry points directly. Removed alongside +// the cascade: tts-cpp routes every backend decision through the +// ggml-backend registry (`backend_selection.{h,cpp}`), which reaches +// the same backends in both `GGML_BACKEND_DL=ON` (Android prebuild) +// and `=OFF` (desktop dev) modes without linking those static +// symbols. Mirrors parakeet-cpp's design.
#include #include @@ -291,55 +287,20 @@ int g_log_verbose = 0; ggml_backend_t init_backend(int n_gpu_layers) { const bool v = g_log_verbose != 0; -#ifdef GGML_USE_CUDA - if (n_gpu_layers > 0) { - auto * b = ggml_backend_cuda_init(0); - if (b) { if (v) fprintf(stderr, "%s: using CUDA backend\n", __func__); return b; } - } -#endif -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - auto * b = ggml_backend_metal_init(); - if (b) { if (v) fprintf(stderr, "%s: using Metal backend\n", __func__); return b; } - } -#endif -#ifdef GGML_USE_VULKAN - if (n_gpu_layers > 0) { - auto * b = ggml_backend_vk_init(0); - if (b) { - if (v) { - char desc[256] = {0}; - ggml_backend_vk_get_device_description(0, desc, sizeof(desc)); - fprintf(stderr, "%s: using Vulkan backend (device 0: %s)\n", __func__, desc); - } - return b; - } + // GPU cascade is centralised in backend_selection.cpp's + // `init_gpu_backend` (Adreno 700+ -> OpenCL, every other GPU -> + // Vulkan/Metal/CUDA/Mali, with Adreno 6xx OpenCL force-skipped). + // The registry walk it does reaches the same set of backends in + // both `GGML_BACKEND_DL=ON` and `=OFF` modes without linking the + // per-backend static `ggml_backend__init` entry points. 
+ if (ggml_backend_t b = ::tts_cpp::detail::init_gpu_backend(n_gpu_layers, v, "chatterbox")) { + return b; } -#endif -#ifdef GGML_USE_OPENCL - if (n_gpu_layers > 0) { - ggml_backend_reg_t ocl_reg = ggml_backend_opencl_reg(); - if (ocl_reg && ggml_backend_reg_dev_count(ocl_reg) > 0) { - auto * b = ggml_backend_opencl_init(); - if (b) { - if (v) { - fprintf(stderr, "%s: using OpenCL backend\n", __func__); - } - return b; - } - } else if (v && ocl_reg) { - if (ggml_backend_reg_dev_count(ocl_reg) == 0) { - fprintf(stderr, "%s: no OpenCL device; using CPU\n", __func__); - } else { - fprintf(stderr, "%s: OpenCL init failed; using CPU\n", __func__); - } - } + if (ggml_backend_t b = ::tts_cpp::detail::init_cpu_backend()) { + if (v) fprintf(stderr, "chatterbox: using CPU backend\n"); + return b; } -#endif - auto * b = ggml_backend_cpu_init(); - if (!b) throw std::runtime_error("ggml_backend_cpu_init() failed"); - if (v) fprintf(stderr, "%s: using CPU backend\n", __func__); - return b; + throw std::runtime_error("init_backend: no CPU device registered"); } // -------------------------------------------------------------------------- @@ -700,7 +661,10 @@ bool eval_prompt( } } - if (ggml_backend_is_cpu(model.backend)) ggml_backend_cpu_set_n_threads(model.backend, n_threads); + // Registry-routed n_threads (no-op on non-CPU backends); see + // src/t3_mtl.cpp for the GGML_BACKEND_DL=ON unresolvable-symbol + // rationale. + ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads); ggml_backend_graph_compute(model.backend, gf); ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); @@ -723,7 +687,8 @@ bool eval_step( int32_t position = n_past; ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "position"), &position, 0, sizeof(position)); - if (ggml_backend_is_cpu(model.backend)) ggml_backend_cpu_set_n_threads(model.backend, n_threads); + // Registry-routed n_threads; see src/t3_mtl.cpp for rationale. 
+ ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads); ggml_backend_graph_compute(model.backend, gf); ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); diff --git a/tts-cpp/src/mel_extract_stft.cpp b/tts-cpp/src/mel_extract_stft.cpp index 3453d814d68..949298d8969 100644 --- a/tts-cpp/src/mel_extract_stft.cpp +++ b/tts-cpp/src/mel_extract_stft.cpp @@ -18,6 +18,8 @@ // ggml-cpu's mul_mat uses NEON on ARM and AVX on x86, so this path is both // faster and more portable than the scalar loops in voice_features.cpp. +#include "backend_selection.h" + #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -148,10 +150,12 @@ static std::vector mel_graph_run( make_dft_basis(n_fft, F, cos_basis, neg_sin_basis); ggml_ctx gc; - gc.backend = ggml_backend_cpu_init(); + // Registry-routed CPU init (works under GGML_BACKEND_DL=ON and OFF). + // See voice_encoder.cpp for the longer rationale. + gc.backend = ::tts_cpp::detail::init_cpu_backend(); gc.owns_backend = true; if (!gc.backend) { - fprintf(stderr, "mel_graph_run: ggml_backend_cpu_init failed\n"); + fprintf(stderr, "mel_graph_run: init_cpu_backend failed\n"); return {}; } diff --git a/tts-cpp/src/s3tokenizer.cpp b/tts-cpp/src/s3tokenizer.cpp index a6d1a12ef48..5af41a3e34a 100644 --- a/tts-cpp/src/s3tokenizer.cpp +++ b/tts-cpp/src/s3tokenizer.cpp @@ -1,4 +1,6 @@ #include "s3tokenizer.h" +#include "backend_selection.h" +#include "backend_util.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -333,9 +335,11 @@ static bool build_encoder_ctx(encoder_ctx & ec, const s3tokv2_weights & w, ec.backend = backend; ec.owns_backend = false; } else { - ec.backend = ggml_backend_cpu_init(); + // Registry-routed CPU init (works under GGML_BACKEND_DL=ON and OFF). + // See voice_encoder.cpp for the longer rationale. 
+ ec.backend = ::tts_cpp::detail::init_cpu_backend(); ec.owns_backend = true; - if (!ec.backend) { fprintf(stderr, "s3tokv2: ggml_backend_cpu_init failed\n"); return false; } + if (!ec.backend) { fprintf(stderr, "s3tokv2: init_cpu_backend failed\n"); return false; } } // Enough tensors: stem (4) + 16*6 blocks = 100. Bump a bit for safety. @@ -643,9 +647,8 @@ bool s3tokv2_tokenize(const std::vector & wav, } if (n_threads <= 0) n_threads = (int)std::thread::hardware_concurrency(); - if (ggml_backend_is_cpu(ec.backend)) { - ggml_backend_cpu_set_n_threads(ec.backend, n_threads); - } + // Registry-routed n_threads; see t3_mtl.cpp for rationale. + ::tts_cpp::detail::backend_set_n_threads(ec.backend, n_threads); if (ggml_backend_graph_compute(ec.backend, gf) != GGML_STATUS_SUCCESS) { fprintf(stderr, "s3tokv2: graph_compute failed\n"); diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index 46c195a2c7c..cc87c09e084 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -1,6 +1,7 @@ #define TTS_CPP_BUILD #include "tts-cpp/supertonic/engine.h" +#include "backend_selection.h" #include "supertonic_internal.h" #include "npy.h" @@ -122,6 +123,18 @@ struct Engine::Impl { if (!std::filesystem::exists(opts.model_gguf_path)) { throw std::runtime_error(supertonic_setup_hint(opts.model_gguf_path)); } + + // Wire backends_dir + opencl_cache_dir BEFORE any backend + // init. First-Engine-wins across the whole process; second + // and later Engines reuse the already-loaded registry. See + // backend_selection.cpp. 
+ if (!opts.backends_dir.empty()) { + ::tts_cpp::detail::set_backends_directory(opts.backends_dir); + } + if (!opts.opencl_cache_dir.empty()) { + ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir); + } + if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, false)) { throw std::runtime_error("Supertonic Engine: failed to load GGUF: " + opts.model_gguf_path); diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp index 477d9ff6fda..1c33ebe41e7 100644 --- a/tts-cpp/src/supertonic_gguf.cpp +++ b/tts-cpp/src/supertonic_gguf.cpp @@ -1,20 +1,16 @@ #include "supertonic_internal.h" +#include "backend_selection.h" +#include "backend_util.h" #include "ggml-cpu.h" #include "gguf.h" -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif -#ifdef GGML_USE_OPENCL -#include "ggml-opencl.h" -#endif +// The per-backend `#include "ggml-{cuda,metal,vulkan,opencl}.h"` +// blocks gated on `GGML_USE_<BACKEND>` that used to live here are gone: +// `init_supertonic_backend` below forwards to `backend_selection`'s +// registry walk, which reaches every backend through +// `ggml_backend_dev_*` without linking the per-backend static init +// symbols. Same shape as parakeet-cpp.
#include #include @@ -94,40 +90,17 @@ std::vector expand_supertonic_tensor_to_f32(const ggml_tensor * src) { } ggml_backend_t init_supertonic_backend(int n_gpu_layers, bool verbose) { -#ifdef GGML_USE_CUDA - if (n_gpu_layers > 0) { - ggml_backend_t b = ggml_backend_cuda_init(0); - if (b) { if (verbose) fprintf(stderr, "supertonic: using CUDA backend\n"); return b; } - } -#endif -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - ggml_backend_t b = ggml_backend_metal_init(); - if (b) { if (verbose) fprintf(stderr, "supertonic: using Metal backend\n"); return b; } + // GPU cascade is centralised in backend_selection.cpp's + // `init_gpu_backend` (Adreno 700+ -> OpenCL, every other GPU -> + // Vulkan/Metal/CUDA/Mali, with Adreno 6xx OpenCL force-skipped). + if (ggml_backend_t b = ::tts_cpp::detail::init_gpu_backend(n_gpu_layers, verbose, "supertonic")) { + return b; } -#endif -#ifdef GGML_USE_VULKAN - if (n_gpu_layers > 0) { - ggml_backend_t b = ggml_backend_vk_init(0); - if (b) { - if (verbose) fprintf(stderr, "supertonic: using Vulkan backend\n"); - return b; - } - } -#endif -#ifdef GGML_USE_OPENCL - if (n_gpu_layers > 0) { - ggml_backend_reg_t reg = ggml_backend_opencl_reg(); - if (reg && ggml_backend_reg_dev_count(reg) > 0) { - ggml_backend_t b = ggml_backend_opencl_init(); - if (b) { if (verbose) fprintf(stderr, "supertonic: using OpenCL backend\n"); return b; } - } + if (ggml_backend_t b = ::tts_cpp::detail::init_cpu_backend()) { + if (verbose) fprintf(stderr, "supertonic: using CPU backend\n"); + return b; } -#endif - ggml_backend_t b = ggml_backend_cpu_init(); - if (!b) throw std::runtime_error("ggml_backend_cpu_init failed"); - if (verbose) fprintf(stderr, "supertonic: using CPU backend\n"); - return b; + throw std::runtime_error("init_supertonic_backend: no CPU device registered"); } void set_env_if_unset(const char * name, const char * value) { @@ -230,8 +203,11 @@ void supertonic_set_n_threads(supertonic_model & model, int n_threads) { } void 
supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph) { - if (ggml_backend_is_cpu(model.backend) && model.n_threads > 0) { - ggml_backend_cpu_set_n_threads(model.backend, model.n_threads); + // Registry-routed n_threads (no-op on non-CPU backends); see + // src/t3_mtl.cpp for the GGML_BACKEND_DL=ON unresolvable-symbol + // rationale. + if (model.n_threads > 0) { + ::tts_cpp::detail::backend_set_n_threads(model.backend, model.n_threads); } ggml_backend_graph_compute(model.backend, graph); } diff --git a/tts-cpp/src/t3_mtl.cpp b/tts-cpp/src/t3_mtl.cpp index 316f1747dea..bdae73bef47 100644 --- a/tts-cpp/src/t3_mtl.cpp +++ b/tts-cpp/src/t3_mtl.cpp @@ -24,6 +24,7 @@ #include "chatterbox_t3_internal.h" #include "t3_mtl.h" +#include "backend_util.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -1135,9 +1136,12 @@ bool run_prompt_pass(const chatterbox_model & model, fill_causal_mask_f16(mask, N); set_in("kq_mask", mask.data(), mask.size() * sizeof(ggml_fp16_t)); - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } + // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU + // backend lives in a dlopen'd per-arch .so, so the static + // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time). + // The helper is a no-op on non-CPU backends and on CPU backends that + // don't export `ggml_backend_set_n_threads`. 
+ ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads); ggml_backend_graph_compute(model.backend, gf); ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); @@ -1202,9 +1206,12 @@ bool run_prompt_pass_b2(const chatterbox_model & model, fill_causal_mask_f16(mask, N); set_in("kq_mask", mask.data(), mask.size() * sizeof(ggml_fp16_t)); - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } + // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU + // backend lives in a dlopen'd per-arch .so, so the static + // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time). + // The helper is a no-op on non-CPU backends and on CPU backends that + // don't export `ggml_backend_set_n_threads`. + ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads); ggml_backend_graph_compute(model.backend, gf); ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); @@ -1247,9 +1254,12 @@ bool run_step_pass_b2(const chatterbox_model & model, int32_t pos = n_past; ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "pos_ids"), &pos, 0, sizeof(pos)); - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } + // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU + // backend lives in a dlopen'd per-arch .so, so the static + // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time). + // The helper is a no-op on non-CPU backends and on CPU backends that + // don't export `ggml_backend_set_n_threads`. 
+ ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads); ggml_backend_graph_compute(model.backend, gf); ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); @@ -1301,9 +1311,12 @@ bool run_step_pass(const chatterbox_model & model, int32_t pos = n_past; ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "pos_ids"), &pos, 0, sizeof(pos)); - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } + // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU + // backend lives in a dlopen'd per-arch .so, so the static + // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time). + // The helper is a no-op on non-CPU backends and on CPU backends that + // don't export `ggml_backend_set_n_threads`. + ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads); ggml_backend_graph_compute(model.backend, gf); ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); @@ -1673,7 +1686,7 @@ bool load_model_gguf_mtl(const std::string & path, // ggml-cpu's per-kernel overhead is already negligible and the // extra weight memory footprint (~75 MB for the multilingual // T3) trades unfavourably with thread-cache locality there. - if (!ggml_backend_is_cpu(model.backend)) { + if (!::tts_cpp::detail::backend_is_cpu(model.backend)) { const int n_embd = hp.n_embd; const int n_ff = hp.intermediate_size; @@ -1814,7 +1827,7 @@ bool eval_prompt_mtl(const chatterbox_model & model, // op processes B=2 in a tight loop, so batching just doubles the // per-op work without saving ops; mirrors §3.20's S3Gen B=2 finding // that on CPU the two-call path stayed the winner). 
- const bool use_b2 = !ggml_backend_is_cpu(model.backend); + const bool use_b2 = !::tts_cpp::detail::backend_is_cpu(model.backend); if (use_b2) { return run_prompt_pass_b2(model, allocr, n_threads, text_tokens, exaggeration, logits_cond_out, @@ -1860,7 +1873,7 @@ bool eval_step_mtl(const chatterbox_model & model, return false; } // Metal: cond+uncond batched into a single forward. See eval_prompt_mtl. - const bool use_b2 = !ggml_backend_is_cpu(model.backend); + const bool use_b2 = !::tts_cpp::detail::backend_is_cpu(model.backend); if (use_b2) { return run_step_pass_b2(model, allocr, n_threads, n_past, token, logits_cond_out, logits_uncond_out); diff --git a/tts-cpp/src/voice_encoder.cpp b/tts-cpp/src/voice_encoder.cpp index da3fe395012..f10a53bdc48 100644 --- a/tts-cpp/src/voice_encoder.cpp +++ b/tts-cpp/src/voice_encoder.cpp @@ -1,4 +1,5 @@ #include "voice_encoder.h" +#include "backend_selection.h" #include "voice_features.h" #include "ggml.h" #include "ggml-alloc.h" @@ -459,9 +460,15 @@ bool voice_encoder_embed(const std::vector & wav_16k, ve_graph G; G.backend = backend; if (!G.backend) { - G.backend = ggml_backend_cpu_init(); + // Route through the registry so this works under GGML_BACKEND_DL=ON + // (Android per-arch CPU dlopen variants) as well as the legacy + // statically-linked GGML_BACKEND_DL=OFF builds. Direct + // `ggml_backend_cpu_init()` is unresolvable in the dl mode because + // the symbol lives in the per-arch dlopen'd .so, not in the link + // line. + G.backend = ::tts_cpp::detail::init_cpu_backend(); if (!G.backend) { - fprintf(stderr, "voice_encoder_embed: ggml_backend_cpu_init failed\n"); + fprintf(stderr, "voice_encoder_embed: init_cpu_backend failed\n"); return false; } G.owns_backend = true;