From 5205428eb3bd8468bda85ad1c44cbbae76ecb02e Mon Sep 17 00:00:00 2001 From: Pratik Narola Date: Fri, 29 May 2026 17:45:06 +0530 Subject: [PATCH 1/7] QVAC-19254 tts-cpp: GPU scheduling for Adreno OpenCL (Supertonic + Chatterbox/S3Gen) Route Supertonic and Chatterbox/S3Gen GPU graphs through ggml_backend_sched so ops the GPU backend cannot run (CONV_TRANSPOSE_1D in the HiFT vocoder; the CPU-only GGML_OP_CUSTOM kernels in the Supertonic vector estimator/vocoder) are routed to CPU instead of asserting. Capability-gate the Chatterbox HiFT scheduler: a backend that runs every op in the graph (Metal, CUDA, CPU) computes directly on the primary backend; only a backend missing an op (Adreno OpenCL / Vulkan) uses the [GPU,CPU] scheduler. The gate queries ggml_backend_supports_op per node, so it is generic and does not regress iOS Metal (which supports CONV_TRANSPOSE_1D natively and otherwise aborts in the scheduler's graph-split). Gate Android GPU selection to Qualcomm Adreno: other Android GPU vendors are unvalidated and at least one (ARM Mali) aborts the host process uncatchably from graph compute, so non-Adreno devices fall through to CPU. parse_adreno_version handles the OpenCL device-name string (e.g. 'OpenCL 3.0 Adreno(TM) 740') by scanning every marker for the real model number. Also expose the pre-existing S3Gen mel/encoder/CFM intermediate dump via the --dump-mel-path CLI flag. 
--- tts-cpp/src/backend_selection.cpp | 122 ++++++++++++--- tts-cpp/src/backend_selection.h | 6 + tts-cpp/src/chatterbox_cli.cpp | 6 + tts-cpp/src/chatterbox_tts.cpp | 100 +++++++++++- tts-cpp/src/supertonic_engine.cpp | 4 +- tts-cpp/src/supertonic_gguf.cpp | 61 ++++++++ tts-cpp/src/supertonic_internal.h | 18 +++ tts-cpp/src/supertonic_vector_estimator.cpp | 161 +++++--------------- tts-cpp/src/supertonic_vocoder.cpp | 33 ++-- 9 files changed, 333 insertions(+), 178 deletions(-) diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp index 2c36287827c..065adf90216 100644 --- a/tts-cpp/src/backend_selection.cpp +++ b/tts-cpp/src/backend_selection.cpp @@ -212,24 +212,38 @@ void ensure_backends_loaded() { // reach the same decision on the same hardware. int parse_adreno_version(const char * s) { if (!s) return -1; - const char * p = std::strstr(s, "Adreno"); - if (!p) p = std::strstr(s, "adreno"); - if (!p) return -1; - p += 6; // strlen("Adreno") == strlen("adreno") == 6 - while (*p && !(*p >= '0' && *p <= '9') && *p != 'X' && *p != 'x') ++p; - if (!*p) return -1; - if (*p == 'X' || *p == 'x') { - ++p; - if (*p < '0' || *p > '9') return -1; // "Xclipse" etc. is not Adreno-X - return 800; - } - int v = 0; - while (*p >= '0' && *p <= '9') { - v = v * 10 + (*p - '0'); - ++p; - if (v > 100000) return -1; + // Scan EVERY "Adreno"/"adreno" marker and keep the largest plausible + // (>= 100, i.e. real 3-digit model) version found. Some OpenCL device + // strings embed the API version before the model number, e.g. + // "QUALCOMM Adreno(TM) (OpenCL 3.0 Adreno(TM) 740)": parsing only the + // first marker yields 3 (from "OpenCL 3.0") and mis-tiers the GPU below + // Vulkan; the second "Adreno 740" marker recovers the real version. 
+ int best = -1; + for (const char * p = s; *p; ++p) { + if (std::strncmp(p, "Adreno", 6) != 0 && + std::strncmp(p, "adreno", 6) != 0) { + continue; + } + const char * q = p + 6; // strlen("Adreno") == strlen("adreno") == 6 + while (*q && !(*q >= '0' && *q <= '9') && *q != 'X' && *q != 'x') ++q; + if (!*q) continue; + if (*q == 'X' || *q == 'x') { + if (*(q + 1) >= '0' && *(q + 1) <= '9') { // "Adreno X1-..." family + if (800 > best) best = 800; + } + continue; // "Xclipse" etc. is not Adreno-X + } + int v = 0; + bool overflow = false; + while (*q >= '0' && *q <= '9') { + v = v * 10 + (*q - '0'); + ++q; + if (v > 100000) { overflow = true; break; } + } + // Adreno models are 3-digit; ignore API-version noise like "OpenCL 3.0". + if (!overflow && v >= 100 && v > best) best = v; } - return v; + return best; } bool is_adreno_6xx(const char * s) { @@ -242,14 +256,48 @@ bool is_adreno_700plus(const char * s) { return v >= 700; } +// True if the device name/description identifies a Qualcomm Adreno GPU. +// Unlike parse_adreno_version (which needs a 3-digit model number and so +// returns -1 for the bare OpenCL "QUALCOMM Adreno(TM)" string), this is a +// vendor check used to gate Android GPU selection. ASCII case-insensitive +// because the strings vary in capitalisation: ggml-opencl reports +// CL_DEVICE_NAME ("QUALCOMM Adreno(TM)") and ggml-vulkan reports the Vulkan +// deviceName ("Adreno (TM) 740"). +bool is_qualcomm_adreno(const char * name, const char * desc) { + auto contains_ci = [](const char * hay, const char * needle) -> bool { + if (!hay || !needle) return false; + for (const char * h = hay; *h; ++h) { + const char * a = h; + const char * b = needle; + while (*a && *b) { + const char ca = (*a >= 'A' && *a <= 'Z') ? char(*a + 32) : *a; + const char cb = (*b >= 'A' && *b <= 'Z') ? 
char(*b + 32) : *b; + if (ca != cb) break; + ++a; + ++b; + } + if (!*b) return true; + } + return false; + }; + return contains_ci(name, "adreno") || contains_ci(desc, "adreno") || + contains_ci(name, "qualcomm") || contains_ci(desc, "qualcomm"); +} + // Pick a GPU backend using the same tier policy as parakeet-cpp's // `init_gpu_backend` / llm-llamacpp's BackendSelection: ggml-opencl // is only used when an Adreno 700+ device is present (where its // kernels are validated and faster than Vulkan); every other GPU -// (Vulkan, Metal, CUDA, Mali, Intel iGPU, ...) goes through the -// non-OpenCL preference. Adreno 6xx OpenCL is known broken -// (incorrect outputs) and is force-skipped unless the caller opts -// in via `TTS_CPP_ALLOW_ADRENO_6XX=1`. +// (Vulkan, Metal, CUDA, Intel iGPU, ...) goes through the non-OpenCL +// preference. Adreno 6xx OpenCL is known broken (incorrect outputs) +// and is force-skipped unless the caller opts in via +// `TTS_CPP_ALLOW_ADRENO_6XX=1`. +// +// On Android the device walk is additionally gated to Qualcomm Adreno +// only: other Android GPU vendors are not validated and at least one +// (ARM Mali / Tensor) aborts the host process from inside graph +// compute, so they are skipped and the engine falls back to CPU. +// Desktop GPU vendors are unaffected. // // Routed exclusively through the ggml-backend registry // (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls @@ -292,6 +340,29 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, const char * reg_name = dev_reg_name(dev); const bool is_opencl = reg_name && std::strcmp(reg_name, "OpenCL") == 0; +#if defined(__ANDROID__) + // Android GPU allowlist: only Qualcomm Adreno is validated for the + // tts-cpp GPU backends (OpenCL on Adreno 700+, Vulkan as the + // bring-up fallback). 
Other Android GPU vendors are not validated, + // and at least one (ARM Mali / Tensor) aborts the whole host + // process from inside ggml_backend_graph_compute via GGML_ASSERT -> + // ggml_abort(), which cannot be caught from C++. Skip non-Adreno + // devices so the policy falls through to CPU instead of risking a + // fatal abort on an unvalidated driver. + if (!is_qualcomm_adreno(name, desc)) { + if (verbose) { + fprintf(stderr, + "%s: Android GPU '%s' (%s) is not Qualcomm Adreno; " + "skipping (only Adreno is validated on Android; " + "falling through to CPU)\n", + log_prefix, + name ? name : "?", + desc ? desc : "?"); + } + continue; + } +#endif + const int adreno_v = std::max(parse_adreno_version(name), parse_adreno_version(desc)); if (adreno_v > max_adreno_version) max_adreno_version = adreno_v; @@ -331,10 +402,11 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, // 1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan // on Snapdragon 8 Gen 2/3/4 etc.). // 2. Anything else with a non-OpenCL GPU: prefer that - // (Vulkan on all non-Adreno Android, Metal on Apple, CUDA - // on Linux/Windows desktop, Mali iGPU via Vulkan, ...). - // 3. Last resort: any other OpenCL device (e.g. desktop OpenCL - // or non-Adreno mobile when no Vulkan is registered). + // (Adreno Vulkan on Android — non-Adreno is filtered out + // above; Metal on Apple; CUDA / Vulkan on Linux/Windows + // desktop). + // 3. Last resort: any other OpenCL device (e.g. desktop OpenCL, + // or Adreno OpenCL whose version string lacked a model number). 
auto try_init = [&](const std::vector & bucket) -> ggml_backend_t { for (const Cand & c : bucket) { ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr); diff --git a/tts-cpp/src/backend_selection.h b/tts-cpp/src/backend_selection.h index 60c99104e9f..7054cb7273c 100644 --- a/tts-cpp/src/backend_selection.h +++ b/tts-cpp/src/backend_selection.h @@ -87,4 +87,10 @@ int parse_adreno_version(const char * s); bool is_adreno_6xx(const char * s); bool is_adreno_700plus(const char * s); +// Vendor check (name OR description, ASCII case-insensitive): true for a +// Qualcomm Adreno GPU. Unlike parse_adreno_version it does not require a +// model number, so it also matches the bare OpenCL "QUALCOMM Adreno(TM)" +// string. Used to gate Android GPU selection to the only validated vendor. +bool is_qualcomm_adreno(const char * name, const char * desc); + } // namespace tts_cpp::detail diff --git a/tts-cpp/src/chatterbox_cli.cpp b/tts-cpp/src/chatterbox_cli.cpp index d112adcc8a4..c70ad097352 100644 --- a/tts-cpp/src/chatterbox_cli.cpp +++ b/tts-cpp/src/chatterbox_cli.cpp @@ -320,6 +320,7 @@ struct cli_params { std::string tokens_file; // optional pre-tokenized speech tokens (skips T3) std::string text; // input text for T3 std::string output; // legacy: speech-tokens output file (if set, write tokens) + std::string dump_mel_path; // optional: dump S3Gen intermediates (_mu/_step0_dxdt/mel) to .npy for debugging // S3Gen + HiFT vocoder: std::string s3gen_gguf; // enables full text → wav pipeline std::string out_wav; // wav output path (requires --s3gen-gguf) @@ -450,6 +451,7 @@ static void print_usage(const char * argv0) { fprintf(stderr, " With --s3gen-gguf this is interpreted as *speech* tokens\n"); fprintf(stderr, " and the T3 step is skipped.\n"); fprintf(stderr, " --output PATH Write generated speech tokens to PATH (text mode).\n"); + fprintf(stderr, " --dump-mel-path PATH Debug: dump S3Gen mel to PATH, encoder to PATH_mu.npy, CFM step0 to PATH_step0_dxdt.npy.\n"); 
fprintf(stderr, "\n"); fprintf(stderr, " --s3gen-gguf PATH Enables the full text -> wav pipeline (S3Gen + HiFT).\n"); fprintf(stderr, " --out PATH Output wav file when --s3gen-gguf is set.\n"); @@ -590,6 +592,7 @@ static bool parse_args(int argc, char ** argv, cli_params & params) { else if (arg == "--text") { auto v = next("--text"); if (!v) return false; params.text = v; } else if (arg == "--tokens-file") { auto v = next("--tokens-file"); if (!v) return false; params.tokens_file = v; } else if (arg == "--output") { auto v = next("--output"); if (!v) return false; params.output = v; } + else if (arg == "--dump-mel-path") { auto v = next("--dump-mel-path"); if (!v) return false; params.dump_mel_path = v; } else if (arg == "--s3gen-gguf") { auto v = next("--s3gen-gguf"); if (!v) return false; params.s3gen_gguf = v; } else if (arg == "--out") { auto v = next("--out"); if (!v) return false; params.out_wav = v; } else if (arg == "--ref-dir") { auto v = next("--ref-dir"); if (!v) return false; params.ref_dir = v; } @@ -982,6 +985,7 @@ int tts_cpp_cli_main(int argc, char ** argv) { opts.verbose = params.verbose; opts.n_gpu_layers = params.n_gpu_layers; opts.cfm_steps = params.cfm_steps; + opts.dump_mel_path = params.dump_mel_path; opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn; if (!params.reference_audio.empty()) { if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf, @@ -1265,6 +1269,7 @@ int tts_cpp_cli_main(int argc, char ** argv) { // chunk; --cfm-steps falls in as the per-chunk default below // (`stream_cfm_steps > 0 ? stream_cfm_steps : cfm_steps`). 
opts.cfm_steps = params.cfm_steps; + opts.dump_mel_path = params.dump_mel_path; opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn; if (!params.reference_audio.empty()) { if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf, @@ -2063,6 +2068,7 @@ int tts_cpp_cli_main(int argc, char ** argv) { // Streaming chunks honour --stream-cfm-steps with --cfm-steps as // fallback when copts is set up further below. opts.cfm_steps = params.cfm_steps; + opts.dump_mel_path = params.dump_mel_path; opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn; if (!params.reference_audio.empty()) { if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf, diff --git a/tts-cpp/src/chatterbox_tts.cpp b/tts-cpp/src/chatterbox_tts.cpp index 24c43b5ecf9..7c9514bfa81 100644 --- a/tts-cpp/src/chatterbox_tts.cpp +++ b/tts-cpp/src/chatterbox_tts.cpp @@ -87,6 +87,13 @@ struct scoped_timer { struct model_ctx { ggml_backend_t backend = nullptr; + // sched [backend, cpu_backend] routes ops the GPU backend can't run + // (GGML_OP_CONV_TRANSPOSE_1D in the HiFT vocoder) to CPU instead of asserting; + // stays a single-backend pass-through (cpu_backend null) when the primary is + // the CPU. Created lazily on the synthesis thread, not in load_s3gen_gguf — + // the latter runs in the preload thread and would race conditioning's init_cpu_backend(). + mutable ggml_backend_t cpu_backend = nullptr; + mutable ggml_backend_sched_t sched = nullptr; ggml_context * ctx_w = nullptr; ggml_backend_buffer_t buffer_w = nullptr; std::map tensors; @@ -101,6 +108,47 @@ struct model_ctx { float cfg_rate = 0.0f; }; +// Allocate + run a graph through the model scheduler — like the single-backend +// compute() above, but lets sched route unsupported ops to CPU. sched allocates +// at alloc time, so callers set inputs AFTER s3gen_sched_alloc and before +// s3gen_sched_compute (S3Gen sites already follow alloc -> set -> compute). 
+static void s3gen_sched_alloc(const model_ctx & m, ggml_cgraph * gf) { + // Lazy, single-threaded creation: reached only from run_hift_decode on the + // synthesis thread, after preload + conditioning, so init_cpu_backend() races nothing. + if (!m.sched) { + // Mark weights USAGE_WEIGHTS so sched copies a GPU-resident weight to CPU + // when a CPU-routed op (conv_transpose_1d) consumes it. Done here + // (synthesis thread), not in load_s3gen_gguf (preload thread). + ggml_backend_buffer_set_usage(m.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_t sched_backends[2] = { m.backend, nullptr }; + int n_sched_backends = 1; + if (!::tts_cpp::detail::backend_is_cpu(m.backend)) { + m.cpu_backend = ::tts_cpp::detail::init_cpu_backend(); + if (!m.cpu_backend) throw std::runtime_error("s3gen: init CPU backend for scheduler failed"); + sched_backends[1] = m.cpu_backend; + n_sched_backends = 2; + } + // graph_size matches the HiFT graph's ggml_new_graph_custom capacity (it + // is the only graph routed through sched, and the largest S3Gen graph). + m.sched = ggml_backend_sched_new(sched_backends, /*bufts=*/nullptr, + n_sched_backends, /*graph_size=*/131072, + /*parallel=*/false, /*op_offload=*/false); + if (!m.sched) throw std::runtime_error("s3gen: ggml_backend_sched_new failed"); + } + ggml_backend_sched_reset(m.sched); + if (!ggml_backend_sched_alloc_graph(m.sched, gf)) { + throw std::runtime_error("s3gen_sched_alloc: ggml_backend_sched_alloc_graph failed"); + } +} + +static void s3gen_sched_compute(const model_ctx & m, ggml_cgraph * gf) { + // CPU work inside the sched runs on cpu_backend (GPU primary) or the primary + // itself (CPU-only model). Set its thread count per call, like compute(). + ggml_backend_t cpu_b = m.cpu_backend ? 
m.cpu_backend : m.backend; + ::tts_cpp::detail::backend_set_n_threads(cpu_b, g_n_threads); + ggml_backend_sched_graph_compute(m.sched, gf); +} + static ggml_backend_t s3gen_init_backend(int n_gpu_layers, bool verbose) { // GPU cascade is centralised in backend_selection.cpp's // `init_gpu_backend` (Adreno 700+ -> OpenCL, every other GPU -> @@ -185,9 +233,12 @@ static void s3gen_model_cache_release() { if (!g_s3gen_cache_entry) return; model_ctx * m = g_s3gen_cache_entry->m.get(); if (m) { + // Free the scheduler before the backends/buffers it references. + if (m->sched) { ggml_backend_sched_free(m->sched); m->sched = nullptr; } if (m->buffer_w) { ggml_backend_buffer_free(m->buffer_w); m->buffer_w = nullptr; } if (m->ctx_w) { ggml_free(m->ctx_w); m->ctx_w = nullptr; } if (m->backend) { ggml_backend_free(m->backend); m->backend = nullptr; } + if (m->cpu_backend) { ggml_backend_free(m->cpu_backend); m->cpu_backend = nullptr; } m->tensors.clear(); } g_s3gen_cache_entry.reset(); @@ -258,6 +309,12 @@ static model_ctx load_s3gen_gguf(const std::string & path, int n_gpu_layers, boo ggml_tensor * src = ggml_get_tensor(tmp_ctx, ggml_get_name(cur)); ggml_backend_tensor_set(cur, ggml_get_data(src), 0, ggml_nbytes(src)); } + // NOTE: ALL scheduler setup (m.sched, m.cpu_backend, and the buffer_w + // USAGE_WEIGHTS flag) is done lazily in s3gen_sched_alloc on the synthesis + // thread — NOT here. load_s3gen_gguf runs in the s3gen_preload background + // thread concurrently with the main thread's reference-audio conditioning; + // doing backend/buffer setup here disturbs that path + // (-> "mel_graph_run: init_cpu_backend failed"). 
{ int64_t k_mf = gguf_find_key(g, "s3gen.meanflow"); @@ -1908,7 +1965,10 @@ static std::vector run_hift_decode(const model_ctx & m, graph_cache & cache = g_hift_graph_cache; const int64_t cache_key = pack_hift_key(T_mel, T_stft); - const bool build_graph = (cache.key != cache_key) || (cache.ctx == nullptr); + // Always rebuild: the scheduler's alloc_graph mutates node->src[] (the GPU<->CPU + // copies around the CPU-routed conv_transpose_1d), so a cached graph can't be + // reused. HiFT builds once per synth — negligible cost. + const bool build_graph = true; if (build_graph) { if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; } if (cache.ctx) { ggml_free(cache.ctx); cache.ctx = nullptr; } @@ -2062,9 +2122,8 @@ static std::vector run_hift_decode(const model_ctx & m, y_trim = ggml_clamp(ctx, y_trim, -0.99f, 0.99f); ggml_set_name(y_trim, "wav"); ggml_set_output(y_trim); ggml_build_forward_expand(gf, y_trim); - - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); - ggml_gallocr_reserve(cache.allocr, gf); + // No gallocr here — this graph is allocated by the model scheduler + // (s3gen_sched_alloc below) so conv_transpose_1d can be routed to CPU. } // end build_graph // Cached scaffolding (pulled outside build_graph too — when the graph @@ -2073,7 +2132,28 @@ static std::vector run_hift_decode(const model_ctx & m, const std::vector & ik_data = cached_istft_kernel(n_fft); const std::vector & ws_data = cached_window_sum(T_stft, n_fft, hop); - ggml_gallocr_alloc_graph(cache.allocr, gf); + // Capability-gate the scheduler. The [GPU,CPU] ggml_backend_sched exists only + // to route CONV_TRANSPOSE_1D to CPU because ggml-opencl / ggml-vulkan lack that + // kernel. A backend that can run every op in this graph itself (Metal, CUDA, + // CPU) does not need the scheduler — and the scheduler's graph-split aborts on + // the iOS Metal driver — so run those directly on the primary backend (the + // pre-scheduler path). 
Only use the scheduler when the primary backend can't + // run some op. Generic: asks the actual backend about the actual graph, with + // no platform / backend-name hardcoding, so iOS Metal is not regressed by the + // Android-motivated routing. + bool primary_runs_all = true; + const int hift_n_nodes = ggml_graph_n_nodes(gf); + for (int i = 0; i < hift_n_nodes; ++i) { + if (!ggml_backend_supports_op(m.backend, ggml_graph_node(gf, i))) { primary_runs_all = false; break; } + } + ggml_gallocr_t hift_allocr = nullptr; + if (primary_runs_all) { + hift_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); + ggml_gallocr_reserve(hift_allocr, gf); + ggml_gallocr_alloc_graph(hift_allocr, gf); + } else { + s3gen_sched_alloc(m, gf); + } ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "mel_in"), mel.data(), 0, mel.size()*sizeof(float)); ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "s_in"), s_stft.data(), 0, s_stft.size()*sizeof(float)); ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "istft_k"), ik_data.data(),0, ik_data.size()*sizeof(float)); @@ -2100,11 +2180,19 @@ static std::vector run_hift_decode(const model_ctx & m, ggml_backend_tensor_set(ggml_graph_get_tensor(gf, e.first.c_str()), inv.data(), 0, inv.size()*sizeof(float)); } - compute(m.backend, gf); + if (primary_runs_all) { + compute(m.backend, gf); + } else { + s3gen_sched_compute(m, gf); + } ggml_tensor * y_trim_out = ggml_graph_get_tensor(gf, "wav"); std::vector wav(ggml_nelements(y_trim_out)); ggml_backend_tensor_get(y_trim_out, wav.data(), 0, ggml_nbytes(y_trim_out)); + // Free the direct-path allocr only AFTER reading the output — y_trim_out's + // data lives in this buffer (freeing it earlier is a use-after-free in the + // tensor_get above). nullptr on the scheduler path, so the guard covers both. 
+ if (hift_allocr) ggml_gallocr_free(hift_allocr); return wav; } diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index cc87c09e084..8e45f8191d9 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -135,7 +136,8 @@ struct Engine::Impl { ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir); } - if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, false)) { + if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, + std::getenv("QVAC_VERBOSE") != nullptr)) { throw std::runtime_error("Supertonic Engine: failed to load GGUF: " + opts.model_gguf_path); } diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp index 1c33ebe41e7..eb4420c38a4 100644 --- a/tts-cpp/src/supertonic_gguf.cpp +++ b/tts-cpp/src/supertonic_gguf.cpp @@ -212,6 +212,24 @@ void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * grap ggml_backend_graph_compute(model.backend, graph); } +void supertonic_sched_alloc(const supertonic_model & model, ggml_cgraph * graph) { + ggml_backend_sched_reset(model.sched); + if (!ggml_backend_sched_alloc_graph(model.sched, graph)) { + throw std::runtime_error("supertonic_sched_alloc: ggml_backend_sched_alloc_graph failed"); + } +} + +void supertonic_sched_compute(const supertonic_model & model, ggml_cgraph * graph) { + // CPU work inside the sched runs on cpu_backend (GPU primary) or on the + // primary itself (CPU-only model). Set its thread count per-call, mirroring + // the single-backend path above. + ggml_backend_t cpu_b = model.cpu_backend ? 
model.cpu_backend : model.backend; + if (model.n_threads > 0) { + ::tts_cpp::detail::backend_set_n_threads(cpu_b, model.n_threads); + } + ggml_backend_sched_graph_compute(model.sched, graph); +} + static void bind_vocoder_weights(supertonic_model & model) { auto & v = model.vocoder; v.normalizer_scale = require_source_tensor(model, "vocoder:tts.ttl.normalizer.scale"); @@ -310,6 +328,15 @@ bool load_supertonic_gguf(const std::string & path, model.buffer_w = ggml_backend_alloc_ctx_tensors(model.ctx_w, model.backend); if (!model.buffer_w) throw std::runtime_error("ggml_backend_alloc_ctx_tensors failed"); + // Mark the weight buffer as WEIGHTS so the scheduler treats these + // tensors as immovable and inserts GPU->CPU copies when a CPU-only op + // (the GGML_OP_CUSTOM kernels in the vector estimator / vocoder) + // consumes them. Without this they default to USAGE_ANY: sched's + // weight-aware split/copy path (ggml-backend.cpp) does not fire, some + // weights stay on the GPU buffer, and the CPU custom op dereferences a + // device offset -> SIGSEGV. Standard llama.cpp/whisper.cpp pattern. + ggml_backend_buffer_set_usage(model.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (ggml_tensor * cur = ggml_get_first_tensor(model.ctx_w); cur; cur = ggml_get_next_tensor(model.ctx_w, cur)) { @@ -348,6 +375,31 @@ bool load_supertonic_gguf(const std::string & path, } bind_vocoder_weights(model); + + // Build the scheduler. With a GPU primary, add a CPU backend so + // ops the GPU can't run (GGML_OP_CUSTOM, and any FA the driver + // rejects) are routed to CPU rather than silently skipped. With a + // CPU primary, the sched is a single-backend pass-through (no + // second CPU backend created). 
+ { + ggml_backend_t backends[2] = { model.backend, nullptr }; + int n_backends = 1; + if (!::tts_cpp::detail::backend_is_cpu(model.backend)) { + model.cpu_backend = ::tts_cpp::detail::init_cpu_backend(); + if (!model.cpu_backend) { + throw std::runtime_error("init CPU backend for scheduler failed"); + } + backends[1] = model.cpu_backend; + n_backends = 2; + } + model.sched = ggml_backend_sched_new(backends, /*bufts=*/nullptr, + n_backends, /*graph_size=*/ 8192, + /*parallel=*/ false, + /*op_offload=*/ false); + if (!model.sched) { + throw std::runtime_error("ggml_backend_sched_new failed"); + } + } } catch (const std::exception & e) { fprintf(stderr, "load_supertonic_gguf: %s\n", e.what()); gguf_free(gguf_ctx); @@ -374,6 +426,11 @@ void free_supertonic_model(supertonic_model & model) { if (model.generation_id != 0) { unregister_supertonic_alive(model.generation_id); } + // Free the scheduler before the backends/buffers it references. + if (model.sched) { + ggml_backend_sched_free(model.sched); + model.sched = nullptr; + } if (model.buffer_w) { ggml_backend_buffer_free(model.buffer_w); model.buffer_w = nullptr; @@ -382,6 +439,10 @@ void free_supertonic_model(supertonic_model & model) { ggml_backend_free(model.backend); model.backend = nullptr; } + if (model.cpu_backend) { + ggml_backend_free(model.cpu_backend); + model.cpu_backend = nullptr; + } if (model.ctx_w) { ggml_free(model.ctx_w); model.ctx_w = nullptr; diff --git a/tts-cpp/src/supertonic_internal.h b/tts-cpp/src/supertonic_internal.h index f0587a72cff..7e157f388f8 100644 --- a/tts-cpp/src/supertonic_internal.h +++ b/tts-cpp/src/supertonic_internal.h @@ -74,6 +74,14 @@ struct supertonic_model { uint64_t generation_id = 0; int n_threads = 0; ggml_backend_t backend = nullptr; + // Scheduler so ops the GPU backend can't run (notably GGML_OP_CUSTOM + // CPU kernels in the vector estimator / vocoder) auto-route to CPU + // instead of being silently skipped on a single backend. 
Always + // created: [backend, cpu_backend] for a GPU primary, or a degenerate + // [backend] when the primary is itself CPU. cpu_backend stays null in + // the CPU-only case (no second CPU backend). + ggml_backend_t cpu_backend = nullptr; + ggml_backend_sched_t sched = nullptr; ggml_context * ctx_w = nullptr; ggml_backend_buffer_t buffer_w = nullptr; @@ -94,6 +102,16 @@ void free_supertonic_model(supertonic_model & model); void supertonic_set_n_threads(supertonic_model & model, int n_threads); void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph); +// Scheduler-based alloc + compute (Option A), used by stages migrated off +// the per-graph ggml_gallocr. Pairing contract at each call site: +// supertonic_sched_alloc(model, gf); // reset + allocate via sched +// ggml_backend_tensor_set(input_leaf, ...); // inputs now have memory +// supertonic_sched_compute(model, gf); // run (routes customs -> CPU) +// Rebuild the graph before each supertonic_sched_alloc: the scheduler rewrites +// node->src[] when inserting GPU<->CPU copies, so a cached graph is not reusable. 
+void supertonic_sched_alloc(const supertonic_model & model, ggml_cgraph * graph); +void supertonic_sched_compute(const supertonic_model & model, ggml_cgraph * graph); + ggml_tensor * require_tensor(const supertonic_model & model, const std::string & name); ggml_tensor * require_source_tensor(const supertonic_model & model, const std::string & source_name); diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp index b4da8328f91..de60ae8c3e1 100644 --- a/tts-cpp/src/supertonic_vector_estimator.cpp +++ b/tts-cpp/src/supertonic_vector_estimator.cpp @@ -62,13 +62,13 @@ void profile_vector_compute(const supertonic_model & model, int step, const char * island) { if (!vector_profile_enabled()) { - supertonic_graph_compute(model, graph); + supertonic_sched_compute(model, graph); return; } auto & state = vector_profile(); const auto t0 = std::chrono::steady_clock::now(); const double pre_ms = std::chrono::duration(t0 - state.last).count(); - supertonic_graph_compute(model, graph); + supertonic_sched_compute(model, graph); const auto t1 = std::chrono::steady_clock::now(); const double ms = std::chrono::duration(t1 - t0).count(); state.last = t1; @@ -686,12 +686,9 @@ void build_text_attention_cache(vector_text_attention_cache & cache, ggml_set_name(out, "vector_attn_out"); ggml_set_output(out); ggml_build_forward_expand(cache.gf, out); - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector text attention cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vector text attention cache failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. 
No per-cache gallocr; + // cache.allocr stays null (free_*_cache's safe_gallocr_free no-ops on it). } std::vector run_text_attention_cache(vector_text_attention_cache & cache, @@ -708,12 +705,11 @@ std::vector run_text_attention_cache(vector_text_attention_cache & cache, int current_step, const char * island, std::vector * ctx_trace) { - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.q_len != q_len || cache.kv_len != kv_len || - cache.n_heads != n_heads || cache.head_dim != head_dim || - cache.out_w_source != out_w_source || cache.out_b_source != out_b_source) { - build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source); - } + // Rebuild every call: ggml_backend_sched_alloc_graph mutates node->src[] when it + // inserts cross-backend GPU<->CPU copies, corrupting a graph reused across denoise + // steps. Build is microseconds vs millisecond compute, so always rebuilding is free. + build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.q_tc_in, q_tc.data(), 0, q_tc.size()*sizeof(float)); ggml_backend_tensor_set(cache.k_tc_in, k_tc.data(), 0, k_tc.size()*sizeof(float)); ggml_backend_tensor_set(cache.v_tc_in, v_tc.data(), 0, v_tc.size()*sizeof(float)); @@ -869,12 +865,8 @@ void build_group_graph_cache(vector_group_graph_cache & cache, ggml_set_name(k, k_name.c_str()); ggml_set_output(k); ggml_build_forward_expand(cache.gf, k); ggml_set_name(v, v_name.c_str()); ggml_set_output(v); ggml_build_forward_expand(cache.gf, v); - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector group cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vector group cache failed"); - } - 
ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr. } vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache, @@ -899,20 +891,14 @@ vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache const std::string & v_name, const char * island, std::vector * trace) { - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.L != L || cache.C != C || cache.text_len != text_len || - cache.group != group || cache.conv_block != conv_block || - cache.linear_block != linear_block || cache.post_block != post_block || - cache.trace_outputs != (trace != nullptr) || - cache.matmul_source != matmul_source || - cache.q_matmul_source != q_matmul_source || cache.k_matmul_source != k_matmul_source || - cache.v_matmul_source != v_matmul_source) { - build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block, - text_len, q_matmul_source, k_matmul_source, v_matmul_source, - q_name, k_name, v_name, - trace != nullptr); - } + // Rebuild every call — scheduler alloc corrupts a reused graph; see + // run_text_attention_cache for the full rationale. 
+ build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block, + text_len, q_matmul_source, k_matmul_source, v_matmul_source, + q_name, k_name, v_name, + trace != nullptr); std::vector x_raw = pack_time_channel_for_ggml(x_tc, L, C); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.x_in, x_raw.data(), 0, x_raw.size()*sizeof(float)); ggml_backend_tensor_set(cache.temb_in, temb.data(), 0, temb.size()*sizeof(float)); ggml_backend_tensor_set(cache.text_in, text_lc_host, 0, (size_t) text_len * 256 * sizeof(float)); @@ -1069,12 +1055,8 @@ void build_res_style_qkv_cache(vector_res_style_qkv_cache & cache, ggml_set_name(sk, k_name.c_str()); ggml_set_output(sk); ggml_build_forward_expand(cache.gf, sk); ggml_set_name(sv, v_name.c_str()); ggml_set_output(sv); ggml_build_forward_expand(cache.gf, sv); - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new res-style-qkv failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve res-style-qkv failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr. 
} vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & cache, @@ -1101,19 +1083,15 @@ vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & const char * island, std::vector * trace) { const bool want_trace = trace != nullptr; - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.L != L || cache.C != C || - cache.norm_block != norm_block || cache.post_block != post_block || - cache.style_block != style_block || cache.trace_outputs != want_trace || - cache.q_matmul_source != q_matmul_source || cache.k_matmul_source != k_matmul_source || - cache.v_matmul_source != v_matmul_source) { - build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block, - q_matmul_source, k_matmul_source, v_matmul_source, - residual_name, norm_name, post_name, q_name, k_name, v_name, - want_trace); - } + // Rebuild every call — scheduler alloc corrupts a reused graph; see + // run_text_attention_cache for the full rationale. 
+ build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block, + q_matmul_source, k_matmul_source, v_matmul_source, + residual_name, norm_name, post_name, q_name, k_name, v_name, + want_trace); std::vector lhs_raw = pack_time_channel_for_ggml(lhs_tc, L, C); std::vector rhs_raw = pack_time_channel_for_ggml(rhs_tc, L, C); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.lhs_in, lhs_raw.data(), 0, lhs_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.rhs_in, rhs_raw.data(), 0, rhs_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.style_v_in, style_v_raw.data(), 0, style_v_raw.size() * sizeof(float)); @@ -1273,12 +1251,8 @@ void build_tail_graph_cache(vector_tail_graph_cache & cache, ggml_build_forward_expand(cache.gf, next); } - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector tail cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vector tail cache failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr. } std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, @@ -1292,12 +1266,9 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, int current_step, int total_steps, std::vector * trace) { - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.L != L || cache.C != C || - cache.Cin != Cin || cache.total_steps != total_steps || - cache.trace_outputs != (trace != nullptr)) { - build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr); - } + // Rebuild every call — scheduler alloc corrupts a reused graph; see + // run_text_attention_cache for the full rationale. 
+ build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr); std::vector tail_in_raw = pack_time_channel_for_ggml(x_tc, L, C); std::vector noise_tc((size_t)L*Cin); for (int t = 0; t < L; ++t) { @@ -1306,6 +1277,7 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, } } std::vector noise_raw = pack_time_channel_for_ggml(noise_tc, L, Cin); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.tail_in, tail_in_raw.data(), 0, tail_in_raw.size()*sizeof(float)); ggml_backend_tensor_set(cache.tail_mask, latent_mask, 0, (size_t)L*sizeof(float)); ggml_backend_tensor_set(cache.tail_noise, noise_raw.data(), 0, noise_raw.size()*sizeof(float)); @@ -2108,17 +2080,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, ggml_set_output(v_t); ggml_build_forward_expand(gf, v_t); - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!allocr) { - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_new failed"); - } - if (!ggml_gallocr_reserve(allocr, gf)) { - ggml_gallocr_free(allocr); - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_reserve failed"); - } - ggml_gallocr_alloc_graph(allocr, gf); + supertonic_sched_alloc(model, gf); ggml_backend_tensor_set(x, noisy_latent, 0, (size_t) L * Cin * sizeof(float)); ggml_backend_tensor_set(mask, latent_mask, 0, (size_t) L * sizeof(float)); @@ -2217,17 +2179,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.5.norm.norm.bias")); ggml_set_name(style_norm, "ve_style0_norm"); ggml_set_output(style_norm); ggml_build_forward_expand(srgf, style_norm); - ggml_gallocr_t srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!srallocr) { - ggml_free(srctx); - throw std::runtime_error("ggml_gallocr_new style residual failed"); - } - if (!ggml_gallocr_reserve(srallocr, srgf)) { - 
ggml_gallocr_free(srallocr); - ggml_free(srctx); - throw std::runtime_error("ggml_gallocr_reserve style residual failed"); - } - ggml_gallocr_alloc_graph(srallocr, srgf); + supertonic_sched_alloc(model, srgf); std::vector style_out_raw = pack_time_channel_for_ggml(style_out_ggml, L, C); std::vector style_lhs_raw = pack_time_channel_for_ggml(post_ggml, L, C); ggml_backend_tensor_set(style_out_in, style_out_raw.data(), 0, style_out_raw.size()*sizeof(float)); @@ -2236,7 +2188,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_style0_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(srgf, "ve_style0_residual"))}); std::vector style_norm_ggml = tensor_to_time_channel(ggml_graph_get_tensor(srgf, "ve_style0_norm")); PUSH_GGML_TRACE({"ve_style0_norm", {L, C}, style_norm_ggml}); - ggml_gallocr_free(srallocr); ggml_free(srctx); thread_local vector_group_graph_cache g1_group_cache; @@ -2321,17 +2272,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.11.norm.norm.bias")); ggml_set_name(g1_style_norm, "ve_g1_style_norm"); ggml_set_output(g1_style_norm); ggml_build_forward_expand(g1srgf, g1_style_norm); - ggml_gallocr_t g1srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!g1srallocr) { - ggml_free(g1srctx); - throw std::runtime_error("ggml_gallocr_new group1 style residual failed"); - } - if (!ggml_gallocr_reserve(g1srallocr, g1srgf)) { - ggml_gallocr_free(g1srallocr); - ggml_free(g1srctx); - throw std::runtime_error("ggml_gallocr_reserve group1 style residual failed"); - } - ggml_gallocr_alloc_graph(g1srallocr, g1srgf); + supertonic_sched_alloc(model, g1srgf); std::vector g1_style_lhs_raw = pack_time_channel_for_ggml(g1_block10, L, C); std::vector g1_style_out_raw = pack_time_channel_for_ggml(g1_style_out, L, C); ggml_backend_tensor_set(g1_style_lhs, g1_style_lhs_raw.data(), 0, 
g1_style_lhs_raw.size()*sizeof(float)); @@ -2340,7 +2281,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_g1_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g1srgf, "ve_g1_style_residual"))}); std::vector g1_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g1srgf, "ve_g1_style_norm")); PUSH_GGML_TRACE({"ve_g1_style_norm", {L, C}, g1_style_norm_vec}); - ggml_gallocr_free(g1srallocr); ggml_free(g1srctx); thread_local vector_group_graph_cache g2_group_cache; @@ -2425,17 +2365,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.17.norm.norm.bias")); ggml_set_name(g2_style_norm, "ve_g2_style_norm"); ggml_set_output(g2_style_norm); ggml_build_forward_expand(g2srgf, g2_style_norm); - ggml_gallocr_t g2srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!g2srallocr) { - ggml_free(g2srctx); - throw std::runtime_error("ggml_gallocr_new group2 style residual failed"); - } - if (!ggml_gallocr_reserve(g2srallocr, g2srgf)) { - ggml_gallocr_free(g2srallocr); - ggml_free(g2srctx); - throw std::runtime_error("ggml_gallocr_reserve group2 style residual failed"); - } - ggml_gallocr_alloc_graph(g2srallocr, g2srgf); + supertonic_sched_alloc(model, g2srgf); std::vector g2_style_lhs_raw = pack_time_channel_for_ggml(g2_block16, L, C); std::vector g2_style_out_raw = pack_time_channel_for_ggml(g2_style_out, L, C); ggml_backend_tensor_set(g2_style_lhs, g2_style_lhs_raw.data(), 0, g2_style_lhs_raw.size()*sizeof(float)); @@ -2444,7 +2374,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_g2_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g2srgf, "ve_g2_style_residual"))}); std::vector g2_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g2srgf, "ve_g2_style_norm")); 
PUSH_GGML_TRACE({"ve_g2_style_norm", {L, C}, g2_style_norm_vec}); - ggml_gallocr_free(g2srallocr); ggml_free(g2srctx); thread_local vector_group_graph_cache g3_group_cache; @@ -2529,17 +2458,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.23.norm.norm.bias")); ggml_set_name(g3_style_norm, "ve_g3_style_norm"); ggml_set_output(g3_style_norm); ggml_build_forward_expand(g3srgf, g3_style_norm); - ggml_gallocr_t g3srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!g3srallocr) { - ggml_free(g3srctx); - throw std::runtime_error("ggml_gallocr_new group3 style residual failed"); - } - if (!ggml_gallocr_reserve(g3srallocr, g3srgf)) { - ggml_gallocr_free(g3srallocr); - ggml_free(g3srctx); - throw std::runtime_error("ggml_gallocr_reserve group3 style residual failed"); - } - ggml_gallocr_alloc_graph(g3srallocr, g3srgf); + supertonic_sched_alloc(model, g3srgf); std::vector g3_style_lhs_raw = pack_time_channel_for_ggml(g3_block22, L, C); std::vector g3_style_out_raw = pack_time_channel_for_ggml(g3_style_out, L, C); ggml_backend_tensor_set(g3_style_lhs, g3_style_lhs_raw.data(), 0, g3_style_lhs_raw.size()*sizeof(float)); @@ -2548,7 +2467,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_g3_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g3srgf, "ve_g3_style_residual"))}); std::vector g3_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g3srgf, "ve_g3_style_norm")); PUSH_GGML_TRACE({"ve_g3_style_norm", {L, C}, g3_style_norm_vec}); - ggml_gallocr_free(g3srallocr); ggml_free(g3srctx); thread_local vector_tail_graph_cache tail_cache; @@ -2557,7 +2475,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, include_ggml_trace ? 
&ggml_trace : nullptr); if (next_latent_tc_out) *next_latent_tc_out = next_latent_tc; - ggml_gallocr_free(allocr); ggml_free(ctx); profile_vector_step_end(current_step); if (error) error->clear(); diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp index 5fc86261d0c..3ed254d661e 100644 --- a/tts-cpp/src/supertonic_vocoder.cpp +++ b/tts-cpp/src/supertonic_vocoder.cpp @@ -420,12 +420,8 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache, ggml_build_forward_expand(cache.gf, x); cache.wav = x; - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vocoder cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vocoder cache failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc in + // the forward path), which routes GGML_OP_CUSTOM ops to CPU. No gallocr. } void linear1x1(const std::vector & x, int L, int IC, @@ -726,18 +722,18 @@ bool supertonic_vocoder_forward_ggml(const supertonic_model & model, profile_vocoder_checkpoint("bn_params", profile_last); thread_local vocoder_graph_cache cache; - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.latent_len != latent_len) { - build_supertonic_vocoder_cache(cache, model, latent_len); - } + // Rebuild every call: the scheduler's alloc_graph mutates node->src[], so a + // cached graph can't be reused (full rationale in the vector estimator). 
+ build_supertonic_vocoder_cache(cache, model, latent_len); profile_vocoder_checkpoint("graph_cache", profile_last); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.x_in, x_in.data(), 0, x_in.size() * sizeof(float)); ggml_backend_tensor_set(cache.bn_scale, bn_scale.data(), 0, bn_scale.size() * sizeof(float)); ggml_backend_tensor_set(cache.bn_shift, bn_shift.data(), 0, bn_shift.size() * sizeof(float)); profile_vocoder_checkpoint("set_inputs", profile_last); - supertonic_graph_compute(model, cache.gf); + supertonic_sched_compute(model, cache.gf); profile_vocoder_checkpoint("compute", profile_last); wav_out = ggml_tensor_to_time_channel(cache.wav); profile_vocoder_checkpoint("readback", profile_last); @@ -934,17 +930,7 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model, ggml_set_output(cur); ggml_build_forward_expand(gf, cur); - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!allocr) { - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_new failed"); - } - if (!ggml_gallocr_reserve(allocr, gf)) { - ggml_gallocr_free(allocr); - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_reserve failed"); - } - ggml_gallocr_alloc_graph(allocr, gf); + supertonic_sched_alloc(model, gf); std::vector x_host = unpack_latent_ggml_layout(model, latent, latent_len); ggml_backend_tensor_set(x_in, x_host.data(), 0, x_host.size() * sizeof(float)); @@ -959,7 +945,7 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model, } ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "trace_bn_scale"), bn_scale_host.data(), 0, bn_scale_host.size() * sizeof(float)); ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "trace_bn_shift"), bn_shift_host.data(), 0, bn_shift_host.size() * sizeof(float)); - supertonic_graph_compute(model, gf); + supertonic_sched_compute(model, gf); trace_out.push_back({"unpack", {T0, C_latent}, unpack_latent_scalar(model, latent, latent_len)}); 
trace_out.push_back({"denorm", {T0, C_latent}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "denorm"))}); @@ -978,7 +964,6 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model, trace_out.push_back({"head1", {T0, (int) model.vocoder.head1_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "head1"))}); trace_out.push_back({"prelu", {T0, (int) model.vocoder.head1_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "prelu"))}); trace_out.push_back({"wav", {T0, (int) model.vocoder.head2_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "wav"))}); - ggml_gallocr_free(allocr); ggml_free(ctx); if (error) error->clear(); return true; From 78ddab9550b9efb3bbab6b551e72f231dfb7371b Mon Sep 17 00:00:00 2001 From: pratiknarola-t Date: Wed, 3 Jun 2026 18:19:16 +0530 Subject: [PATCH 2/7] Update backend_selection.cpp --- tts-cpp/src/backend_selection.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp index 065adf90216..b67ee496b2d 100644 --- a/tts-cpp/src/backend_selection.cpp +++ b/tts-cpp/src/backend_selection.cpp @@ -280,8 +280,7 @@ bool is_qualcomm_adreno(const char * name, const char * desc) { } return false; }; - return contains_ci(name, "adreno") || contains_ci(desc, "adreno") || - contains_ci(name, "qualcomm") || contains_ci(desc, "qualcomm"); + return (contains_ci(name, "adreno") || contains_ci(desc, "adreno")) && (contains_ci(name, "qualcomm") || contains_ci(desc, "qualcomm")); } // Pick a GPU backend using the same tier policy as parakeet-cpp's From 83a9a38934767743dc3b34ea5178d032f362c996 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 3 Jun 2026 14:19:21 +0000 Subject: [PATCH 3/7] Preserve HiFT direct-path graph cache Co-authored-by: pratiknarola-t --- tts-cpp/src/chatterbox_tts.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tts-cpp/src/chatterbox_tts.cpp 
b/tts-cpp/src/chatterbox_tts.cpp index 7c9514bfa81..9df9d915ba6 100644 --- a/tts-cpp/src/chatterbox_tts.cpp +++ b/tts-cpp/src/chatterbox_tts.cpp @@ -1965,10 +1965,11 @@ static std::vector run_hift_decode(const model_ctx & m, graph_cache & cache = g_hift_graph_cache; const int64_t cache_key = pack_hift_key(T_mel, T_stft); - // Always rebuild: the scheduler's alloc_graph mutates node->src[] (the GPU<->CPU - // copies around the CPU-routed conv_transpose_1d), so a cached graph can't be - // reused. HiFT builds once per synth — negligible cost. - const bool build_graph = true; + // Reuse the same-shape HiFT graph only when the direct backend path owns a + // cached gallocator. The scheduler path leaves cache.allocr null because + // ggml_backend_sched_alloc_graph mutates node->src[] while inserting + // GPU<->CPU copies, so scheduler-routed calls must rebuild from a clean graph. + const bool build_graph = (cache.key != cache_key) || (cache.ctx == nullptr) || (cache.allocr == nullptr); if (build_graph) { if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; } if (cache.ctx) { ggml_free(cache.ctx); cache.ctx = nullptr; } @@ -2122,8 +2123,8 @@ static std::vector run_hift_decode(const model_ctx & m, y_trim = ggml_clamp(ctx, y_trim, -0.99f, 0.99f); ggml_set_name(y_trim, "wav"); ggml_set_output(y_trim); ggml_build_forward_expand(gf, y_trim); - // No gallocr here — this graph is allocated by the model scheduler - // (s3gen_sched_alloc below) so conv_transpose_1d can be routed to CPU. + // Direct backends allocate cache.allocr below; scheduler-routed backends + // allocate via s3gen_sched_alloc so conv_transpose_1d can run on CPU. 
} // end build_graph // Cached scaffolding (pulled outside build_graph too — when the graph @@ -2146,11 +2147,12 @@ static std::vector run_hift_decode(const model_ctx & m, for (int i = 0; i < hift_n_nodes; ++i) { if (!ggml_backend_supports_op(m.backend, ggml_graph_node(gf, i))) { primary_runs_all = false; break; } } - ggml_gallocr_t hift_allocr = nullptr; if (primary_runs_all) { - hift_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); - ggml_gallocr_reserve(hift_allocr, gf); - ggml_gallocr_alloc_graph(hift_allocr, gf); + if (!cache.allocr) { + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); + ggml_gallocr_reserve(cache.allocr, gf); + } + ggml_gallocr_alloc_graph(cache.allocr, gf); } else { s3gen_sched_alloc(m, gf); } @@ -2189,10 +2191,6 @@ static std::vector run_hift_decode(const model_ctx & m, ggml_tensor * y_trim_out = ggml_graph_get_tensor(gf, "wav"); std::vector wav(ggml_nelements(y_trim_out)); ggml_backend_tensor_get(y_trim_out, wav.data(), 0, ggml_nbytes(y_trim_out)); - // Free the direct-path allocr only AFTER reading the output — y_trim_out's - // data lives in this buffer (freeing it earlier is a use-after-free in the - // tensor_get above). nullptr on the scheduler path, so the guard covers both. - if (hift_allocr) ggml_gallocr_free(hift_allocr); return wav; } From 91f24b932612488445201b8a2a94a20b5143abfd Mon Sep 17 00:00:00 2001 From: Pratik Narola Date: Thu, 4 Jun 2026 11:39:05 +0530 Subject: [PATCH 4/7] tts-cpp: revert is_qualcomm_adreno to OR so Adreno-via-Vulkan is allowlisted The AND required both 'adreno' and 'qualcomm' in the device name/desc, but ggml-vulkan reports deviceName 'Adreno (TM) 740' (no 'qualcomm') with name 'Vulkan0', so an Adreno selected via Vulkan failed the gate and fell back to CPU. Matching 'adreno' alone is sufficient: it appears in both the OpenCL ('QUALCOMM Adreno(TM)') and Vulkan ('Adreno (TM) 740') strings. Reverts 78ddab95. 
--- tts-cpp/src/backend_selection.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp index b67ee496b2d..065adf90216 100644 --- a/tts-cpp/src/backend_selection.cpp +++ b/tts-cpp/src/backend_selection.cpp @@ -280,7 +280,8 @@ bool is_qualcomm_adreno(const char * name, const char * desc) { } return false; }; - return (contains_ci(name, "adreno") || contains_ci(desc, "adreno")) && (contains_ci(name, "qualcomm") || contains_ci(desc, "qualcomm")); + return contains_ci(name, "adreno") || contains_ci(desc, "adreno") || + contains_ci(name, "qualcomm") || contains_ci(desc, "qualcomm"); } // Pick a GPU backend using the same tier policy as parakeet-cpp's From 174f47d2f870ca1cad7bd8c7191367a37df75164 Mon Sep 17 00:00:00 2001 From: Pratik Narola Date: Thu, 4 Jun 2026 11:39:20 +0530 Subject: [PATCH 5/7] tts-cpp: reuse Supertonic graph caches on the direct backend path Mirror the HiFT graph-cache fix (83a9a389) in the Supertonic vector estimator and vocoder. run_text_attention_cache, run_group_graph_cache, run_res_style_qkv_cache, run_tail_graph_cache and the vocoder forward rebuilt the graph and re-reserved via the scheduler on every denoise step. Each builder now reuses its shape-keyed graph when the cache already holds one built on the direct path, and each runner takes the direct gallocr + primary-backend compute when ggml_backend_supports_op covers every node, falling back to the scheduler only when an op must route to CPU. The scheduler path leaves the cached allocr null so it keeps rebuilding (its alloc_graph mutates node->src[]); the direct path reuses graph + allocr across steps. Output is bit-identical (allocation-only change). 
--- tts-cpp/src/supertonic_vector_estimator.cpp | 133 +++++++++++++++++--- tts-cpp/src/supertonic_vocoder.cpp | 29 ++++- 2 files changed, 141 insertions(+), 21 deletions(-) diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp index de60ae8c3e1..5510253bd8d 100644 --- a/tts-cpp/src/supertonic_vector_estimator.cpp +++ b/tts-cpp/src/supertonic_vector_estimator.cpp @@ -639,6 +639,16 @@ void build_text_attention_cache(vector_text_attention_cache & cache, int head_dim, const std::string & out_w_source, const std::string & out_b_source) { + // Reuse the cached graph when it already matches this shape AND was built on + // the direct backend path (cache.allocr non-null). The scheduler path leaves + // cache.allocr null, so it always rebuilds from a clean graph + // (ggml_backend_sched_alloc_graph mutates node->src[]). Mirrors run_hift_decode. + if (cache.ctx && cache.allocr && cache.generation_id == model.generation_id + && cache.q_len == q_len && cache.kv_len == kv_len + && cache.n_heads == n_heads && cache.head_dim == head_dim + && cache.out_w_source == out_w_source && cache.out_b_source == out_b_source) { + return; + } free_text_attention_cache(cache); cache.model = &model; cache.generation_id = model.generation_id; @@ -705,15 +715,28 @@ std::vector run_text_attention_cache(vector_text_attention_cache & cache, int current_step, const char * island, std::vector * ctx_trace) { - // Rebuild every call: ggml_backend_sched_alloc_graph mutates node->src[] when it - // inserts cross-backend GPU<->CPU copies, corrupting a graph reused across denoise - // steps. Build is microseconds vs millisecond compute, so always rebuilding is free. + // Reuse the shape-keyed graph on the direct backend path; rebuild + route + // through the scheduler only when an op must run on CPU. Mirrors run_hift_decode. 
build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source); - supertonic_sched_alloc(model, cache.gf); + bool direct = true; + const int n_nodes = ggml_graph_n_nodes(cache.gf); + for (int i = 0; i < n_nodes; ++i) { + if (!ggml_backend_supports_op(model.backend, ggml_graph_node(cache.gf, i))) { direct = false; break; } + } + if (direct) { + if (!cache.allocr) { + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + ggml_gallocr_reserve(cache.allocr, cache.gf); + } + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + } else { + supertonic_sched_alloc(model, cache.gf); + } ggml_backend_tensor_set(cache.q_tc_in, q_tc.data(), 0, q_tc.size()*sizeof(float)); ggml_backend_tensor_set(cache.k_tc_in, k_tc.data(), 0, k_tc.size()*sizeof(float)); ggml_backend_tensor_set(cache.v_tc_in, v_tc.data(), 0, v_tc.size()*sizeof(float)); - profile_vector_compute(model, cache.gf, current_step, island); + if (direct) supertonic_graph_compute(model, cache.gf); + else profile_vector_compute(model, cache.gf, current_step, island); if (ctx_trace) *ctx_trace = tensor_to_time_channel(ggml_graph_get_tensor(cache.gf, "vector_attn_ctx")); return tensor_to_time_channel(ggml_graph_get_tensor(cache.gf, "vector_attn_out")); } @@ -785,6 +808,19 @@ void build_group_graph_cache(vector_group_graph_cache & cache, const std::string & k_name, const std::string & v_name, bool trace_outputs) { + // Reuse the cached graph when it already matches this shape AND was built on + // the direct backend path (cache.allocr non-null). The scheduler path leaves + // cache.allocr null, so it always rebuilds. Mirrors run_hift_decode. 
+ if (cache.ctx && cache.allocr && cache.generation_id == model.generation_id + && cache.L == L && cache.C == C && cache.text_len == text_len + && cache.group == group && cache.conv_block == conv_block + && cache.linear_block == linear_block && cache.post_block == post_block + && cache.trace_outputs == trace_outputs && cache.matmul_source == matmul_source + && cache.q_matmul_source == q_matmul_source && cache.k_matmul_source == k_matmul_source + && cache.v_matmul_source == v_matmul_source && cache.q_name == q_name + && cache.k_name == k_name && cache.v_name == v_name) { + return; + } free_group_graph_cache(cache); cache.model = &model; cache.generation_id = model.generation_id; @@ -891,18 +927,32 @@ vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache const std::string & v_name, const char * island, std::vector * trace) { - // Rebuild every call — scheduler alloc corrupts a reused graph; see - // run_text_attention_cache for the full rationale. + // Reuse the shape-keyed graph on the direct backend path; rebuild + route + // through the scheduler only when an op must run on CPU. Mirrors run_hift_decode. 
build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block, text_len, q_matmul_source, k_matmul_source, v_matmul_source, q_name, k_name, v_name, trace != nullptr); std::vector x_raw = pack_time_channel_for_ggml(x_tc, L, C); - supertonic_sched_alloc(model, cache.gf); + bool direct = true; + const int n_nodes = ggml_graph_n_nodes(cache.gf); + for (int i = 0; i < n_nodes; ++i) { + if (!ggml_backend_supports_op(model.backend, ggml_graph_node(cache.gf, i))) { direct = false; break; } + } + if (direct) { + if (!cache.allocr) { + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + ggml_gallocr_reserve(cache.allocr, cache.gf); + } + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + } else { + supertonic_sched_alloc(model, cache.gf); + } ggml_backend_tensor_set(cache.x_in, x_raw.data(), 0, x_raw.size()*sizeof(float)); ggml_backend_tensor_set(cache.temb_in, temb.data(), 0, temb.size()*sizeof(float)); ggml_backend_tensor_set(cache.text_in, text_lc_host, 0, (size_t) text_len * 256 * sizeof(float)); - profile_vector_compute(model, cache.gf, current_step, island); + if (direct) supertonic_graph_compute(model, cache.gf); + else profile_vector_compute(model, cache.gf, current_step, island); if (trace) { for (int j = 0; j < 4; ++j) { const std::string name = "ve_group" + std::to_string(group) + "_convnext" + std::to_string(j); @@ -985,6 +1035,19 @@ void build_res_style_qkv_cache(vector_res_style_qkv_cache & cache, const std::string & k_name, const std::string & v_name, bool trace_outputs) { + // Reuse the cached graph when it already matches this shape AND was built on + // the direct backend path (cache.allocr non-null). The scheduler path leaves + // cache.allocr null, so it always rebuilds. Mirrors run_hift_decode. 
+ if (cache.ctx && cache.allocr && cache.generation_id == model.generation_id + && cache.L == L && cache.C == C && cache.norm_block == norm_block + && cache.post_block == post_block && cache.style_block == style_block + && cache.trace_outputs == trace_outputs && cache.q_matmul_source == q_matmul_source + && cache.k_matmul_source == k_matmul_source && cache.v_matmul_source == v_matmul_source + && cache.residual_name == residual_name && cache.norm_name == norm_name + && cache.post_name == post_name && cache.q_name == q_name + && cache.k_name == k_name && cache.v_name == v_name) { + return; + } free_res_style_qkv_cache(cache); cache.model = &model; cache.generation_id = model.generation_id; @@ -1083,20 +1146,34 @@ vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & const char * island, std::vector * trace) { const bool want_trace = trace != nullptr; - // Rebuild every call — scheduler alloc corrupts a reused graph; see - // run_text_attention_cache for the full rationale. + // Reuse the shape-keyed graph on the direct backend path; rebuild + route + // through the scheduler only when an op must run on CPU. Mirrors run_hift_decode. 
build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block, q_matmul_source, k_matmul_source, v_matmul_source, residual_name, norm_name, post_name, q_name, k_name, v_name, want_trace); std::vector lhs_raw = pack_time_channel_for_ggml(lhs_tc, L, C); std::vector rhs_raw = pack_time_channel_for_ggml(rhs_tc, L, C); - supertonic_sched_alloc(model, cache.gf); + bool direct = true; + const int n_nodes = ggml_graph_n_nodes(cache.gf); + for (int i = 0; i < n_nodes; ++i) { + if (!ggml_backend_supports_op(model.backend, ggml_graph_node(cache.gf, i))) { direct = false; break; } + } + if (direct) { + if (!cache.allocr) { + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + ggml_gallocr_reserve(cache.allocr, cache.gf); + } + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + } else { + supertonic_sched_alloc(model, cache.gf); + } ggml_backend_tensor_set(cache.lhs_in, lhs_raw.data(), 0, lhs_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.rhs_in, rhs_raw.data(), 0, rhs_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.style_v_in, style_v_raw.data(), 0, style_v_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.kctx_in, kctx_raw.data(), 0, kctx_raw.size() * sizeof(float)); - profile_vector_compute(model, cache.gf, current_step, island); + if (direct) supertonic_graph_compute(model, cache.gf); + else profile_vector_compute(model, cache.gf, current_step, island); if (trace) { push_trace(*trace, residual_name, L, C, tensor_to_time_channel(ggml_graph_get_tensor(cache.gf, residual_name.c_str()))); push_trace(*trace, norm_name, L, C, tensor_to_time_channel(ggml_graph_get_tensor(cache.gf, norm_name.c_str()))); @@ -1190,6 +1267,14 @@ void build_tail_graph_cache(vector_tail_graph_cache & cache, int Cin, int total_steps, bool trace_outputs) { + // Reuse the cached graph when it already matches this shape AND was built on + // the direct backend path (cache.allocr non-null). 
The scheduler path leaves + // cache.allocr null, so it always rebuilds. Mirrors run_hift_decode. + if (cache.ctx && cache.allocr && cache.generation_id == model.generation_id + && cache.L == L && cache.C == C && cache.Cin == Cin + && cache.total_steps == total_steps && cache.trace_outputs == trace_outputs) { + return; + } free_tail_graph_cache(cache); cache.model = &model; cache.generation_id = model.generation_id; @@ -1266,8 +1351,8 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, int current_step, int total_steps, std::vector * trace) { - // Rebuild every call — scheduler alloc corrupts a reused graph; see - // run_text_attention_cache for the full rationale. + // Reuse the shape-keyed graph on the direct backend path; rebuild + route + // through the scheduler only when an op must run on CPU. Mirrors run_hift_decode. build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr); std::vector tail_in_raw = pack_time_channel_for_ggml(x_tc, L, C); std::vector noise_tc((size_t)L*Cin); @@ -1277,11 +1362,25 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, } } std::vector noise_raw = pack_time_channel_for_ggml(noise_tc, L, Cin); - supertonic_sched_alloc(model, cache.gf); + bool direct = true; + const int n_nodes = ggml_graph_n_nodes(cache.gf); + for (int i = 0; i < n_nodes; ++i) { + if (!ggml_backend_supports_op(model.backend, ggml_graph_node(cache.gf, i))) { direct = false; break; } + } + if (direct) { + if (!cache.allocr) { + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + ggml_gallocr_reserve(cache.allocr, cache.gf); + } + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + } else { + supertonic_sched_alloc(model, cache.gf); + } ggml_backend_tensor_set(cache.tail_in, tail_in_raw.data(), 0, tail_in_raw.size()*sizeof(float)); ggml_backend_tensor_set(cache.tail_mask, latent_mask, 0, (size_t)L*sizeof(float)); ggml_backend_tensor_set(cache.tail_noise, noise_raw.data(), 0, 
noise_raw.size()*sizeof(float)); - profile_vector_compute(model, cache.gf, current_step, "tail"); + if (direct) supertonic_graph_compute(model, cache.gf); + else profile_vector_compute(model, cache.gf, current_step, "tail"); if (trace) { for (int j = 0; j < 4; ++j) { const std::string name = "ve_last_convnext" + std::to_string(j); diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp index 3ed254d661e..60cc386f0f0 100644 --- a/tts-cpp/src/supertonic_vocoder.cpp +++ b/tts-cpp/src/supertonic_vocoder.cpp @@ -366,6 +366,13 @@ void free_vocoder_cache(vocoder_graph_cache & cache) { void build_supertonic_vocoder_cache(vocoder_graph_cache & cache, const supertonic_model & model, int latent_len) { + // Reuse the cached graph when it already matches this shape AND was built on + // the direct backend path (cache.allocr non-null). The scheduler path leaves + // cache.allocr null, so it always rebuilds. Mirrors run_hift_decode. + if (cache.ctx && cache.allocr && cache.generation_id == model.generation_id + && cache.latent_len == latent_len) { + return; + } free_vocoder_cache(cache); cache.model = &model; cache.generation_id = model.generation_id; @@ -722,18 +729,32 @@ bool supertonic_vocoder_forward_ggml(const supertonic_model & model, profile_vocoder_checkpoint("bn_params", profile_last); thread_local vocoder_graph_cache cache; - // Rebuild every call: the scheduler's alloc_graph mutates node->src[], so a - // cached graph can't be reused (full rationale in the vector estimator). + // Reuse the shape-keyed graph on the direct backend path; rebuild + route + // through the scheduler only when an op must run on CPU. Mirrors run_hift_decode. 
build_supertonic_vocoder_cache(cache, model, latent_len); profile_vocoder_checkpoint("graph_cache", profile_last); - supertonic_sched_alloc(model, cache.gf); + bool direct = true; + const int n_nodes = ggml_graph_n_nodes(cache.gf); + for (int i = 0; i < n_nodes; ++i) { + if (!ggml_backend_supports_op(model.backend, ggml_graph_node(cache.gf, i))) { direct = false; break; } + } + if (direct) { + if (!cache.allocr) { + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + ggml_gallocr_reserve(cache.allocr, cache.gf); + } + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + } else { + supertonic_sched_alloc(model, cache.gf); + } ggml_backend_tensor_set(cache.x_in, x_in.data(), 0, x_in.size() * sizeof(float)); ggml_backend_tensor_set(cache.bn_scale, bn_scale.data(), 0, bn_scale.size() * sizeof(float)); ggml_backend_tensor_set(cache.bn_shift, bn_shift.data(), 0, bn_shift.size() * sizeof(float)); profile_vocoder_checkpoint("set_inputs", profile_last); - supertonic_sched_compute(model, cache.gf); + if (direct) supertonic_graph_compute(model, cache.gf); + else supertonic_sched_compute(model, cache.gf); profile_vocoder_checkpoint("compute", profile_last); wav_out = ggml_tensor_to_time_channel(cache.wav); profile_vocoder_checkpoint("readback", profile_last); From e049b7a59cc9d2f8ff541b5aec08e7b74dc409ff Mon Sep 17 00:00:00 2001 From: Pratik Narola Date: Thu, 4 Jun 2026 14:26:54 +0530 Subject: [PATCH 6/7] tts-cpp: check gallocr allocation failures on the Supertonic direct path The direct-path graph-cache reuse added in 174f47d2 calls ggml_gallocr_new and ggml_gallocr_reserve without checking failure before ggml_gallocr_alloc_graph, so an allocation failure would proceed with a null or unreserved allocator instead of throwing. The scheduler fallback (supertonic_sched_alloc) already throws; the new direct path did not. Add the null/reserve checks the rest of the Supertonic code already uses (e.g. 
supertonic_text_encoder.cpp) at all five direct-path sites: run_text_attention_cache, run_group_graph_cache, run_res_style_qkv_cache, run_tail_graph_cache and supertonic_vocoder_forward_ggml. The ggml_gallocr_alloc_graph call is left unchecked to match that idiom. Allocation-only: full synth output is bit-identical before and after. --- tts-cpp/src/supertonic_vector_estimator.cpp | 20 ++++++++++++++++---- tts-cpp/src/supertonic_vocoder.cpp | 5 ++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp index 5510253bd8d..bd377c55dd1 100644 --- a/tts-cpp/src/supertonic_vector_estimator.cpp +++ b/tts-cpp/src/supertonic_vector_estimator.cpp @@ -726,7 +726,10 @@ std::vector run_text_attention_cache(vector_text_attention_cache & cache, if (direct) { if (!cache.allocr) { cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - ggml_gallocr_reserve(cache.allocr, cache.gf); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new supertonic text attention failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve supertonic text attention failed"); + } } ggml_gallocr_alloc_graph(cache.allocr, cache.gf); } else { @@ -942,7 +945,10 @@ vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache if (direct) { if (!cache.allocr) { cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - ggml_gallocr_reserve(cache.allocr, cache.gf); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new supertonic group graph failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve supertonic group graph failed"); + } } ggml_gallocr_alloc_graph(cache.allocr, cache.gf); } else { @@ -1162,7 +1168,10 @@ vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & if (direct) 
{ if (!cache.allocr) { cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - ggml_gallocr_reserve(cache.allocr, cache.gf); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new supertonic res style qkv failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve supertonic res style qkv failed"); + } } ggml_gallocr_alloc_graph(cache.allocr, cache.gf); } else { @@ -1370,7 +1379,10 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, if (direct) { if (!cache.allocr) { cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - ggml_gallocr_reserve(cache.allocr, cache.gf); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new supertonic tail graph failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve supertonic tail graph failed"); + } } ggml_gallocr_alloc_graph(cache.allocr, cache.gf); } else { diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp index 60cc386f0f0..bbe00137273 100644 --- a/tts-cpp/src/supertonic_vocoder.cpp +++ b/tts-cpp/src/supertonic_vocoder.cpp @@ -742,7 +742,10 @@ bool supertonic_vocoder_forward_ggml(const supertonic_model & model, if (direct) { if (!cache.allocr) { cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - ggml_gallocr_reserve(cache.allocr, cache.gf); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new supertonic vocoder failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve supertonic vocoder failed"); + } } ggml_gallocr_alloc_graph(cache.allocr, cache.gf); } else { From ccfdf170f387c6883917b160caad3d72c02e692d Mon Sep 17 00:00:00 2001 From: Pratik Narola Date: Thu, 4 Jun 2026 18:11:54 +0530 Subject: [PATCH 7/7] QVAC-19254 tts-cpp: address PR #36 review (sched thread-safety, logging, regex 
parse) - s3gen_sched_alloc (item A, blocking): guard the lazy sched/cpu_backend creation with std::call_once so a future parallel/batched-synthesis caller can't race two scheds into existence (leaking one) or double-mark buffer_w USAGE. The once_flag is held via unique_ptr so model_ctx stays move-constructible (it is moved into the process-wide cache). - supertonic_engine (item B, blocking): revert the QVAC_VERBOSE drive-by back to false (+ drop the now-unused <cstdlib>); unrelated Supertonic load-time logging shouldn't ride this GPU PR. - backend_selection parse_adreno_version: switch to the same regex as parakeet (PR #38) so the two are identical; validated against 20 device strings incl. the combined OpenCL '(OpenCL 3.0 Adreno(TM) 740)' -> 740. --- tts-cpp/src/backend_selection.cpp | 44 +++++++++++-------------------- tts-cpp/src/chatterbox_tts.cpp | 21 ++++++++++----- tts-cpp/src/supertonic_engine.cpp | 4 +-- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp index 065adf90216..bcb417d17cc 100644 --- a/tts-cpp/src/backend_selection.cpp +++ b/tts-cpp/src/backend_selection.cpp @@ -4,10 +4,12 @@ #include #include +#include #include #include #include #include +#include #include #include @@ -212,36 +214,20 @@ void ensure_backends_loaded() { // reach the same decision on the same hardware. int parse_adreno_version(const char * s) { if (!s) return -1; - // Scan EVERY "Adreno"/"adreno" marker and keep the largest plausible - // (>= 100, i.e. real 3-digit model) version found. Some OpenCL device - // strings embed the API version before the model number, e.g. - // "QUALCOMM Adreno(TM) (OpenCL 3.0 Adreno(TM) 740)": parsing only the - // first marker yields 3 (from "OpenCL 3.0") and mis-tiers the GPU below - // Vulkan; the second "Adreno 740" marker recovers the real version. 
+ std::string lowered(s); + std::transform(lowered.begin(), lowered.end(), lowered.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + // After an "adreno" marker (skipping "(tm)", spaces, punctuation), the model + // is a 3-4 digit generation ("740"/"830") or the Snapdragon-X "x" token + // ("x1-85" -> 800-tier). Scan every marker and keep the highest; requiring + // 3-4 digits skips the "opencl 3.0" noise in a combined OpenCL description + // like "QUALCOMM Adreno(TM) (OpenCL 3.0 Adreno(TM) 740)" -> 740, not 3. + static const std::regex re(R"(dreno\D*?(\d{3,4}|x\d))", std::regex::optimize); int best = -1; - for (const char * p = s; *p; ++p) { - if (std::strncmp(p, "Adreno", 6) != 0 && - std::strncmp(p, "adreno", 6) != 0) { - continue; - } - const char * q = p + 6; // strlen("Adreno") == strlen("adreno") == 6 - while (*q && !(*q >= '0' && *q <= '9') && *q != 'X' && *q != 'x') ++q; - if (!*q) continue; - if (*q == 'X' || *q == 'x') { - if (*(q + 1) >= '0' && *(q + 1) <= '9') { // "Adreno X1-..." family - if (800 > best) best = 800; - } - continue; // "Xclipse" etc. is not Adreno-X - } - int v = 0; - bool overflow = false; - while (*q >= '0' && *q <= '9') { - v = v * 10 + (*q - '0'); - ++q; - if (v > 100000) { overflow = true; break; } - } - // Adreno models are 3-digit; ignore API-version noise like "OpenCL 3.0". - if (!overflow && v >= 100 && v > best) best = v; + for (std::sregex_iterator it(lowered.begin(), lowered.end(), re), end; it != end; ++it) { + const std::string tok = (*it)[1].str(); + const int v = (tok[0] == 'x') ? 800 : std::stoi(tok); + if (v > best) best = v; } return best; } diff --git a/tts-cpp/src/chatterbox_tts.cpp b/tts-cpp/src/chatterbox_tts.cpp index 9df9d915ba6..91d20c426ce 100644 --- a/tts-cpp/src/chatterbox_tts.cpp +++ b/tts-cpp/src/chatterbox_tts.cpp @@ -94,6 +94,11 @@ struct model_ctx { // the latter runs in the preload thread and would race conditioning's init_cpu_backend(). 
mutable ggml_backend_t cpu_backend = nullptr; mutable ggml_backend_sched_t sched = nullptr; + // Guards the one-time lazy creation of `sched` / `cpu_backend` in + // s3gen_sched_alloc. A unique_ptr (not a bare std::once_flag) so model_ctx + // stays move-constructible — it is moved into the process-wide cache via + // make_unique(load_s3gen_gguf(...)). + mutable std::unique_ptr sched_once = std::make_unique(); ggml_context * ctx_w = nullptr; ggml_backend_buffer_t buffer_w = nullptr; std::map tensors; @@ -113,12 +118,16 @@ struct model_ctx { // at alloc time, so callers set inputs AFTER s3gen_sched_alloc and before // s3gen_sched_compute (S3Gen sites already follow alloc -> set -> compute). static void s3gen_sched_alloc(const model_ctx & m, ggml_cgraph * gf) { - // Lazy, single-threaded creation: reached only from run_hift_decode on the - // synthesis thread, after preload + conditioning, so init_cpu_backend() races nothing. - if (!m.sched) { + // Thread-safe one-time creation via call_once: the sched, cpu_backend and the + // buffer_w USAGE_WEIGHTS flag are built exactly once, so a future parallel / + // batched-synthesis caller cannot race two scheds into existence (leaking one) + // or double-mark the buffer. The work is still deferred to the synthesis thread + // (not load_s3gen_gguf on the preload thread) so it doesn't race conditioning's + // init_cpu_backend(). call_once re-runs the body if it throws, matching the + // previous retry-on-failure behaviour. + std::call_once(*m.sched_once, [&] { // Mark weights USAGE_WEIGHTS so sched copies a GPU-resident weight to CPU - // when a CPU-routed op (conv_transpose_1d) consumes it. Done here - // (synthesis thread), not in load_s3gen_gguf (preload thread). + // when a CPU-routed op (conv_transpose_1d) consumes it. 
ggml_backend_buffer_set_usage(m.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); ggml_backend_t sched_backends[2] = { m.backend, nullptr }; int n_sched_backends = 1; @@ -134,7 +143,7 @@ static void s3gen_sched_alloc(const model_ctx & m, ggml_cgraph * gf) { n_sched_backends, /*graph_size=*/131072, /*parallel=*/false, /*op_offload=*/false); if (!m.sched) throw std::runtime_error("s3gen: ggml_backend_sched_new failed"); - } + }); ggml_backend_sched_reset(m.sched); if (!ggml_backend_sched_alloc_graph(m.sched, gf)) { throw std::runtime_error("s3gen_sched_alloc: ggml_backend_sched_alloc_graph failed"); diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index 8e45f8191d9..cc87c09e084 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -136,8 +135,7 @@ struct Engine::Impl { ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir); } - if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, - std::getenv("QVAC_VERBOSE") != nullptr)) { + if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, false)) { throw std::runtime_error("Supertonic Engine: failed to load GGUF: " + opts.model_gguf_path); }