// Extract the Adreno model number from a GPU name / description string.
//
// Every "Adreno" / "adreno" marker in `s` is examined and the largest
// plausible model number is returned:
//   * the first decimal run after a marker counts only when it is >= 100
//     (a real 3-digit model) and did not exceed the overflow limit, so
//     API-version noise such as the "3" in "OpenCL 3.0" is ignored;
//   * an 'X' / 'x' directly followed by a digit ("Adreno X1-...") maps to
//     tier 800; an 'X' followed by anything else ("Xclipse") ends the
//     inspection of that marker without contributing a value.
//
// Returns -1 when `s` is null or no marker yields a plausible version.
int parse_adreno_version(const char * s) {
    if (s == nullptr) {
        return -1;
    }

    const auto is_digit = [](char c) -> bool { return c >= '0' && c <= '9'; };

    const int marker_len     = 6;       // strlen("Adreno") == strlen("adreno")
    const int adreno_x_tier  = 800;     // tier assigned to the "Adreno X<digit>" family
    const int min_model      = 100;     // smallest value accepted as a model number
    const int overflow_limit = 100000;  // digit runs growing past this are rejected

    int best = -1;
    for (const char * marker = s; *marker != '\0'; ++marker) {
        const bool at_marker = std::strncmp(marker, "Adreno", marker_len) == 0 ||
                               std::strncmp(marker, "adreno", marker_len) == 0;
        if (!at_marker) {
            continue;
        }

        // Advance to the first digit or 'X'/'x' that follows the marker.
        const char * cur = marker + marker_len;
        while (*cur != '\0' && !is_digit(*cur) && *cur != 'X' && *cur != 'x') {
            ++cur;
        }
        if (*cur == '\0') {
            continue;  // bare marker, e.g. "QUALCOMM Adreno(TM)"
        }

        const bool is_x_family = (*cur == 'X' || *cur == 'x');
        if (is_x_family) {
            // Only "X<digit>" is an Adreno-X part; "Xclipse" etc. is not.
            if (is_digit(cur[1]) && adreno_x_tier > best) {
                best = adreno_x_tier;
            }
            continue;
        }

        // Accumulate the decimal run, bailing out once it is implausibly large.
        int  value   = 0;
        bool too_big = false;
        for (; is_digit(*cur); ++cur) {
            value = value * 10 + (*cur - '0');
            if (value > overflow_limit) {
                too_big = true;
                break;
            }
        }

        if (too_big || value < min_model) {
            continue;  // overflow, or API-version noise like "OpenCL 3.0"
        }
        if (value > best) {
            best = value;
        }
    }
    return best;
}

// Vendor check: true when either `name` or `desc` contains "adreno" or
// "qualcomm", compared ASCII case-insensitively. Null arguments are treated
// as "no match". Unlike parse_adreno_version no model number is required,
// so the bare OpenCL device name "QUALCOMM Adreno(TM)" also matches.
bool is_qualcomm_adreno(const char * name, const char * desc) {
    const auto lower = [](char c) -> char {
        return (c >= 'A' && c <= 'Z') ? char(c + 32) : c;
    };

    // Naive substring search with ASCII case folding on both sides.
    const auto has_token = [&lower](const char * text, const char * token) -> bool {
        if (text == nullptr || token == nullptr) {
            return false;
        }
        for (; *text != '\0'; ++text) {
            std::size_t i = 0;
            while (text[i] != '\0' && token[i] != '\0' &&
                   lower(text[i]) == lower(token[i])) {
                ++i;
            }
            if (token[i] == '\0') {
                return true;  // whole token consumed -> match at this offset
            }
        }
        return false;
    };

    static const char * const vendor_tokens[] = { "adreno", "qualcomm" };
    for (const char * token : vendor_tokens) {
        if (has_token(name, token) || has_token(desc, token)) {
            return true;
        }
    }
    return false;
}
Adreno 6xx OpenCL is known broken -// (incorrect outputs) and is force-skipped unless the caller opts -// in via `TTS_CPP_ALLOW_ADRENO_6XX=1`. +// (Vulkan, Metal, CUDA, Intel iGPU, ...) goes through the non-OpenCL +// preference. Adreno 6xx OpenCL is known broken (incorrect outputs) +// and is force-skipped unless the caller opts in via +// `TTS_CPP_ALLOW_ADRENO_6XX=1`. +// +// On Android the device walk is additionally gated to Qualcomm Adreno +// only: other Android GPU vendors are not validated and at least one +// (ARM Mali / Tensor) aborts the host process from inside graph +// compute, so they are skipped and the engine falls back to CPU. +// Desktop GPU vendors are unaffected. // // Routed exclusively through the ggml-backend registry // (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls @@ -292,6 +340,29 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, const char * reg_name = dev_reg_name(dev); const bool is_opencl = reg_name && std::strcmp(reg_name, "OpenCL") == 0; +#if defined(__ANDROID__) + // Android GPU allowlist: only Qualcomm Adreno is validated for the + // tts-cpp GPU backends (OpenCL on Adreno 700+, Vulkan as the + // bring-up fallback). Other Android GPU vendors are not validated, + // and at least one (ARM Mali / Tensor) aborts the whole host + // process from inside ggml_backend_graph_compute via GGML_ASSERT -> + // ggml_abort(), which cannot be caught from C++. Skip non-Adreno + // devices so the policy falls through to CPU instead of risking a + // fatal abort on an unvalidated driver. + if (!is_qualcomm_adreno(name, desc)) { + if (verbose) { + fprintf(stderr, + "%s: Android GPU '%s' (%s) is not Qualcomm Adreno; " + "skipping (only Adreno is validated on Android; " + "falling through to CPU)\n", + log_prefix, + name ? name : "?", + desc ? 
desc : "?"); + } + continue; + } +#endif + const int adreno_v = std::max(parse_adreno_version(name), parse_adreno_version(desc)); if (adreno_v > max_adreno_version) max_adreno_version = adreno_v; @@ -331,10 +402,11 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, // 1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan // on Snapdragon 8 Gen 2/3/4 etc.). // 2. Anything else with a non-OpenCL GPU: prefer that - // (Vulkan on all non-Adreno Android, Metal on Apple, CUDA - // on Linux/Windows desktop, Mali iGPU via Vulkan, ...). - // 3. Last resort: any other OpenCL device (e.g. desktop OpenCL - // or non-Adreno mobile when no Vulkan is registered). + // (Adreno Vulkan on Android — non-Adreno is filtered out + // above; Metal on Apple; CUDA / Vulkan on Linux/Windows + // desktop). + // 3. Last resort: any other OpenCL device (e.g. desktop OpenCL, + // or Adreno OpenCL whose version string lacked a model number). auto try_init = [&](const std::vector & bucket) -> ggml_backend_t { for (const Cand & c : bucket) { ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr); diff --git a/tts-cpp/src/backend_selection.h b/tts-cpp/src/backend_selection.h index 60c99104e9f..7054cb7273c 100644 --- a/tts-cpp/src/backend_selection.h +++ b/tts-cpp/src/backend_selection.h @@ -87,4 +87,10 @@ int parse_adreno_version(const char * s); bool is_adreno_6xx(const char * s); bool is_adreno_700plus(const char * s); +// Vendor check (name OR description, ASCII case-insensitive): true for a +// Qualcomm Adreno GPU. Unlike parse_adreno_version it does not require a +// model number, so it also matches the bare OpenCL "QUALCOMM Adreno(TM)" +// string. Used to gate Android GPU selection to the only validated vendor. 
+bool is_qualcomm_adreno(const char * name, const char * desc); + } // namespace tts_cpp::detail diff --git a/tts-cpp/src/chatterbox_cli.cpp b/tts-cpp/src/chatterbox_cli.cpp index d112adcc8a4..c70ad097352 100644 --- a/tts-cpp/src/chatterbox_cli.cpp +++ b/tts-cpp/src/chatterbox_cli.cpp @@ -320,6 +320,7 @@ struct cli_params { std::string tokens_file; // optional pre-tokenized speech tokens (skips T3) std::string text; // input text for T3 std::string output; // legacy: speech-tokens output file (if set, write tokens) + std::string dump_mel_path; // optional: dump S3Gen intermediates (_mu/_step0_dxdt/mel) to .npy for debugging // S3Gen + HiFT vocoder: std::string s3gen_gguf; // enables full text → wav pipeline std::string out_wav; // wav output path (requires --s3gen-gguf) @@ -450,6 +451,7 @@ static void print_usage(const char * argv0) { fprintf(stderr, " With --s3gen-gguf this is interpreted as *speech* tokens\n"); fprintf(stderr, " and the T3 step is skipped.\n"); fprintf(stderr, " --output PATH Write generated speech tokens to PATH (text mode).\n"); + fprintf(stderr, " --dump-mel-path PATH Debug: dump S3Gen mel to PATH, encoder to PATH_mu.npy, CFM step0 to PATH_step0_dxdt.npy.\n"); fprintf(stderr, "\n"); fprintf(stderr, " --s3gen-gguf PATH Enables the full text -> wav pipeline (S3Gen + HiFT).\n"); fprintf(stderr, " --out PATH Output wav file when --s3gen-gguf is set.\n"); @@ -590,6 +592,7 @@ static bool parse_args(int argc, char ** argv, cli_params & params) { else if (arg == "--text") { auto v = next("--text"); if (!v) return false; params.text = v; } else if (arg == "--tokens-file") { auto v = next("--tokens-file"); if (!v) return false; params.tokens_file = v; } else if (arg == "--output") { auto v = next("--output"); if (!v) return false; params.output = v; } + else if (arg == "--dump-mel-path") { auto v = next("--dump-mel-path"); if (!v) return false; params.dump_mel_path = v; } else if (arg == "--s3gen-gguf") { auto v = next("--s3gen-gguf"); if (!v) return 
false; params.s3gen_gguf = v; } else if (arg == "--out") { auto v = next("--out"); if (!v) return false; params.out_wav = v; } else if (arg == "--ref-dir") { auto v = next("--ref-dir"); if (!v) return false; params.ref_dir = v; } @@ -982,6 +985,7 @@ int tts_cpp_cli_main(int argc, char ** argv) { opts.verbose = params.verbose; opts.n_gpu_layers = params.n_gpu_layers; opts.cfm_steps = params.cfm_steps; + opts.dump_mel_path = params.dump_mel_path; opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn; if (!params.reference_audio.empty()) { if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf, @@ -1265,6 +1269,7 @@ int tts_cpp_cli_main(int argc, char ** argv) { // chunk; --cfm-steps falls in as the per-chunk default below // (`stream_cfm_steps > 0 ? stream_cfm_steps : cfm_steps`). opts.cfm_steps = params.cfm_steps; + opts.dump_mel_path = params.dump_mel_path; opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn; if (!params.reference_audio.empty()) { if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf, @@ -2063,6 +2068,7 @@ int tts_cpp_cli_main(int argc, char ** argv) { // Streaming chunks honour --stream-cfm-steps with --cfm-steps as // fallback when copts is set up further below. opts.cfm_steps = params.cfm_steps; + opts.dump_mel_path = params.dump_mel_path; opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn; if (!params.reference_audio.empty()) { if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf, diff --git a/tts-cpp/src/chatterbox_tts.cpp b/tts-cpp/src/chatterbox_tts.cpp index 24c43b5ecf9..7c9514bfa81 100644 --- a/tts-cpp/src/chatterbox_tts.cpp +++ b/tts-cpp/src/chatterbox_tts.cpp @@ -87,6 +87,13 @@ struct scoped_timer { struct model_ctx { ggml_backend_t backend = nullptr; + // sched [backend, cpu_backend] routes ops the GPU backend can't run + // (GGML_OP_CONV_TRANSPOSE_1D in the HiFT vocoder) to CPU instead of asserting; + // stays a single-backend pass-through (cpu_backend null) when the primary is + // the CPU. 
Created lazily on the synthesis thread, not in load_s3gen_gguf — + // the latter runs in the preload thread and would race conditioning's init_cpu_backend(). + mutable ggml_backend_t cpu_backend = nullptr; + mutable ggml_backend_sched_t sched = nullptr; ggml_context * ctx_w = nullptr; ggml_backend_buffer_t buffer_w = nullptr; std::map tensors; @@ -101,6 +108,47 @@ struct model_ctx { float cfg_rate = 0.0f; }; +// Allocate + run a graph through the model scheduler — like the single-backend +// compute() above, but lets sched route unsupported ops to CPU. sched allocates +// at alloc time, so callers set inputs AFTER s3gen_sched_alloc and before +// s3gen_sched_compute (S3Gen sites already follow alloc -> set -> compute). +static void s3gen_sched_alloc(const model_ctx & m, ggml_cgraph * gf) { + // Lazy, single-threaded creation: reached only from run_hift_decode on the + // synthesis thread, after preload + conditioning, so init_cpu_backend() races nothing. + if (!m.sched) { + // Mark weights USAGE_WEIGHTS so sched copies a GPU-resident weight to CPU + // when a CPU-routed op (conv_transpose_1d) consumes it. Done here + // (synthesis thread), not in load_s3gen_gguf (preload thread). + ggml_backend_buffer_set_usage(m.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_t sched_backends[2] = { m.backend, nullptr }; + int n_sched_backends = 1; + if (!::tts_cpp::detail::backend_is_cpu(m.backend)) { + m.cpu_backend = ::tts_cpp::detail::init_cpu_backend(); + if (!m.cpu_backend) throw std::runtime_error("s3gen: init CPU backend for scheduler failed"); + sched_backends[1] = m.cpu_backend; + n_sched_backends = 2; + } + // graph_size matches the HiFT graph's ggml_new_graph_custom capacity (it + // is the only graph routed through sched, and the largest S3Gen graph). 
+ m.sched = ggml_backend_sched_new(sched_backends, /*bufts=*/nullptr, + n_sched_backends, /*graph_size=*/131072, + /*parallel=*/false, /*op_offload=*/false); + if (!m.sched) throw std::runtime_error("s3gen: ggml_backend_sched_new failed"); + } + ggml_backend_sched_reset(m.sched); + if (!ggml_backend_sched_alloc_graph(m.sched, gf)) { + throw std::runtime_error("s3gen_sched_alloc: ggml_backend_sched_alloc_graph failed"); + } +} + +static void s3gen_sched_compute(const model_ctx & m, ggml_cgraph * gf) { + // CPU work inside the sched runs on cpu_backend (GPU primary) or the primary + // itself (CPU-only model). Set its thread count per call, like compute(). + ggml_backend_t cpu_b = m.cpu_backend ? m.cpu_backend : m.backend; + ::tts_cpp::detail::backend_set_n_threads(cpu_b, g_n_threads); + ggml_backend_sched_graph_compute(m.sched, gf); +} + static ggml_backend_t s3gen_init_backend(int n_gpu_layers, bool verbose) { // GPU cascade is centralised in backend_selection.cpp's // `init_gpu_backend` (Adreno 700+ -> OpenCL, every other GPU -> @@ -185,9 +233,12 @@ static void s3gen_model_cache_release() { if (!g_s3gen_cache_entry) return; model_ctx * m = g_s3gen_cache_entry->m.get(); if (m) { + // Free the scheduler before the backends/buffers it references. 
+ if (m->sched) { ggml_backend_sched_free(m->sched); m->sched = nullptr; } if (m->buffer_w) { ggml_backend_buffer_free(m->buffer_w); m->buffer_w = nullptr; } if (m->ctx_w) { ggml_free(m->ctx_w); m->ctx_w = nullptr; } if (m->backend) { ggml_backend_free(m->backend); m->backend = nullptr; } + if (m->cpu_backend) { ggml_backend_free(m->cpu_backend); m->cpu_backend = nullptr; } m->tensors.clear(); } g_s3gen_cache_entry.reset(); @@ -258,6 +309,12 @@ static model_ctx load_s3gen_gguf(const std::string & path, int n_gpu_layers, boo ggml_tensor * src = ggml_get_tensor(tmp_ctx, ggml_get_name(cur)); ggml_backend_tensor_set(cur, ggml_get_data(src), 0, ggml_nbytes(src)); } + // NOTE: ALL scheduler setup (m.sched, m.cpu_backend, and the buffer_w + // USAGE_WEIGHTS flag) is done lazily in s3gen_sched_alloc on the synthesis + // thread — NOT here. load_s3gen_gguf runs in the s3gen_preload background + // thread concurrently with the main thread's reference-audio conditioning; + // doing backend/buffer setup here disturbs that path + // (-> "mel_graph_run: init_cpu_backend failed"). { int64_t k_mf = gguf_find_key(g, "s3gen.meanflow"); @@ -1908,7 +1965,10 @@ static std::vector run_hift_decode(const model_ctx & m, graph_cache & cache = g_hift_graph_cache; const int64_t cache_key = pack_hift_key(T_mel, T_stft); - const bool build_graph = (cache.key != cache_key) || (cache.ctx == nullptr); + // Always rebuild: the scheduler's alloc_graph mutates node->src[] (the GPU<->CPU + // copies around the CPU-routed conv_transpose_1d), so a cached graph can't be + // reused. HiFT builds once per synth — negligible cost. 
+ const bool build_graph = true; if (build_graph) { if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; } if (cache.ctx) { ggml_free(cache.ctx); cache.ctx = nullptr; } @@ -2062,9 +2122,8 @@ static std::vector run_hift_decode(const model_ctx & m, y_trim = ggml_clamp(ctx, y_trim, -0.99f, 0.99f); ggml_set_name(y_trim, "wav"); ggml_set_output(y_trim); ggml_build_forward_expand(gf, y_trim); - - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); - ggml_gallocr_reserve(cache.allocr, gf); + // No gallocr here — this graph is allocated by the model scheduler + // (s3gen_sched_alloc below) so conv_transpose_1d can be routed to CPU. } // end build_graph // Cached scaffolding (pulled outside build_graph too — when the graph @@ -2073,7 +2132,28 @@ static std::vector run_hift_decode(const model_ctx & m, const std::vector & ik_data = cached_istft_kernel(n_fft); const std::vector & ws_data = cached_window_sum(T_stft, n_fft, hop); - ggml_gallocr_alloc_graph(cache.allocr, gf); + // Capability-gate the scheduler. The [GPU,CPU] ggml_backend_sched exists only + // to route CONV_TRANSPOSE_1D to CPU because ggml-opencl / ggml-vulkan lack that + // kernel. A backend that can run every op in this graph itself (Metal, CUDA, + // CPU) does not need the scheduler — and the scheduler's graph-split aborts on + // the iOS Metal driver — so run those directly on the primary backend (the + // pre-scheduler path). Only use the scheduler when the primary backend can't + // run some op. Generic: asks the actual backend about the actual graph, with + // no platform / backend-name hardcoding, so iOS Metal is not regressed by the + // Android-motivated routing. 
+ bool primary_runs_all = true; + const int hift_n_nodes = ggml_graph_n_nodes(gf); + for (int i = 0; i < hift_n_nodes; ++i) { + if (!ggml_backend_supports_op(m.backend, ggml_graph_node(gf, i))) { primary_runs_all = false; break; } + } + ggml_gallocr_t hift_allocr = nullptr; + if (primary_runs_all) { + hift_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); + ggml_gallocr_reserve(hift_allocr, gf); + ggml_gallocr_alloc_graph(hift_allocr, gf); + } else { + s3gen_sched_alloc(m, gf); + } ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "mel_in"), mel.data(), 0, mel.size()*sizeof(float)); ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "s_in"), s_stft.data(), 0, s_stft.size()*sizeof(float)); ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "istft_k"), ik_data.data(),0, ik_data.size()*sizeof(float)); @@ -2100,11 +2180,19 @@ static std::vector run_hift_decode(const model_ctx & m, ggml_backend_tensor_set(ggml_graph_get_tensor(gf, e.first.c_str()), inv.data(), 0, inv.size()*sizeof(float)); } - compute(m.backend, gf); + if (primary_runs_all) { + compute(m.backend, gf); + } else { + s3gen_sched_compute(m, gf); + } ggml_tensor * y_trim_out = ggml_graph_get_tensor(gf, "wav"); std::vector wav(ggml_nelements(y_trim_out)); ggml_backend_tensor_get(y_trim_out, wav.data(), 0, ggml_nbytes(y_trim_out)); + // Free the direct-path allocr only AFTER reading the output — y_trim_out's + // data lives in this buffer (freeing it earlier is a use-after-free in the + // tensor_get above). nullptr on the scheduler path, so the guard covers both. 
+ if (hift_allocr) ggml_gallocr_free(hift_allocr); return wav; } diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index cc87c09e084..8e45f8191d9 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -135,7 +136,8 @@ struct Engine::Impl { ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir); } - if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, false)) { + if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, + std::getenv("QVAC_VERBOSE") != nullptr)) { throw std::runtime_error("Supertonic Engine: failed to load GGUF: " + opts.model_gguf_path); } diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp index 1c33ebe41e7..eb4420c38a4 100644 --- a/tts-cpp/src/supertonic_gguf.cpp +++ b/tts-cpp/src/supertonic_gguf.cpp @@ -212,6 +212,24 @@ void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * grap ggml_backend_graph_compute(model.backend, graph); } +void supertonic_sched_alloc(const supertonic_model & model, ggml_cgraph * graph) { + ggml_backend_sched_reset(model.sched); + if (!ggml_backend_sched_alloc_graph(model.sched, graph)) { + throw std::runtime_error("supertonic_sched_alloc: ggml_backend_sched_alloc_graph failed"); + } +} + +void supertonic_sched_compute(const supertonic_model & model, ggml_cgraph * graph) { + // CPU work inside the sched runs on cpu_backend (GPU primary) or on the + // primary itself (CPU-only model). Set its thread count per-call, mirroring + // the single-backend path above. + ggml_backend_t cpu_b = model.cpu_backend ? 
model.cpu_backend : model.backend; + if (model.n_threads > 0) { + ::tts_cpp::detail::backend_set_n_threads(cpu_b, model.n_threads); + } + ggml_backend_sched_graph_compute(model.sched, graph); +} + static void bind_vocoder_weights(supertonic_model & model) { auto & v = model.vocoder; v.normalizer_scale = require_source_tensor(model, "vocoder:tts.ttl.normalizer.scale"); @@ -310,6 +328,15 @@ bool load_supertonic_gguf(const std::string & path, model.buffer_w = ggml_backend_alloc_ctx_tensors(model.ctx_w, model.backend); if (!model.buffer_w) throw std::runtime_error("ggml_backend_alloc_ctx_tensors failed"); + // Mark the weight buffer as WEIGHTS so the scheduler treats these + // tensors as immovable and inserts GPU->CPU copies when a CPU-only op + // (the GGML_OP_CUSTOM kernels in the vector estimator / vocoder) + // consumes them. Without this they default to USAGE_ANY: sched's + // weight-aware split/copy path (ggml-backend.cpp) does not fire, some + // weights stay on the GPU buffer, and the CPU custom op dereferences a + // device offset -> SIGSEGV. Standard llama.cpp/whisper.cpp pattern. + ggml_backend_buffer_set_usage(model.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (ggml_tensor * cur = ggml_get_first_tensor(model.ctx_w); cur; cur = ggml_get_next_tensor(model.ctx_w, cur)) { @@ -348,6 +375,31 @@ bool load_supertonic_gguf(const std::string & path, } bind_vocoder_weights(model); + + // Build the scheduler. With a GPU primary, add a CPU backend so + // ops the GPU can't run (GGML_OP_CUSTOM, and any FA the driver + // rejects) are routed to CPU rather than silently skipped. With a + // CPU primary, the sched is a single-backend pass-through (no + // second CPU backend created). 
+ { + ggml_backend_t backends[2] = { model.backend, nullptr }; + int n_backends = 1; + if (!::tts_cpp::detail::backend_is_cpu(model.backend)) { + model.cpu_backend = ::tts_cpp::detail::init_cpu_backend(); + if (!model.cpu_backend) { + throw std::runtime_error("init CPU backend for scheduler failed"); + } + backends[1] = model.cpu_backend; + n_backends = 2; + } + model.sched = ggml_backend_sched_new(backends, /*bufts=*/nullptr, + n_backends, /*graph_size=*/ 8192, + /*parallel=*/ false, + /*op_offload=*/ false); + if (!model.sched) { + throw std::runtime_error("ggml_backend_sched_new failed"); + } + } } catch (const std::exception & e) { fprintf(stderr, "load_supertonic_gguf: %s\n", e.what()); gguf_free(gguf_ctx); @@ -374,6 +426,11 @@ void free_supertonic_model(supertonic_model & model) { if (model.generation_id != 0) { unregister_supertonic_alive(model.generation_id); } + // Free the scheduler before the backends/buffers it references. + if (model.sched) { + ggml_backend_sched_free(model.sched); + model.sched = nullptr; + } if (model.buffer_w) { ggml_backend_buffer_free(model.buffer_w); model.buffer_w = nullptr; @@ -382,6 +439,10 @@ void free_supertonic_model(supertonic_model & model) { ggml_backend_free(model.backend); model.backend = nullptr; } + if (model.cpu_backend) { + ggml_backend_free(model.cpu_backend); + model.cpu_backend = nullptr; + } if (model.ctx_w) { ggml_free(model.ctx_w); model.ctx_w = nullptr; diff --git a/tts-cpp/src/supertonic_internal.h b/tts-cpp/src/supertonic_internal.h index f0587a72cff..7e157f388f8 100644 --- a/tts-cpp/src/supertonic_internal.h +++ b/tts-cpp/src/supertonic_internal.h @@ -74,6 +74,14 @@ struct supertonic_model { uint64_t generation_id = 0; int n_threads = 0; ggml_backend_t backend = nullptr; + // Scheduler so ops the GPU backend can't run (notably GGML_OP_CUSTOM + // CPU kernels in the vector estimator / vocoder) auto-route to CPU + // instead of being silently skipped on a single backend. 
Always + // created: [backend, cpu_backend] for a GPU primary, or a degenerate + // [backend] when the primary is itself CPU. cpu_backend stays null in + // the CPU-only case (no second CPU backend). + ggml_backend_t cpu_backend = nullptr; + ggml_backend_sched_t sched = nullptr; ggml_context * ctx_w = nullptr; ggml_backend_buffer_t buffer_w = nullptr; @@ -94,6 +102,16 @@ void free_supertonic_model(supertonic_model & model); void supertonic_set_n_threads(supertonic_model & model, int n_threads); void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph); +// Scheduler-based alloc + compute (Option A), used by stages migrated off +// the per-graph ggml_gallocr. Pairing contract at each call site: +// supertonic_sched_alloc(model, gf); // reset + allocate via sched +// ggml_backend_tensor_set(input_leaf, ...); // inputs now have memory +// supertonic_sched_compute(model, gf); // run (routes customs -> CPU) +// The graph topology may be a reused thread_local cache; sched_reset does not +// touch the user graph, so caches stay valid across calls. 
+void supertonic_sched_alloc(const supertonic_model & model, ggml_cgraph * graph); +void supertonic_sched_compute(const supertonic_model & model, ggml_cgraph * graph); + ggml_tensor * require_tensor(const supertonic_model & model, const std::string & name); ggml_tensor * require_source_tensor(const supertonic_model & model, const std::string & source_name); diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp index b4da8328f91..de60ae8c3e1 100644 --- a/tts-cpp/src/supertonic_vector_estimator.cpp +++ b/tts-cpp/src/supertonic_vector_estimator.cpp @@ -62,13 +62,13 @@ void profile_vector_compute(const supertonic_model & model, int step, const char * island) { if (!vector_profile_enabled()) { - supertonic_graph_compute(model, graph); + supertonic_sched_compute(model, graph); return; } auto & state = vector_profile(); const auto t0 = std::chrono::steady_clock::now(); const double pre_ms = std::chrono::duration(t0 - state.last).count(); - supertonic_graph_compute(model, graph); + supertonic_sched_compute(model, graph); const auto t1 = std::chrono::steady_clock::now(); const double ms = std::chrono::duration(t1 - t0).count(); state.last = t1; @@ -686,12 +686,9 @@ void build_text_attention_cache(vector_text_attention_cache & cache, ggml_set_name(out, "vector_attn_out"); ggml_set_output(out); ggml_build_forward_expand(cache.gf, out); - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector text attention cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vector text attention cache failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. 
No per-cache gallocr; + // cache.allocr stays null (free_*_cache's safe_gallocr_free no-ops on it). } std::vector run_text_attention_cache(vector_text_attention_cache & cache, @@ -708,12 +705,11 @@ std::vector run_text_attention_cache(vector_text_attention_cache & cache, int current_step, const char * island, std::vector * ctx_trace) { - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.q_len != q_len || cache.kv_len != kv_len || - cache.n_heads != n_heads || cache.head_dim != head_dim || - cache.out_w_source != out_w_source || cache.out_b_source != out_b_source) { - build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source); - } + // Rebuild every call: ggml_backend_sched_alloc_graph mutates node->src[] when it + // inserts cross-backend GPU<->CPU copies, corrupting a graph reused across denoise + // steps. Build is microseconds vs millisecond compute, so always rebuilding is free. + build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.q_tc_in, q_tc.data(), 0, q_tc.size()*sizeof(float)); ggml_backend_tensor_set(cache.k_tc_in, k_tc.data(), 0, k_tc.size()*sizeof(float)); ggml_backend_tensor_set(cache.v_tc_in, v_tc.data(), 0, v_tc.size()*sizeof(float)); @@ -869,12 +865,8 @@ void build_group_graph_cache(vector_group_graph_cache & cache, ggml_set_name(k, k_name.c_str()); ggml_set_output(k); ggml_build_forward_expand(cache.gf, k); ggml_set_name(v, v_name.c_str()); ggml_set_output(v); ggml_build_forward_expand(cache.gf, v); - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector group cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vector group cache failed"); - } - 
ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr. } vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache, @@ -899,20 +891,14 @@ vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache const std::string & v_name, const char * island, std::vector * trace) { - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.L != L || cache.C != C || cache.text_len != text_len || - cache.group != group || cache.conv_block != conv_block || - cache.linear_block != linear_block || cache.post_block != post_block || - cache.trace_outputs != (trace != nullptr) || - cache.matmul_source != matmul_source || - cache.q_matmul_source != q_matmul_source || cache.k_matmul_source != k_matmul_source || - cache.v_matmul_source != v_matmul_source) { - build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block, - text_len, q_matmul_source, k_matmul_source, v_matmul_source, - q_name, k_name, v_name, - trace != nullptr); - } + // Rebuild every call — scheduler alloc corrupts a reused graph; see + // run_text_attention_cache for the full rationale. 
+ build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block, + text_len, q_matmul_source, k_matmul_source, v_matmul_source, + q_name, k_name, v_name, + trace != nullptr); std::vector x_raw = pack_time_channel_for_ggml(x_tc, L, C); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.x_in, x_raw.data(), 0, x_raw.size()*sizeof(float)); ggml_backend_tensor_set(cache.temb_in, temb.data(), 0, temb.size()*sizeof(float)); ggml_backend_tensor_set(cache.text_in, text_lc_host, 0, (size_t) text_len * 256 * sizeof(float)); @@ -1069,12 +1055,8 @@ void build_res_style_qkv_cache(vector_res_style_qkv_cache & cache, ggml_set_name(sk, k_name.c_str()); ggml_set_output(sk); ggml_build_forward_expand(cache.gf, sk); ggml_set_name(sv, v_name.c_str()); ggml_set_output(sv); ggml_build_forward_expand(cache.gf, sv); - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new res-style-qkv failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve res-style-qkv failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr. 
} vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & cache, @@ -1101,19 +1083,15 @@ vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & const char * island, std::vector * trace) { const bool want_trace = trace != nullptr; - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.L != L || cache.C != C || - cache.norm_block != norm_block || cache.post_block != post_block || - cache.style_block != style_block || cache.trace_outputs != want_trace || - cache.q_matmul_source != q_matmul_source || cache.k_matmul_source != k_matmul_source || - cache.v_matmul_source != v_matmul_source) { - build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block, - q_matmul_source, k_matmul_source, v_matmul_source, - residual_name, norm_name, post_name, q_name, k_name, v_name, - want_trace); - } + // Rebuild every call — scheduler alloc corrupts a reused graph; see + // run_text_attention_cache for the full rationale. 
+ build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block, + q_matmul_source, k_matmul_source, v_matmul_source, + residual_name, norm_name, post_name, q_name, k_name, v_name, + want_trace); std::vector lhs_raw = pack_time_channel_for_ggml(lhs_tc, L, C); std::vector rhs_raw = pack_time_channel_for_ggml(rhs_tc, L, C); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.lhs_in, lhs_raw.data(), 0, lhs_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.rhs_in, rhs_raw.data(), 0, rhs_raw.size() * sizeof(float)); ggml_backend_tensor_set(cache.style_v_in, style_v_raw.data(), 0, style_v_raw.size() * sizeof(float)); @@ -1273,12 +1251,8 @@ void build_tail_graph_cache(vector_tail_graph_cache & cache, ggml_build_forward_expand(cache.gf, next); } - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector tail cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vector tail cache failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc + // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr. } std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, @@ -1292,12 +1266,9 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, int current_step, int total_steps, std::vector * trace) { - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.L != L || cache.C != C || - cache.Cin != Cin || cache.total_steps != total_steps || - cache.trace_outputs != (trace != nullptr)) { - build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr); - } + // Rebuild every call — scheduler alloc corrupts a reused graph; see + // run_text_attention_cache for the full rationale. 
+ build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr); std::vector tail_in_raw = pack_time_channel_for_ggml(x_tc, L, C); std::vector noise_tc((size_t)L*Cin); for (int t = 0; t < L; ++t) { @@ -1306,6 +1277,7 @@ std::vector run_tail_graph_cache(vector_tail_graph_cache & cache, } } std::vector noise_raw = pack_time_channel_for_ggml(noise_tc, L, Cin); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.tail_in, tail_in_raw.data(), 0, tail_in_raw.size()*sizeof(float)); ggml_backend_tensor_set(cache.tail_mask, latent_mask, 0, (size_t)L*sizeof(float)); ggml_backend_tensor_set(cache.tail_noise, noise_raw.data(), 0, noise_raw.size()*sizeof(float)); @@ -2108,17 +2080,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, ggml_set_output(v_t); ggml_build_forward_expand(gf, v_t); - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!allocr) { - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_new failed"); - } - if (!ggml_gallocr_reserve(allocr, gf)) { - ggml_gallocr_free(allocr); - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_reserve failed"); - } - ggml_gallocr_alloc_graph(allocr, gf); + supertonic_sched_alloc(model, gf); ggml_backend_tensor_set(x, noisy_latent, 0, (size_t) L * Cin * sizeof(float)); ggml_backend_tensor_set(mask, latent_mask, 0, (size_t) L * sizeof(float)); @@ -2217,17 +2179,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.5.norm.norm.bias")); ggml_set_name(style_norm, "ve_style0_norm"); ggml_set_output(style_norm); ggml_build_forward_expand(srgf, style_norm); - ggml_gallocr_t srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!srallocr) { - ggml_free(srctx); - throw std::runtime_error("ggml_gallocr_new style residual failed"); - } - if (!ggml_gallocr_reserve(srallocr, srgf)) { - 
ggml_gallocr_free(srallocr); - ggml_free(srctx); - throw std::runtime_error("ggml_gallocr_reserve style residual failed"); - } - ggml_gallocr_alloc_graph(srallocr, srgf); + supertonic_sched_alloc(model, srgf); std::vector style_out_raw = pack_time_channel_for_ggml(style_out_ggml, L, C); std::vector style_lhs_raw = pack_time_channel_for_ggml(post_ggml, L, C); ggml_backend_tensor_set(style_out_in, style_out_raw.data(), 0, style_out_raw.size()*sizeof(float)); @@ -2236,7 +2188,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_style0_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(srgf, "ve_style0_residual"))}); std::vector style_norm_ggml = tensor_to_time_channel(ggml_graph_get_tensor(srgf, "ve_style0_norm")); PUSH_GGML_TRACE({"ve_style0_norm", {L, C}, style_norm_ggml}); - ggml_gallocr_free(srallocr); ggml_free(srctx); thread_local vector_group_graph_cache g1_group_cache; @@ -2321,17 +2272,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.11.norm.norm.bias")); ggml_set_name(g1_style_norm, "ve_g1_style_norm"); ggml_set_output(g1_style_norm); ggml_build_forward_expand(g1srgf, g1_style_norm); - ggml_gallocr_t g1srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!g1srallocr) { - ggml_free(g1srctx); - throw std::runtime_error("ggml_gallocr_new group1 style residual failed"); - } - if (!ggml_gallocr_reserve(g1srallocr, g1srgf)) { - ggml_gallocr_free(g1srallocr); - ggml_free(g1srctx); - throw std::runtime_error("ggml_gallocr_reserve group1 style residual failed"); - } - ggml_gallocr_alloc_graph(g1srallocr, g1srgf); + supertonic_sched_alloc(model, g1srgf); std::vector g1_style_lhs_raw = pack_time_channel_for_ggml(g1_block10, L, C); std::vector g1_style_out_raw = pack_time_channel_for_ggml(g1_style_out, L, C); ggml_backend_tensor_set(g1_style_lhs, g1_style_lhs_raw.data(), 0, 
g1_style_lhs_raw.size()*sizeof(float)); @@ -2340,7 +2281,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_g1_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g1srgf, "ve_g1_style_residual"))}); std::vector g1_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g1srgf, "ve_g1_style_norm")); PUSH_GGML_TRACE({"ve_g1_style_norm", {L, C}, g1_style_norm_vec}); - ggml_gallocr_free(g1srallocr); ggml_free(g1srctx); thread_local vector_group_graph_cache g2_group_cache; @@ -2425,17 +2365,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.17.norm.norm.bias")); ggml_set_name(g2_style_norm, "ve_g2_style_norm"); ggml_set_output(g2_style_norm); ggml_build_forward_expand(g2srgf, g2_style_norm); - ggml_gallocr_t g2srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!g2srallocr) { - ggml_free(g2srctx); - throw std::runtime_error("ggml_gallocr_new group2 style residual failed"); - } - if (!ggml_gallocr_reserve(g2srallocr, g2srgf)) { - ggml_gallocr_free(g2srallocr); - ggml_free(g2srctx); - throw std::runtime_error("ggml_gallocr_reserve group2 style residual failed"); - } - ggml_gallocr_alloc_graph(g2srallocr, g2srgf); + supertonic_sched_alloc(model, g2srgf); std::vector g2_style_lhs_raw = pack_time_channel_for_ggml(g2_block16, L, C); std::vector g2_style_out_raw = pack_time_channel_for_ggml(g2_style_out, L, C); ggml_backend_tensor_set(g2_style_lhs, g2_style_lhs_raw.data(), 0, g2_style_lhs_raw.size()*sizeof(float)); @@ -2444,7 +2374,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_g2_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g2srgf, "ve_g2_style_residual"))}); std::vector g2_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g2srgf, "ve_g2_style_norm")); 
PUSH_GGML_TRACE({"ve_g2_style_norm", {L, C}, g2_style_norm_vec}); - ggml_gallocr_free(g2srallocr); ggml_free(g2srctx); thread_local vector_group_graph_cache g3_group_cache; @@ -2529,17 +2458,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.23.norm.norm.bias")); ggml_set_name(g3_style_norm, "ve_g3_style_norm"); ggml_set_output(g3_style_norm); ggml_build_forward_expand(g3srgf, g3_style_norm); - ggml_gallocr_t g3srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!g3srallocr) { - ggml_free(g3srctx); - throw std::runtime_error("ggml_gallocr_new group3 style residual failed"); - } - if (!ggml_gallocr_reserve(g3srallocr, g3srgf)) { - ggml_gallocr_free(g3srallocr); - ggml_free(g3srctx); - throw std::runtime_error("ggml_gallocr_reserve group3 style residual failed"); - } - ggml_gallocr_alloc_graph(g3srallocr, g3srgf); + supertonic_sched_alloc(model, g3srgf); std::vector g3_style_lhs_raw = pack_time_channel_for_ggml(g3_block22, L, C); std::vector g3_style_out_raw = pack_time_channel_for_ggml(g3_style_out, L, C); ggml_backend_tensor_set(g3_style_lhs, g3_style_lhs_raw.data(), 0, g3_style_lhs_raw.size()*sizeof(float)); @@ -2548,7 +2467,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, PUSH_GGML_TRACE({"ve_g3_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g3srgf, "ve_g3_style_residual"))}); std::vector g3_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g3srgf, "ve_g3_style_norm")); PUSH_GGML_TRACE({"ve_g3_style_norm", {L, C}, g3_style_norm_vec}); - ggml_gallocr_free(g3srallocr); ggml_free(g3srctx); thread_local vector_tail_graph_cache tail_cache; @@ -2557,7 +2475,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, include_ggml_trace ? 
&ggml_trace : nullptr); if (next_latent_tc_out) *next_latent_tc_out = next_latent_tc; - ggml_gallocr_free(allocr); ggml_free(ctx); profile_vector_step_end(current_step); if (error) error->clear(); diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp index 5fc86261d0c..3ed254d661e 100644 --- a/tts-cpp/src/supertonic_vocoder.cpp +++ b/tts-cpp/src/supertonic_vocoder.cpp @@ -420,12 +420,8 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache, ggml_build_forward_expand(cache.gf, x); cache.wav = x; - cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vocoder cache failed"); - if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { - throw std::runtime_error("ggml_gallocr_reserve vocoder cache failed"); - } - ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + // Allocation is per-call via the model scheduler (supertonic_sched_alloc in + // the forward path), which routes GGML_OP_CUSTOM ops to CPU. No gallocr. } void linear1x1(const std::vector & x, int L, int IC, @@ -726,18 +722,18 @@ bool supertonic_vocoder_forward_ggml(const supertonic_model & model, profile_vocoder_checkpoint("bn_params", profile_last); thread_local vocoder_graph_cache cache; - if (cache.model != &model || cache.generation_id != model.generation_id || - cache.latent_len != latent_len) { - build_supertonic_vocoder_cache(cache, model, latent_len); - } + // Rebuild every call: the scheduler's alloc_graph mutates node->src[], so a + // cached graph can't be reused (full rationale in the vector estimator). 
+ build_supertonic_vocoder_cache(cache, model, latent_len); profile_vocoder_checkpoint("graph_cache", profile_last); + supertonic_sched_alloc(model, cache.gf); ggml_backend_tensor_set(cache.x_in, x_in.data(), 0, x_in.size() * sizeof(float)); ggml_backend_tensor_set(cache.bn_scale, bn_scale.data(), 0, bn_scale.size() * sizeof(float)); ggml_backend_tensor_set(cache.bn_shift, bn_shift.data(), 0, bn_shift.size() * sizeof(float)); profile_vocoder_checkpoint("set_inputs", profile_last); - supertonic_graph_compute(model, cache.gf); + supertonic_sched_compute(model, cache.gf); profile_vocoder_checkpoint("compute", profile_last); wav_out = ggml_tensor_to_time_channel(cache.wav); profile_vocoder_checkpoint("readback", profile_last); @@ -934,17 +930,7 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model, ggml_set_output(cur); ggml_build_forward_expand(gf, cur); - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - if (!allocr) { - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_new failed"); - } - if (!ggml_gallocr_reserve(allocr, gf)) { - ggml_gallocr_free(allocr); - ggml_free(ctx); - throw std::runtime_error("ggml_gallocr_reserve failed"); - } - ggml_gallocr_alloc_graph(allocr, gf); + supertonic_sched_alloc(model, gf); std::vector x_host = unpack_latent_ggml_layout(model, latent, latent_len); ggml_backend_tensor_set(x_in, x_host.data(), 0, x_host.size() * sizeof(float)); @@ -959,7 +945,7 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model, } ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "trace_bn_scale"), bn_scale_host.data(), 0, bn_scale_host.size() * sizeof(float)); ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "trace_bn_shift"), bn_shift_host.data(), 0, bn_shift_host.size() * sizeof(float)); - supertonic_graph_compute(model, gf); + supertonic_sched_compute(model, gf); trace_out.push_back({"unpack", {T0, C_latent}, unpack_latent_scalar(model, latent, latent_len)}); 
trace_out.push_back({"denorm", {T0, C_latent}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "denorm"))}); @@ -978,7 +964,6 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model, trace_out.push_back({"head1", {T0, (int) model.vocoder.head1_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "head1"))}); trace_out.push_back({"prelu", {T0, (int) model.vocoder.head1_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "prelu"))}); trace_out.push_back({"wav", {T0, (int) model.vocoder.head2_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "wav"))}); - ggml_gallocr_free(allocr); ggml_free(ctx); if (error) error->clear(); return true;