diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp
index 2c36287827c..065adf90216 100644
--- a/tts-cpp/src/backend_selection.cpp
+++ b/tts-cpp/src/backend_selection.cpp
@@ -212,24 +212,38 @@ void ensure_backends_loaded() {
 // reach the same decision on the same hardware.
 int parse_adreno_version(const char * s) {
     if (!s) return -1;
-    const char * p = std::strstr(s, "Adreno");
-    if (!p) p = std::strstr(s, "adreno");
-    if (!p) return -1;
-    p += 6; // strlen("Adreno") == strlen("adreno") == 6
-    while (*p && !(*p >= '0' && *p <= '9') && *p != 'X' && *p != 'x') ++p;
-    if (!*p) return -1;
-    if (*p == 'X' || *p == 'x') {
-        ++p;
-        if (*p < '0' || *p > '9') return -1; // "Xclipse" etc. is not Adreno-X
-        return 800;
-    }
-    int v = 0;
-    while (*p >= '0' && *p <= '9') {
-        v = v * 10 + (*p - '0');
-        ++p;
-        if (v > 100000) return -1;
+    // Returns the largest plausible model number (>= 100; "Adreno X<digit>" is
+    // tiered as 800) found after ANY "Adreno"/"adreno" marker, else -1. All
+    // markers are scanned because some OpenCL device strings embed the API
+    // version first, e.g. "QUALCOMM Adreno(TM) (OpenCL 3.0 Adreno(TM) 740)":
+    // parsing only the first marker yields 3 (from "OpenCL 3.0") and mis-tiers
+    // the GPU below Vulkan; the second marker recovers the real version (740).
+    int best = -1;
+    for (const char * p = s; *p; ++p) {
+        if (std::strncmp(p, "Adreno", 6) != 0 &&
+            std::strncmp(p, "adreno", 6) != 0) {
+            continue;
+        }
+        const char * q = p + 6; // strlen("Adreno") == strlen("adreno") == 6
+        while (*q && !(*q >= '0' && *q <= '9') && *q != 'X' && *q != 'x') ++q; // skip to first digit or X/x
+        if (!*q) continue; // nothing usable after this marker
+        if (*q == 'X' || *q == 'x') {
+            if (*(q + 1) >= '0' && *(q + 1) <= '9') { // "Adreno X1-..." family
+                if (800 > best) best = 800;
+            }
+            continue; // marker handled either way; a bare X/x ("Xclipse" etc.) is not Adreno-X
+        }
+        int v = 0;
+        bool overflow = false;
+        while (*q >= '0' && *q <= '9') {
+            v = v * 10 + (*q - '0');
+            ++q;
+            if (v > 100000) { overflow = true; break; } // implausibly long digit run: discard
+        }
+        // Values below 100 are API-version noise like "OpenCL 3.0"; keep only the max.
+        if (!overflow && v >= 100 && v > best) best = v;
     }
-    return v;
+    return best;
 }
 
 bool is_adreno_6xx(const char * s) {
@@ -242,14 +256,48 @@ bool is_adreno_700plus(const char * s) {
     return v >= 700;
 }
 
+// True if name or desc contains "adreno" or "qualcomm" (either may be null).
+// Unlike parse_adreno_version (which needs a model number >= 100 and so
+// returns -1 for the bare OpenCL "QUALCOMM Adreno(TM)" string), this is a
+// vendor check used to gate Android GPU selection. ASCII case-insensitive
+// because the strings vary in capitalisation: ggml-opencl reports
+// CL_DEVICE_NAME ("QUALCOMM Adreno(TM)") and ggml-vulkan reports the Vulkan
+// deviceName ("Adreno (TM) 740").
+bool is_qualcomm_adreno(const char * name, const char * desc) {
+    auto contains_ci = [](const char * hay, const char * needle) -> bool {
+        if (!hay || !needle) return false; // a null string never matches
+        for (const char * h = hay; *h; ++h) { // try every start offset in hay
+            const char * a = h;
+            const char * b = needle;
+            while (*a && *b) {
+                const char ca = (*a >= 'A' && *a <= 'Z') ? char(*a + 32) : *a; // fold A-Z to a-z
+                const char cb = (*b >= 'A' && *b <= 'Z') ? char(*b + 32) : *b;
+                if (ca != cb) break;
+                ++a;
+                ++b;
+            }
+            if (!*b) return true; // needle fully consumed: match starting at h
+        }
+        return false;
+    };
+    return contains_ci(name, "adreno")   || contains_ci(desc, "adreno") ||
+           contains_ci(name, "qualcomm") || contains_ci(desc, "qualcomm");
+}
+
 // Pick a GPU backend using the same tier policy as parakeet-cpp's
 // `init_gpu_backend` / llm-llamacpp's BackendSelection: ggml-opencl
 // is only used when an Adreno 700+ device is present (where its
 // kernels are validated and faster than Vulkan); every other GPU
-// (Vulkan, Metal, CUDA, Mali, Intel iGPU, ...) goes through the
-// non-OpenCL preference. Adreno 6xx OpenCL is known broken
-// (incorrect outputs) and is force-skipped unless the caller opts
-// in via `TTS_CPP_ALLOW_ADRENO_6XX=1`.
+// (Vulkan, Metal, CUDA, Intel iGPU, ...) goes through the non-OpenCL
+// preference. Adreno 6xx OpenCL is known broken (incorrect outputs)
+// and is force-skipped unless the caller opts in via
+// `TTS_CPP_ALLOW_ADRENO_6XX=1`.
+//
+// On Android the device walk is additionally gated to Qualcomm Adreno
+// only: other Android GPU vendors are not validated and at least one
+// (ARM Mali / Tensor) aborts the host process from inside graph
+// compute, so they are skipped and the engine falls back to CPU.
+// Desktop GPU vendors are unaffected.
 //
 // Routed exclusively through the ggml-backend registry
 // (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls
@@ -292,6 +340,29 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers,
         const char * reg_name = dev_reg_name(dev);
         const bool   is_opencl = reg_name && std::strcmp(reg_name, "OpenCL") == 0;
 
+#if defined(__ANDROID__)
+        // Android GPU allowlist: only Qualcomm Adreno is validated for the
+        // tts-cpp GPU backends (OpenCL on Adreno 700+, Vulkan as the
+        // bring-up fallback). Other Android GPU vendors are not validated,
+        // and at least one (ARM Mali / Tensor) aborts the whole host
+        // process from inside ggml_backend_graph_compute via GGML_ASSERT ->
+        // ggml_abort(), which cannot be caught from C++. Skip non-Adreno
+        // devices so the policy falls through to CPU instead of risking a
+        // fatal abort on an unvalidated driver.
+        if (!is_qualcomm_adreno(name, desc)) {
+            if (verbose) {
+                fprintf(stderr,
+                    "%s: Android GPU '%s' (%s) is not Qualcomm Adreno; "
+                    "skipping (only Adreno is validated on Android; "
+                    "falling through to CPU)\n",
+                    log_prefix,
+                    name ? name : "?",
+                    desc ? desc : "?");
+            }
+            continue;
+        }
+#endif
+
         const int adreno_v = std::max(parse_adreno_version(name),
                                       parse_adreno_version(desc));
         if (adreno_v > max_adreno_version) max_adreno_version = adreno_v;
@@ -331,10 +402,11 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers,
     //   1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan
     //      on Snapdragon 8 Gen 2/3/4 etc.).
     //   2. Anything else with a non-OpenCL GPU: prefer that
-    //      (Vulkan on all non-Adreno Android, Metal on Apple, CUDA
-    //      on Linux/Windows desktop, Mali iGPU via Vulkan, ...).
-    //   3. Last resort: any other OpenCL device (e.g. desktop OpenCL
-    //      or non-Adreno mobile when no Vulkan is registered).
+    //      (Adreno Vulkan on Android — non-Adreno is filtered out
+    //      above; Metal on Apple; CUDA / Vulkan on Linux/Windows
+    //      desktop).
+    //   3. Last resort: any other OpenCL device (e.g. desktop OpenCL,
+    //      or Adreno OpenCL whose version string lacked a model number).
     auto try_init = [&](const std::vector<Cand> & bucket) -> ggml_backend_t {
         for (const Cand & c : bucket) {
             ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr);
diff --git a/tts-cpp/src/backend_selection.h b/tts-cpp/src/backend_selection.h
index 60c99104e9f..7054cb7273c 100644
--- a/tts-cpp/src/backend_selection.h
+++ b/tts-cpp/src/backend_selection.h
@@ -87,4 +87,10 @@ int parse_adreno_version(const char * s);
 bool is_adreno_6xx(const char * s);
 bool is_adreno_700plus(const char * s);
 
+// Vendor check (name OR description, ASCII case-insensitive): true when
+// either contains "adreno" or "qualcomm". Unlike parse_adreno_version it
+// needs no model number, so it also matches the bare OpenCL "QUALCOMM
+// Adreno(TM)" string. Used to gate Android GPU selection to Qualcomm Adreno.
+bool is_qualcomm_adreno(const char * name, const char * desc);
+
 } // namespace tts_cpp::detail
diff --git a/tts-cpp/src/chatterbox_cli.cpp b/tts-cpp/src/chatterbox_cli.cpp
index d112adcc8a4..c70ad097352 100644
--- a/tts-cpp/src/chatterbox_cli.cpp
+++ b/tts-cpp/src/chatterbox_cli.cpp
@@ -320,6 +320,7 @@ struct cli_params {
     std::string tokens_file;     // optional pre-tokenized speech tokens (skips T3)
     std::string text;            // input text for T3
     std::string output;          // legacy: speech-tokens output file (if set, write tokens)
+    std::string dump_mel_path;   // optional: dump S3Gen intermediates (_mu/_step0_dxdt/mel) to .npy for debugging
     // S3Gen + HiFT vocoder:
     std::string s3gen_gguf;      // enables full text → wav pipeline
     std::string out_wav;         // wav output path (requires --s3gen-gguf)
@@ -450,6 +451,7 @@ static void print_usage(const char * argv0) {
     fprintf(stderr, "                          With --s3gen-gguf this is interpreted as *speech* tokens\n");
     fprintf(stderr, "                          and the T3 step is skipped.\n");
     fprintf(stderr, "  --output PATH           Write generated speech tokens to PATH (text mode).\n");
+    fprintf(stderr, "  --dump-mel-path PATH    Debug: dump S3Gen mel to PATH, encoder to PATH_mu.npy, CFM step0 to PATH_step0_dxdt.npy.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  --s3gen-gguf PATH       Enables the full text -> wav pipeline (S3Gen + HiFT).\n");
     fprintf(stderr, "  --out PATH              Output wav file when --s3gen-gguf is set.\n");
@@ -590,6 +592,7 @@ static bool parse_args(int argc, char ** argv, cli_params & params) {
         else if (arg == "--text")           { auto v = next("--text");           if (!v) return false; params.text = v; }
         else if (arg == "--tokens-file")    { auto v = next("--tokens-file");    if (!v) return false; params.tokens_file = v; }
         else if (arg == "--output")         { auto v = next("--output");         if (!v) return false; params.output = v; }
+        else if (arg == "--dump-mel-path")  { auto v = next("--dump-mel-path");   if (!v) return false; params.dump_mel_path = v; }
         else if (arg == "--s3gen-gguf")     { auto v = next("--s3gen-gguf");     if (!v) return false; params.s3gen_gguf = v; }
         else if (arg == "--out")            { auto v = next("--out");            if (!v) return false; params.out_wav = v; }
         else if (arg == "--ref-dir")        { auto v = next("--ref-dir");        if (!v) return false; params.ref_dir = v; }
@@ -982,6 +985,7 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             opts.verbose         = params.verbose;
             opts.n_gpu_layers    = params.n_gpu_layers;
             opts.cfm_steps       = params.cfm_steps;
+            opts.dump_mel_path   = params.dump_mel_path;
             opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn;
             if (!params.reference_audio.empty()) {
                 if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
@@ -1265,6 +1269,7 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             // chunk; --cfm-steps falls in as the per-chunk default below
             // (`stream_cfm_steps > 0 ? stream_cfm_steps : cfm_steps`).
             opts.cfm_steps       = params.cfm_steps;
+            opts.dump_mel_path   = params.dump_mel_path;
             opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn;
             if (!params.reference_audio.empty()) {
                 if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
@@ -2063,6 +2068,7 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             // Streaming chunks honour --stream-cfm-steps with --cfm-steps as
             // fallback when copts is set up further below.
             opts.cfm_steps       = params.cfm_steps;
+            opts.dump_mel_path   = params.dump_mel_path;
             opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn;
             if (!params.reference_audio.empty()) {
                 if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
diff --git a/tts-cpp/src/chatterbox_tts.cpp b/tts-cpp/src/chatterbox_tts.cpp
index 24c43b5ecf9..7c9514bfa81 100644
--- a/tts-cpp/src/chatterbox_tts.cpp
+++ b/tts-cpp/src/chatterbox_tts.cpp
@@ -87,6 +87,13 @@ struct scoped_timer {
 
 struct model_ctx {
     ggml_backend_t backend = nullptr;
+    // sched [backend, cpu_backend] routes ops the GPU backend can't run
+    // (GGML_OP_CONV_TRANSPOSE_1D in the HiFT vocoder) to CPU instead of asserting;
+    // stays a single-backend pass-through (cpu_backend null) when the primary is
+    // the CPU. Created lazily on the synthesis thread, not in load_s3gen_gguf —
+    // the latter runs in the preload thread and would race conditioning's init_cpu_backend().
+    mutable ggml_backend_t cpu_backend = nullptr;
+    mutable ggml_backend_sched_t sched = nullptr;
     ggml_context * ctx_w = nullptr;
     ggml_backend_buffer_t buffer_w = nullptr;
     std::map<std::string, ggml_tensor*> tensors;
@@ -101,6 +108,47 @@ struct model_ctx {
     float cfg_rate    = 0.0f;
 };
 
+// Allocate gf through the model scheduler so ops the primary backend can't run
+// are routed to CPU; creates the scheduler on first use and throws
+// std::runtime_error on failure. sched allocates at alloc time, so callers set
+// inputs AFTER s3gen_sched_alloc and before s3gen_sched_compute.
+static void s3gen_sched_alloc(const model_ctx & m, ggml_cgraph * gf) {
+    // Lazy, single-threaded creation: reached only from run_hift_decode on the
+    // synthesis thread, after preload + conditioning, so init_cpu_backend() races nothing.
+    if (!m.sched) {
+        // Mark weights USAGE_WEIGHTS so sched copies a GPU-resident weight to CPU
+        // when a CPU-routed op (conv_transpose_1d) consumes it. Done here
+        // (synthesis thread), not in load_s3gen_gguf (preload thread).
+        ggml_backend_buffer_set_usage(m.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        ggml_backend_t sched_backends[2] = { m.backend, nullptr };
+        int n_sched_backends = 1;
+        if (!::tts_cpp::detail::backend_is_cpu(m.backend)) {
+            m.cpu_backend = ::tts_cpp::detail::init_cpu_backend();
+            if (!m.cpu_backend) throw std::runtime_error("s3gen: init CPU backend for scheduler failed");
+            sched_backends[1] = m.cpu_backend;
+            n_sched_backends = 2;
+        }
+        // graph_size matches the HiFT graph's ggml_new_graph_custom capacity (it
+        // is the only graph routed through sched, and the largest S3Gen graph).
+        m.sched = ggml_backend_sched_new(sched_backends, /*bufts=*/nullptr,
+                                         n_sched_backends, /*graph_size=*/131072,
+                                         /*parallel=*/false, /*op_offload=*/false);
+        if (!m.sched) throw std::runtime_error("s3gen: ggml_backend_sched_new failed");
+    }
+    ggml_backend_sched_reset(m.sched);
+    if (!ggml_backend_sched_alloc_graph(m.sched, gf)) {
+        throw std::runtime_error("s3gen_sched_alloc: ggml_backend_sched_alloc_graph failed");
+    }
+}
+
+static void s3gen_sched_compute(const model_ctx & m, ggml_cgraph * gf) {
+    // Run gf on m.sched. CPU-routed ops use cpu_backend (GPU primary) or the primary
+    // itself (CPU-only model); set its thread count per call. Throws if the run fails.
+    ::tts_cpp::detail::backend_set_n_threads(m.cpu_backend ? m.cpu_backend : m.backend, g_n_threads);
+    const ggml_status st = ggml_backend_sched_graph_compute(m.sched, gf);
+    if (st == GGML_STATUS_FAILED || st == GGML_STATUS_ALLOC_FAILED) throw std::runtime_error("s3gen_sched_compute: ggml_backend_sched_graph_compute failed");
+}
+
 static ggml_backend_t s3gen_init_backend(int n_gpu_layers, bool verbose) {
     // GPU cascade is centralised in backend_selection.cpp's
     // `init_gpu_backend` (Adreno 700+ -> OpenCL, every other GPU ->
@@ -185,9 +233,12 @@ static void s3gen_model_cache_release() {
     if (!g_s3gen_cache_entry) return;
     model_ctx * m = g_s3gen_cache_entry->m.get();
     if (m) {
+        // Free the scheduler before the backends/buffers it references.
+        if (m->sched)    { ggml_backend_sched_free(m->sched);     m->sched    = nullptr; }
         if (m->buffer_w) { ggml_backend_buffer_free(m->buffer_w); m->buffer_w = nullptr; }
         if (m->ctx_w)    { ggml_free(m->ctx_w);                   m->ctx_w    = nullptr; }
         if (m->backend)  { ggml_backend_free(m->backend);         m->backend  = nullptr; }
+        if (m->cpu_backend) { ggml_backend_free(m->cpu_backend);  m->cpu_backend = nullptr; }
         m->tensors.clear();
     }
     g_s3gen_cache_entry.reset();
@@ -258,6 +309,12 @@ static model_ctx load_s3gen_gguf(const std::string & path, int n_gpu_layers, boo
         ggml_tensor * src = ggml_get_tensor(tmp_ctx, ggml_get_name(cur));
         ggml_backend_tensor_set(cur, ggml_get_data(src), 0, ggml_nbytes(src));
     }
+    // NOTE: ALL scheduler setup (m.sched, m.cpu_backend, and the buffer_w
+    // USAGE_WEIGHTS flag) is done lazily in s3gen_sched_alloc on the synthesis
+    // thread — NOT here. load_s3gen_gguf runs in the s3gen_preload background
+    // thread concurrently with the main thread's reference-audio conditioning;
+    // doing backend/buffer setup here disturbs that path
+    // (-> "mel_graph_run: init_cpu_backend failed").
 
     {
         int64_t k_mf = gguf_find_key(g, "s3gen.meanflow");
@@ -1908,7 +1965,10 @@ static std::vector<float> run_hift_decode(const model_ctx & m,
 
     graph_cache & cache = g_hift_graph_cache;
     const int64_t cache_key = pack_hift_key(T_mel, T_stft);
-    const bool build_graph = (cache.key != cache_key) || (cache.ctx == nullptr);
+    // Always rebuild: the scheduler's alloc_graph mutates node->src[] (the GPU<->CPU
+    // copies around the CPU-routed conv_transpose_1d), so a cached graph can't be
+    // reused. HiFT builds once per synth — negligible cost.
+    const bool build_graph = true;
     if (build_graph) {
         if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; }
         if (cache.ctx)    { ggml_free(cache.ctx);            cache.ctx    = nullptr; }
@@ -2062,9 +2122,8 @@ static std::vector<float> run_hift_decode(const model_ctx & m,
     y_trim = ggml_clamp(ctx, y_trim, -0.99f, 0.99f);
     ggml_set_name(y_trim, "wav"); ggml_set_output(y_trim);
     ggml_build_forward_expand(gf, y_trim);
-
-    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
-    ggml_gallocr_reserve(cache.allocr, gf);
+    // No gallocr here — this graph is allocated by the model scheduler
+    // (s3gen_sched_alloc below) so conv_transpose_1d can be routed to CPU.
     }  // end build_graph
 
     // Cached scaffolding (pulled outside build_graph too — when the graph
@@ -2073,7 +2132,28 @@ static std::vector<float> run_hift_decode(const model_ctx & m,
     const std::vector<float> & ik_data = cached_istft_kernel(n_fft);
     const std::vector<float> & ws_data = cached_window_sum(T_stft, n_fft, hop);
 
-    ggml_gallocr_alloc_graph(cache.allocr, gf);
+    // Capability-gate the scheduler. The [GPU,CPU] ggml_backend_sched exists only
+    // to route CONV_TRANSPOSE_1D to CPU because ggml-opencl / ggml-vulkan lack that
+    // kernel. A backend that can run every op in this graph itself (Metal, CUDA,
+    // CPU) does not need the scheduler — and the scheduler's graph-split aborts on
+    // the iOS Metal driver — so run those directly on the primary backend (the
+    // pre-scheduler path). Only use the scheduler when the primary backend can't
+    // run some op. Generic: asks the actual backend about the actual graph, with
+    // no platform / backend-name hardcoding, so iOS Metal is not regressed by the
+    // Android-motivated routing.
+    bool primary_runs_all = true;
+    const int hift_n_nodes = ggml_graph_n_nodes(gf);
+    for (int i = 0; i < hift_n_nodes; ++i) {
+        if (!ggml_backend_supports_op(m.backend, ggml_graph_node(gf, i))) { primary_runs_all = false; break; }
+    }
+    ggml_gallocr_t hift_allocr = nullptr;
+    if (primary_runs_all) {
+        hift_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
+        ggml_gallocr_reserve(hift_allocr, gf);
+        ggml_gallocr_alloc_graph(hift_allocr, gf);
+    } else {
+        s3gen_sched_alloc(m, gf);
+    }
     ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "mel_in"),  mel.data(),    0, mel.size()*sizeof(float));
     ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "s_in"),    s_stft.data(), 0, s_stft.size()*sizeof(float));
     ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "istft_k"), ik_data.data(),0, ik_data.size()*sizeof(float));
@@ -2100,11 +2180,19 @@ static std::vector<float> run_hift_decode(const model_ctx & m,
         ggml_backend_tensor_set(ggml_graph_get_tensor(gf, e.first.c_str()),
                                 inv.data(), 0, inv.size()*sizeof(float));
     }
-    compute(m.backend, gf);
+    if (primary_runs_all) {
+        compute(m.backend, gf);
+    } else {
+        s3gen_sched_compute(m, gf);
+    }
 
     ggml_tensor * y_trim_out = ggml_graph_get_tensor(gf, "wav");
     std::vector<float> wav(ggml_nelements(y_trim_out));
     ggml_backend_tensor_get(y_trim_out, wav.data(), 0, ggml_nbytes(y_trim_out));
+    // Free the direct-path allocr only AFTER reading the output — y_trim_out's
+    // data lives in this buffer (freeing it earlier is a use-after-free in the
+    // tensor_get above). nullptr on the scheduler path, so the guard covers both.
+    if (hift_allocr) ggml_gallocr_free(hift_allocr);
     return wav;
 }
 
diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp
index cc87c09e084..8e45f8191d9 100644
--- a/tts-cpp/src/supertonic_engine.cpp
+++ b/tts-cpp/src/supertonic_engine.cpp
@@ -9,6 +9,7 @@
 #include <cmath>
 #include <cstring>
 #include <cstdint>
+#include <cstdlib>
 #include <filesystem>
 #include <stdexcept>
 
@@ -135,7 +136,8 @@ struct Engine::Impl {
             ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir);
         }
 
-        if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, false)) {
+        if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers,
+                                  std::getenv("QVAC_VERBOSE") != nullptr)) {
             throw std::runtime_error("Supertonic Engine: failed to load GGUF: " +
                                      opts.model_gguf_path);
         }
diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp
index 1c33ebe41e7..eb4420c38a4 100644
--- a/tts-cpp/src/supertonic_gguf.cpp
+++ b/tts-cpp/src/supertonic_gguf.cpp
@@ -212,6 +212,24 @@ void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * grap
     ggml_backend_graph_compute(model.backend, graph);
 }
 
+void supertonic_sched_alloc(const supertonic_model & model, ggml_cgraph * graph) {
+    ggml_backend_sched_reset(model.sched);  // reset the scheduler before allocating this graph
+    if (!ggml_backend_sched_alloc_graph(model.sched, graph)) {  // callers set inputs only after this succeeds
+        throw std::runtime_error("supertonic_sched_alloc: ggml_backend_sched_alloc_graph failed");
+    }
+}
+
+void supertonic_sched_compute(const supertonic_model & model, ggml_cgraph * graph) {
+    // Run graph on model.sched. CPU work runs on cpu_backend (GPU primary) or on the
+    // primary itself (CPU-only model); its thread count is refreshed per call.
+    ggml_backend_t cpu_b = model.cpu_backend ? model.cpu_backend : model.backend;
+    if (model.n_threads > 0) ::tts_cpp::detail::backend_set_n_threads(cpu_b, model.n_threads);
+    const ggml_status st = ggml_backend_sched_graph_compute(model.sched, graph);
+    if (st == GGML_STATUS_FAILED || st == GGML_STATUS_ALLOC_FAILED) {  // outputs are undefined: report, don't return garbage
+        throw std::runtime_error("supertonic_sched_compute: ggml_backend_sched_graph_compute failed");
+    }
+}
+
 static void bind_vocoder_weights(supertonic_model & model) {
     auto & v = model.vocoder;
     v.normalizer_scale = require_source_tensor(model, "vocoder:tts.ttl.normalizer.scale");
@@ -310,6 +328,15 @@ bool load_supertonic_gguf(const std::string & path,
         model.buffer_w = ggml_backend_alloc_ctx_tensors(model.ctx_w, model.backend);
         if (!model.buffer_w) throw std::runtime_error("ggml_backend_alloc_ctx_tensors failed");
 
+        // Mark the weight buffer as WEIGHTS so the scheduler treats these
+        // tensors as immovable and inserts GPU->CPU copies when a CPU-only op
+        // (the GGML_OP_CUSTOM kernels in the vector estimator / vocoder)
+        // consumes them. Without this they default to USAGE_ANY: sched's
+        // weight-aware split/copy path (ggml-backend.cpp) does not fire, some
+        // weights stay on the GPU buffer, and the CPU custom op dereferences a
+        // device offset -> SIGSEGV. Standard llama.cpp/whisper.cpp pattern.
+        ggml_backend_buffer_set_usage(model.buffer_w, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
         for (ggml_tensor * cur = ggml_get_first_tensor(model.ctx_w);
              cur;
              cur = ggml_get_next_tensor(model.ctx_w, cur)) {
@@ -348,6 +375,31 @@ bool load_supertonic_gguf(const std::string & path,
         }
 
         bind_vocoder_weights(model);
+
+        // Build the scheduler. With a GPU primary, add a CPU backend so
+        // ops the GPU can't run (GGML_OP_CUSTOM, and any FA the driver
+        // rejects) are routed to CPU rather than silently skipped. With a
+        // CPU primary, the sched is a single-backend pass-through (no
+        // second CPU backend created).
+        {
+            ggml_backend_t backends[2] = { model.backend, nullptr };
+            int n_backends = 1;
+            if (!::tts_cpp::detail::backend_is_cpu(model.backend)) {
+                model.cpu_backend = ::tts_cpp::detail::init_cpu_backend();
+                if (!model.cpu_backend) {
+                    throw std::runtime_error("init CPU backend for scheduler failed");
+                }
+                backends[1] = model.cpu_backend;
+                n_backends = 2;
+            }
+            model.sched = ggml_backend_sched_new(backends, /*bufts=*/nullptr,
+                                                 n_backends, /*graph_size=*/ 8192,
+                                                 /*parallel=*/ false,
+                                                 /*op_offload=*/ false);
+            if (!model.sched) {
+                throw std::runtime_error("ggml_backend_sched_new failed");
+            }
+        }
     } catch (const std::exception & e) {
         fprintf(stderr, "load_supertonic_gguf: %s\n", e.what());
         gguf_free(gguf_ctx);
@@ -374,6 +426,11 @@ void free_supertonic_model(supertonic_model & model) {
     if (model.generation_id != 0) {
         unregister_supertonic_alive(model.generation_id);
     }
+    // Free the scheduler before the backends/buffers it references.
+    if (model.sched) {
+        ggml_backend_sched_free(model.sched);
+        model.sched = nullptr;
+    }
     if (model.buffer_w) {
         ggml_backend_buffer_free(model.buffer_w);
         model.buffer_w = nullptr;
@@ -382,6 +439,10 @@ void free_supertonic_model(supertonic_model & model) {
         ggml_backend_free(model.backend);
         model.backend = nullptr;
     }
+    if (model.cpu_backend) {
+        ggml_backend_free(model.cpu_backend);
+        model.cpu_backend = nullptr;
+    }
     if (model.ctx_w) {
         ggml_free(model.ctx_w);
         model.ctx_w = nullptr;
diff --git a/tts-cpp/src/supertonic_internal.h b/tts-cpp/src/supertonic_internal.h
index f0587a72cff..7e157f388f8 100644
--- a/tts-cpp/src/supertonic_internal.h
+++ b/tts-cpp/src/supertonic_internal.h
@@ -74,6 +74,14 @@ struct supertonic_model {
     uint64_t generation_id = 0;
     int n_threads = 0;
     ggml_backend_t backend = nullptr;
+    // Scheduler so ops the GPU backend can't run (notably GGML_OP_CUSTOM
+    // CPU kernels in the vector estimator / vocoder) auto-route to CPU
+    // instead of being silently skipped on a single backend. Always
+    // created: [backend, cpu_backend] for a GPU primary, or a degenerate
+    // [backend] when the primary is itself CPU. cpu_backend stays null in
+    // the CPU-only case (no second CPU backend).
+    ggml_backend_t cpu_backend = nullptr;
+    ggml_backend_sched_t sched = nullptr;
     ggml_context * ctx_w = nullptr;
     ggml_backend_buffer_t buffer_w = nullptr;
 
@@ -94,6 +102,16 @@ void free_supertonic_model(supertonic_model & model);
 void supertonic_set_n_threads(supertonic_model & model, int n_threads);
 void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph);
 
+// Scheduler-based alloc + compute, used by stages migrated off the per-graph
+// ggml_gallocr. Pairing contract at each call site:
+//   supertonic_sched_alloc(model, gf);            // reset + allocate via sched
+//   ggml_backend_tensor_set(input_leaf, ...);     // inputs now have memory
+//   supertonic_sched_compute(model, gf);          // run (routes customs -> CPU)
+// Build a fresh graph for every alloc: ggml_backend_sched_alloc_graph rewrites
+// node->src[] when it inserts cross-backend copies, so graphs are not reusable.
+void supertonic_sched_alloc(const supertonic_model & model, ggml_cgraph * graph);
+void supertonic_sched_compute(const supertonic_model & model, ggml_cgraph * graph);
+
 ggml_tensor * require_tensor(const supertonic_model & model, const std::string & name);
 ggml_tensor * require_source_tensor(const supertonic_model & model, const std::string & source_name);
 
diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp
index b4da8328f91..de60ae8c3e1 100644
--- a/tts-cpp/src/supertonic_vector_estimator.cpp
+++ b/tts-cpp/src/supertonic_vector_estimator.cpp
@@ -62,13 +62,13 @@ void profile_vector_compute(const supertonic_model & model,
                             int step,
                             const char * island) {
     if (!vector_profile_enabled()) {
-        supertonic_graph_compute(model, graph);
+        supertonic_sched_compute(model, graph);
         return;
     }
     auto & state = vector_profile();
     const auto t0 = std::chrono::steady_clock::now();
     const double pre_ms = std::chrono::duration<double, std::milli>(t0 - state.last).count();
-    supertonic_graph_compute(model, graph);
+    supertonic_sched_compute(model, graph);
     const auto t1 = std::chrono::steady_clock::now();
     const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
     state.last = t1;
@@ -686,12 +686,9 @@ void build_text_attention_cache(vector_text_attention_cache & cache,
     ggml_set_name(out, "vector_attn_out"); ggml_set_output(out);
     ggml_build_forward_expand(cache.gf, out);
 
-    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector text attention cache failed");
-    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
-        throw std::runtime_error("ggml_gallocr_reserve vector text attention cache failed");
-    }
-    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    // Allocation is per-call via the model scheduler (supertonic_sched_alloc
+    // in run), which routes GGML_OP_CUSTOM ops to CPU. No per-cache gallocr;
+    // cache.allocr stays null (free_*_cache's safe_gallocr_free no-ops on it).
 }
 
 std::vector<float> run_text_attention_cache(vector_text_attention_cache & cache,
@@ -708,12 +705,11 @@ std::vector<float> run_text_attention_cache(vector_text_attention_cache & cache,
                                             int current_step,
                                             const char * island,
                                             std::vector<float> * ctx_trace) {
-    if (cache.model != &model || cache.generation_id != model.generation_id ||
-        cache.q_len != q_len || cache.kv_len != kv_len ||
-        cache.n_heads != n_heads || cache.head_dim != head_dim ||
-        cache.out_w_source != out_w_source || cache.out_b_source != out_b_source) {
-        build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source);
-    }
+    // Rebuild every call: ggml_backend_sched_alloc_graph mutates node->src[] when it
+    // inserts cross-backend GPU<->CPU copies, corrupting a graph reused across denoise
+    // steps. The build costs microseconds against milliseconds of compute: negligible.
+    build_text_attention_cache(cache, model, q_len, kv_len, n_heads, head_dim, out_w_source, out_b_source);
+    supertonic_sched_alloc(model, cache.gf);
     ggml_backend_tensor_set(cache.q_tc_in, q_tc.data(), 0, q_tc.size()*sizeof(float));
     ggml_backend_tensor_set(cache.k_tc_in, k_tc.data(), 0, k_tc.size()*sizeof(float));
     ggml_backend_tensor_set(cache.v_tc_in, v_tc.data(), 0, v_tc.size()*sizeof(float));
@@ -869,12 +865,8 @@ void build_group_graph_cache(vector_group_graph_cache & cache,
     ggml_set_name(k, k_name.c_str()); ggml_set_output(k); ggml_build_forward_expand(cache.gf, k);
     ggml_set_name(v, v_name.c_str()); ggml_set_output(v); ggml_build_forward_expand(cache.gf, v);
 
-    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector group cache failed");
-    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
-        throw std::runtime_error("ggml_gallocr_reserve vector group cache failed");
-    }
-    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    // Allocation is per-call: run_group_graph_cache calls supertonic_sched_alloc
+    // (model scheduler, routes GGML_OP_CUSTOM ops to CPU). No per-cache gallocr.
 }
 
 vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache,
@@ -899,20 +891,14 @@ vector_group_graph_result run_group_graph_cache(vector_group_graph_cache & cache
                                                 const std::string & v_name,
                                                 const char * island,
                                                 std::vector<supertonic_trace_tensor> * trace) {
-    if (cache.model != &model || cache.generation_id != model.generation_id ||
-        cache.L != L || cache.C != C || cache.text_len != text_len ||
-        cache.group != group || cache.conv_block != conv_block ||
-        cache.linear_block != linear_block || cache.post_block != post_block ||
-        cache.trace_outputs != (trace != nullptr) ||
-        cache.matmul_source != matmul_source ||
-        cache.q_matmul_source != q_matmul_source || cache.k_matmul_source != k_matmul_source ||
-        cache.v_matmul_source != v_matmul_source) {
-        build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block,
-                                text_len, q_matmul_source, k_matmul_source, v_matmul_source,
-                                q_name, k_name, v_name,
-                                trace != nullptr);
-    }
+    // Rebuild every call — scheduler alloc corrupts a reused graph; see
+    // run_text_attention_cache for the full rationale.
+    build_group_graph_cache(cache, model, L, C, group, conv_block, linear_block, matmul_source, post_block,
+                            text_len, q_matmul_source, k_matmul_source, v_matmul_source,
+                            q_name, k_name, v_name,
+                            trace != nullptr);
     std::vector<float> x_raw = pack_time_channel_for_ggml(x_tc, L, C);
+    supertonic_sched_alloc(model, cache.gf);
     ggml_backend_tensor_set(cache.x_in, x_raw.data(), 0, x_raw.size()*sizeof(float));
     ggml_backend_tensor_set(cache.temb_in, temb.data(), 0, temb.size()*sizeof(float));
     ggml_backend_tensor_set(cache.text_in, text_lc_host, 0, (size_t) text_len * 256 * sizeof(float));
@@ -1069,12 +1055,8 @@ void build_res_style_qkv_cache(vector_res_style_qkv_cache & cache,
     ggml_set_name(sk, k_name.c_str()); ggml_set_output(sk); ggml_build_forward_expand(cache.gf, sk);
     ggml_set_name(sv, v_name.c_str()); ggml_set_output(sv); ggml_build_forward_expand(cache.gf, sv);
 
-    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new res-style-qkv failed");
-    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
-        throw std::runtime_error("ggml_gallocr_reserve res-style-qkv failed");
-    }
-    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    // Allocation is per-call: run_res_style_qkv_cache calls supertonic_sched_alloc
+    // (model scheduler, routes GGML_OP_CUSTOM ops to CPU). No per-cache gallocr.
 }
 
 vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache & cache,
@@ -1101,19 +1083,15 @@ vector_res_style_qkv_result run_res_style_qkv_cache(vector_res_style_qkv_cache &
                                                     const char * island,
                                                     std::vector<supertonic_trace_tensor> * trace) {
     const bool want_trace = trace != nullptr;
-    if (cache.model != &model || cache.generation_id != model.generation_id ||
-        cache.L != L || cache.C != C ||
-        cache.norm_block != norm_block || cache.post_block != post_block ||
-        cache.style_block != style_block || cache.trace_outputs != want_trace ||
-        cache.q_matmul_source != q_matmul_source || cache.k_matmul_source != k_matmul_source ||
-        cache.v_matmul_source != v_matmul_source) {
-        build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block,
-                                  q_matmul_source, k_matmul_source, v_matmul_source,
-                                  residual_name, norm_name, post_name, q_name, k_name, v_name,
-                                  want_trace);
-    }
+    // Rebuild every call — scheduler alloc corrupts a reused graph; see
+    // run_text_attention_cache for the full rationale.
+    build_res_style_qkv_cache(cache, model, L, C, norm_block, post_block, style_block,
+                              q_matmul_source, k_matmul_source, v_matmul_source,
+                              residual_name, norm_name, post_name, q_name, k_name, v_name,
+                              want_trace);
     std::vector<float> lhs_raw = pack_time_channel_for_ggml(lhs_tc, L, C);
     std::vector<float> rhs_raw = pack_time_channel_for_ggml(rhs_tc, L, C);
+    supertonic_sched_alloc(model, cache.gf);
     ggml_backend_tensor_set(cache.lhs_in, lhs_raw.data(), 0, lhs_raw.size() * sizeof(float));
     ggml_backend_tensor_set(cache.rhs_in, rhs_raw.data(), 0, rhs_raw.size() * sizeof(float));
     ggml_backend_tensor_set(cache.style_v_in, style_v_raw.data(), 0, style_v_raw.size() * sizeof(float));
@@ -1273,12 +1251,8 @@ void build_tail_graph_cache(vector_tail_graph_cache & cache,
         ggml_build_forward_expand(cache.gf, next);
     }
 
-    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector tail cache failed");
-    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
-        throw std::runtime_error("ggml_gallocr_reserve vector tail cache failed");
-    }
-    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    // Allocation is per-call: run_tail_graph_cache calls supertonic_sched_alloc
+    // (model scheduler, routes GGML_OP_CUSTOM ops to CPU). No per-cache gallocr.
 }
 
 std::vector<float> run_tail_graph_cache(vector_tail_graph_cache & cache,
@@ -1292,12 +1266,9 @@ std::vector<float> run_tail_graph_cache(vector_tail_graph_cache & cache,
                                         int current_step,
                                         int total_steps,
                                         std::vector<supertonic_trace_tensor> * trace) {
-    if (cache.model != &model || cache.generation_id != model.generation_id ||
-        cache.L != L || cache.C != C ||
-        cache.Cin != Cin || cache.total_steps != total_steps ||
-        cache.trace_outputs != (trace != nullptr)) {
-        build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr);
-    }
+    // Rebuild every call — scheduler alloc corrupts a reused graph; see
+    // run_text_attention_cache for the full rationale.
+    build_tail_graph_cache(cache, model, L, C, Cin, total_steps, trace != nullptr);
     std::vector<float> tail_in_raw = pack_time_channel_for_ggml(x_tc, L, C);
     std::vector<float> noise_tc((size_t)L*Cin);
     for (int t = 0; t < L; ++t) {
@@ -1306,6 +1277,7 @@ std::vector<float> run_tail_graph_cache(vector_tail_graph_cache & cache,
         }
     }
     std::vector<float> noise_raw = pack_time_channel_for_ggml(noise_tc, L, Cin);
+    supertonic_sched_alloc(model, cache.gf);
     ggml_backend_tensor_set(cache.tail_in, tail_in_raw.data(), 0, tail_in_raw.size()*sizeof(float));
     ggml_backend_tensor_set(cache.tail_mask, latent_mask, 0, (size_t)L*sizeof(float));
     ggml_backend_tensor_set(cache.tail_noise, noise_raw.data(), 0, noise_raw.size()*sizeof(float));
@@ -2108,17 +2080,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
         ggml_set_output(v_t);
         ggml_build_forward_expand(gf, v_t);
 
-        ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        if (!allocr) {
-            ggml_free(ctx);
-            throw std::runtime_error("ggml_gallocr_new failed");
-        }
-        if (!ggml_gallocr_reserve(allocr, gf)) {
-            ggml_gallocr_free(allocr);
-            ggml_free(ctx);
-            throw std::runtime_error("ggml_gallocr_reserve failed");
-        }
-        ggml_gallocr_alloc_graph(allocr, gf);
+        supertonic_sched_alloc(model, gf);
 
         ggml_backend_tensor_set(x, noisy_latent, 0, (size_t) L * Cin * sizeof(float));
         ggml_backend_tensor_set(mask, latent_mask, 0, (size_t) L * sizeof(float));
@@ -2217,17 +2179,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
             require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.5.norm.norm.bias"));
         ggml_set_name(style_norm, "ve_style0_norm"); ggml_set_output(style_norm);
         ggml_build_forward_expand(srgf, style_norm);
-        ggml_gallocr_t srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        if (!srallocr) {
-            ggml_free(srctx);
-            throw std::runtime_error("ggml_gallocr_new style residual failed");
-        }
-        if (!ggml_gallocr_reserve(srallocr, srgf)) {
-            ggml_gallocr_free(srallocr);
-            ggml_free(srctx);
-            throw std::runtime_error("ggml_gallocr_reserve style residual failed");
-        }
-        ggml_gallocr_alloc_graph(srallocr, srgf);
+        supertonic_sched_alloc(model, srgf);
         std::vector<float> style_out_raw = pack_time_channel_for_ggml(style_out_ggml, L, C);
         std::vector<float> style_lhs_raw = pack_time_channel_for_ggml(post_ggml, L, C);
         ggml_backend_tensor_set(style_out_in, style_out_raw.data(), 0, style_out_raw.size()*sizeof(float));
@@ -2236,7 +2188,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
         PUSH_GGML_TRACE({"ve_style0_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(srgf, "ve_style0_residual"))});
         std::vector<float> style_norm_ggml = tensor_to_time_channel(ggml_graph_get_tensor(srgf, "ve_style0_norm"));
         PUSH_GGML_TRACE({"ve_style0_norm", {L, C}, style_norm_ggml});
-        ggml_gallocr_free(srallocr);
         ggml_free(srctx);
 
         thread_local vector_group_graph_cache g1_group_cache;
@@ -2321,17 +2272,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
             require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.11.norm.norm.bias"));
         ggml_set_name(g1_style_norm, "ve_g1_style_norm"); ggml_set_output(g1_style_norm);
         ggml_build_forward_expand(g1srgf, g1_style_norm);
-        ggml_gallocr_t g1srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        if (!g1srallocr) {
-            ggml_free(g1srctx);
-            throw std::runtime_error("ggml_gallocr_new group1 style residual failed");
-        }
-        if (!ggml_gallocr_reserve(g1srallocr, g1srgf)) {
-            ggml_gallocr_free(g1srallocr);
-            ggml_free(g1srctx);
-            throw std::runtime_error("ggml_gallocr_reserve group1 style residual failed");
-        }
-        ggml_gallocr_alloc_graph(g1srallocr, g1srgf);
+        supertonic_sched_alloc(model, g1srgf);
         std::vector<float> g1_style_lhs_raw = pack_time_channel_for_ggml(g1_block10, L, C);
         std::vector<float> g1_style_out_raw = pack_time_channel_for_ggml(g1_style_out, L, C);
         ggml_backend_tensor_set(g1_style_lhs, g1_style_lhs_raw.data(), 0, g1_style_lhs_raw.size()*sizeof(float));
@@ -2340,7 +2281,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
         PUSH_GGML_TRACE({"ve_g1_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g1srgf, "ve_g1_style_residual"))});
         std::vector<float> g1_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g1srgf, "ve_g1_style_norm"));
         PUSH_GGML_TRACE({"ve_g1_style_norm", {L, C}, g1_style_norm_vec});
-        ggml_gallocr_free(g1srallocr);
         ggml_free(g1srctx);
 
         thread_local vector_group_graph_cache g2_group_cache;
@@ -2425,17 +2365,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
             require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.17.norm.norm.bias"));
         ggml_set_name(g2_style_norm, "ve_g2_style_norm"); ggml_set_output(g2_style_norm);
         ggml_build_forward_expand(g2srgf, g2_style_norm);
-        ggml_gallocr_t g2srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        if (!g2srallocr) {
-            ggml_free(g2srctx);
-            throw std::runtime_error("ggml_gallocr_new group2 style residual failed");
-        }
-        if (!ggml_gallocr_reserve(g2srallocr, g2srgf)) {
-            ggml_gallocr_free(g2srallocr);
-            ggml_free(g2srctx);
-            throw std::runtime_error("ggml_gallocr_reserve group2 style residual failed");
-        }
-        ggml_gallocr_alloc_graph(g2srallocr, g2srgf);
+        supertonic_sched_alloc(model, g2srgf);
         std::vector<float> g2_style_lhs_raw = pack_time_channel_for_ggml(g2_block16, L, C);
         std::vector<float> g2_style_out_raw = pack_time_channel_for_ggml(g2_style_out, L, C);
         ggml_backend_tensor_set(g2_style_lhs, g2_style_lhs_raw.data(), 0, g2_style_lhs_raw.size()*sizeof(float));
@@ -2444,7 +2374,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
         PUSH_GGML_TRACE({"ve_g2_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g2srgf, "ve_g2_style_residual"))});
         std::vector<float> g2_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g2srgf, "ve_g2_style_norm"));
         PUSH_GGML_TRACE({"ve_g2_style_norm", {L, C}, g2_style_norm_vec});
-        ggml_gallocr_free(g2srallocr);
         ggml_free(g2srctx);
 
         thread_local vector_group_graph_cache g3_group_cache;
@@ -2529,17 +2458,7 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
             require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.23.norm.norm.bias"));
         ggml_set_name(g3_style_norm, "ve_g3_style_norm"); ggml_set_output(g3_style_norm);
         ggml_build_forward_expand(g3srgf, g3_style_norm);
-        ggml_gallocr_t g3srallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        if (!g3srallocr) {
-            ggml_free(g3srctx);
-            throw std::runtime_error("ggml_gallocr_new group3 style residual failed");
-        }
-        if (!ggml_gallocr_reserve(g3srallocr, g3srgf)) {
-            ggml_gallocr_free(g3srallocr);
-            ggml_free(g3srctx);
-            throw std::runtime_error("ggml_gallocr_reserve group3 style residual failed");
-        }
-        ggml_gallocr_alloc_graph(g3srallocr, g3srgf);
+        supertonic_sched_alloc(model, g3srgf);
         std::vector<float> g3_style_lhs_raw = pack_time_channel_for_ggml(g3_block22, L, C);
         std::vector<float> g3_style_out_raw = pack_time_channel_for_ggml(g3_style_out, L, C);
         ggml_backend_tensor_set(g3_style_lhs, g3_style_lhs_raw.data(), 0, g3_style_lhs_raw.size()*sizeof(float));
@@ -2548,7 +2467,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
         PUSH_GGML_TRACE({"ve_g3_style_residual", {L, C}, tensor_to_time_channel(ggml_graph_get_tensor(g3srgf, "ve_g3_style_residual"))});
         std::vector<float> g3_style_norm_vec = tensor_to_time_channel(ggml_graph_get_tensor(g3srgf, "ve_g3_style_norm"));
         PUSH_GGML_TRACE({"ve_g3_style_norm", {L, C}, g3_style_norm_vec});
-        ggml_gallocr_free(g3srallocr);
         ggml_free(g3srctx);
 
         thread_local vector_tail_graph_cache tail_cache;
@@ -2557,7 +2475,6 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
             include_ggml_trace ? &ggml_trace : nullptr);
         if (next_latent_tc_out) *next_latent_tc_out = next_latent_tc;
 
-        ggml_gallocr_free(allocr);
         ggml_free(ctx);
         profile_vector_step_end(current_step);
         if (error) error->clear();
diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp
index 5fc86261d0c..3ed254d661e 100644
--- a/tts-cpp/src/supertonic_vocoder.cpp
+++ b/tts-cpp/src/supertonic_vocoder.cpp
@@ -420,12 +420,8 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache,
     ggml_build_forward_expand(cache.gf, x);
     cache.wav = x;
 
-    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vocoder cache failed");
-    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
-        throw std::runtime_error("ggml_gallocr_reserve vocoder cache failed");
-    }
-    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    // Allocation is per-call: supertonic_vocoder_forward_ggml calls supertonic_sched_alloc
+    // (model scheduler, routes GGML_OP_CUSTOM ops to CPU). No gallocr.
 }
 
 void linear1x1(const std::vector<float> & x, int L, int IC,
@@ -726,18 +722,18 @@ bool supertonic_vocoder_forward_ggml(const supertonic_model & model,
         profile_vocoder_checkpoint("bn_params", profile_last);
 
         thread_local vocoder_graph_cache cache;
-        if (cache.model != &model || cache.generation_id != model.generation_id ||
-            cache.latent_len != latent_len) {
-            build_supertonic_vocoder_cache(cache, model, latent_len);
-        }
+        // Rebuild every call: the scheduler's alloc_graph mutates node->src[], so a cached
+        // graph can't be reused (see run_text_attention_cache in supertonic_vector_estimator.cpp).
+        build_supertonic_vocoder_cache(cache, model, latent_len);
         profile_vocoder_checkpoint("graph_cache", profile_last);
 
+        supertonic_sched_alloc(model, cache.gf);
         ggml_backend_tensor_set(cache.x_in, x_in.data(), 0, x_in.size() * sizeof(float));
         ggml_backend_tensor_set(cache.bn_scale, bn_scale.data(), 0, bn_scale.size() * sizeof(float));
         ggml_backend_tensor_set(cache.bn_shift, bn_shift.data(), 0, bn_shift.size() * sizeof(float));
         profile_vocoder_checkpoint("set_inputs", profile_last);
 
-        supertonic_graph_compute(model, cache.gf);
+        supertonic_sched_compute(model, cache.gf);
         profile_vocoder_checkpoint("compute", profile_last);
         wav_out = ggml_tensor_to_time_channel(cache.wav);
         profile_vocoder_checkpoint("readback", profile_last);
@@ -934,17 +930,7 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model,
         ggml_set_output(cur);
         ggml_build_forward_expand(gf, cur);
 
-        ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        if (!allocr) {
-            ggml_free(ctx);
-            throw std::runtime_error("ggml_gallocr_new failed");
-        }
-        if (!ggml_gallocr_reserve(allocr, gf)) {
-            ggml_gallocr_free(allocr);
-            ggml_free(ctx);
-            throw std::runtime_error("ggml_gallocr_reserve failed");
-        }
-        ggml_gallocr_alloc_graph(allocr, gf);
+        supertonic_sched_alloc(model, gf);
 
         std::vector<float> x_host = unpack_latent_ggml_layout(model, latent, latent_len);
         ggml_backend_tensor_set(x_in, x_host.data(), 0, x_host.size() * sizeof(float));
@@ -959,7 +945,7 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model,
         }
         ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "trace_bn_scale"), bn_scale_host.data(), 0, bn_scale_host.size() * sizeof(float));
         ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "trace_bn_shift"), bn_shift_host.data(), 0, bn_shift_host.size() * sizeof(float));
-        supertonic_graph_compute(model, gf);
+        supertonic_sched_compute(model, gf);
 
         trace_out.push_back({"unpack", {T0, C_latent}, unpack_latent_scalar(model, latent, latent_len)});
         trace_out.push_back({"denorm", {T0, C_latent}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "denorm"))});
@@ -978,7 +964,6 @@ bool supertonic_vocoder_trace_ggml(const supertonic_model & model,
         trace_out.push_back({"head1", {T0, (int) model.vocoder.head1_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "head1"))});
         trace_out.push_back({"prelu", {T0, (int) model.vocoder.head1_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "prelu"))});
         trace_out.push_back({"wav", {T0, (int) model.vocoder.head2_w->ne[2]}, ggml_tensor_to_time_channel(ggml_graph_get_tensor(gf, "wav"))});
-        ggml_gallocr_free(allocr);
         ggml_free(ctx);
         if (error) error->clear();
         return true;