From 2ead7edcf53d6e52b12231eabbd092eb2d42200d Mon Sep 17 00:00:00 2001
From: Zbigniew Herman <zbigniew.herman@tether.io>
Date: Tue, 5 May 2026 10:08:22 +0200
Subject: [PATCH 1/6] QVAC-18422 [TTS GGML] Optimize cpp backend multilingual
 for CPU

---
 CMakeLists.txt                  |  11 +
 src/chatterbox_tts.cpp          | 100 ++++++++-
 src/chatterbox_tts_test_hooks.h |  66 ++++++
 src/test_cpu_caches.cpp         | 366 ++++++++++++++++++++++++++++++++
 4 files changed, 534 insertions(+), 9 deletions(-)
 create mode 100644 src/chatterbox_tts_test_hooks.h
 create mode 100644 src/test_cpu_caches.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6401476..8c01ff7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -205,6 +205,17 @@ if (TTS_CPP_BUILD_TESTS)
     target_link_libraries(test-streaming PRIVATE ggml)
     target_include_directories(test-streaming PRIVATE ggml/include src include)
 
+    # CPU-side persistent-cache validation (QVAC-18422).
+    # Exercises the time_mlp / time_emb / cfm_estimator / weight_mirror
+    # caches that amortise per-synth overhead on the multilingual CPU
+    # path.  Links the chatterbox_tts.cpp directly so it can reach the
+    # internal test-hook entrypoints.
+    add_executable(test-cpu-caches
+        src/test_cpu_caches.cpp
+        src/chatterbox_tts.cpp)
+    target_link_libraries(test-cpu-caches PRIVATE ggml)
+    target_include_directories(test-cpu-caches PRIVATE ggml/include src include)
+
     add_executable(test-metal-ops src/test_metal_ops.cpp)
     target_link_libraries(test-metal-ops PRIVATE ggml)
     target_include_directories(test-metal-ops PRIVATE ggml/include src)
diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp
index 22c00f6..1bb139c 100644
--- a/src/chatterbox_tts.cpp
+++ b/src/chatterbox_tts.cpp
@@ -27,6 +27,7 @@
 #include "ggml-cpu.h"
 #include "gguf.h"
 #include "npy.h"
+#include "chatterbox_tts_test_hooks.h"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -179,6 +180,15 @@ static std::unique_ptr<s3gen_cache_entry>    g_s3gen_cache_entry;
 static double                                g_s3gen_cache_last_load_ms = 0.0;
 }  // namespace
 
+// Forward declaration: clear all per-synth caches.  The persistent
+// graph caches (cfm_estimator + time_mlp scaffolding) and the CPU
+// weight mirrors are tied to the model's backend, so they must be
+// torn down BEFORE ggml_backend_free or the gallocators / backend
+// buffers freed there would be released against a dead device.
+//
+// Defined further down (after cfm_estimator_cache is in scope).
+static void s3gen_release_synth_caches();
+
 // Release any cached model_ctx (frees its backend buffer, ggml context and
 // backend).  Must run before the ggml-metal / ggml-cuda / ggml-vulkan dylib
 // tears down its static device list; otherwise their static destructors hit
@@ -186,6 +196,13 @@ static double                                g_s3gen_cache_last_load_ms = 0.0;
 // orphan backend buffer).  We register it with atexit() on first cache
 // insertion so it runs before process-exit dylib finalisers.
 static void s3gen_model_cache_release() {
+    // Tear down the per-synth caches first so any gallocrs they hold
+    // (cfm_estimator_cache::allocr) are freed against the still-alive
+    // backend, then drop the model.  Reverse order would crash on
+    // Vulkan/Metal/CUDA where ggml_gallocr_free against a freed
+    // backend asserts.
+    s3gen_release_synth_caches();
+
     std::lock_guard<std::mutex> lk(g_s3gen_cache_mu);
     // QVAC-17872 round-HIFT + round 2: tear down every persistent host-side
     // cache BEFORE freeing the backend.  The graph caches own
@@ -206,16 +223,18 @@ static void s3gen_model_cache_release() {
 }
 
 static model_ctx * s3gen_model_cache_get(const std::string & path, int n_gpu_layers, bool verbose) {
-    std::lock_guard<std::mutex> lk(g_s3gen_cache_mu);
-    if (g_s3gen_cache_entry &&
-        g_s3gen_cache_entry->path == path &&
-        g_s3gen_cache_entry->gpu  == n_gpu_layers) {
-        if (verbose) {
-            fprintf(stderr, "  %zu tensors (cached — skip GGUF load)\n",
-                    g_s3gen_cache_entry->m->tensors.size());
+    {
+        std::lock_guard<std::mutex> lk(g_s3gen_cache_mu);
+        if (g_s3gen_cache_entry &&
+            g_s3gen_cache_entry->path == path &&
+            g_s3gen_cache_entry->gpu  == n_gpu_layers) {
+            if (verbose) {
+                fprintf(stderr, "  %zu tensors (cached — skip GGUF load)\n",
+                        g_s3gen_cache_entry->m->tensors.size());
+            }
+            g_s3gen_cache_last_load_ms = 0.0;
+            return g_s3gen_cache_entry->m.get();
         }
-        g_s3gen_cache_last_load_ms = 0.0;
-        return g_s3gen_cache_entry->m.get();
     }
     // QVAC-17872 round-HIFT + round 2: backend swap (different path or
     // n_gpu_layers).  Tear down every persistent cache against the OLD
@@ -1244,6 +1263,20 @@ static const float * cached_cpu_weights_f32(const ggml_tensor * t) {
     }
 }
 
+// QVAC-18422: bit-cast cache key helpers used by the test-hooks bridge
+// to query g_time_mlp_results / g_time_emb_results without re-deriving
+// the (uint32_t / uint64_t) keys that compute_time_mlp_cached and
+// compute_time_emb_cached compute inline above.  Defined here so the
+// test_hooks namespace at the bottom of the file can call them.
+static uint32_t g_float_bits(float t_val) {
+    uint32_t bits;
+    std::memcpy(&bits, &t_val, sizeof(bits));
+    return bits;
+}
+static uint64_t g_float_pair_bits(float t_val, float r_val) {
+    return ((uint64_t) g_float_bits(t_val) << 32) | (uint64_t) g_float_bits(r_val);
+}
+
 // QVAC-17872 round 2: definition of s3gen_release_synth_caches (forward-
 // declared near s3gen_model_cache_release).  Defined here once the
 // graph_cache + cfm_estimator_cache structs and globals are all visible.
@@ -2779,3 +2812,52 @@ int s3gen_preload(const std::string & s3gen_gguf_path, int n_gpu_layers) {
 void s3gen_unload() {
     s3gen_model_cache_release();
 }
+
+// ============================================================================
+// QVAC-18422 — internal test hooks
+// ============================================================================
+//
+// Implementations of the read-only cache-state queries declared in
+// chatterbox_tts_test_hooks.h.  Defined here so they sit in the same
+// translation unit as the caches themselves and don't need any extra
+// linkage gymnastics.
+
+namespace tts_cpp::chatterbox::test_hooks {
+
+size_t time_mlp_result_cache_size() {
+    std::lock_guard<std::mutex> lk(g_time_emb_results_mu);  // NOTE(review): confirm this mutex also guards the mlp map
+    return g_time_mlp_results.size();
+}
+size_t time_emb_result_cache_size() {
+    std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+    return g_time_emb_results.size();
+}
+size_t weight_mirror_cache_size() {
+    std::lock_guard<std::mutex> lk(g_weight_cpu_mirror_mu);
+    return g_weight_cpu_mirror.size();
+}
+bool cfm_estimator_cache_built() {
+    // Lock-free read.  g_cfm_estimator_cache is mutated by
+    // s3gen_release_synth_caches (its g_synth_caches_mu covers the round-2
+    // caches, not this one) and by the per-synth fast path in
+    // cfm_estimator_forward.  Racing with either is a C++ data race, so
+    // call this only between synth/unload calls; the result is a snapshot.
+    return g_cfm_estimator_cache.ctx != nullptr;
+}
+bool cfm_estimator_cache_b2() {
+    return g_cfm_estimator_cache.b2;
+}
+uint32_t float_cache_key(float t_val) {
+    return g_float_bits(t_val);
+}
+uint64_t float_pair_cache_key(float t_val, float r_val) {
+    return g_float_pair_bits(t_val, r_val);
+}
+std::vector<float> peek_time_mlp_cached(float t_val) {
+    std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+    auto it = g_time_mlp_results.find(g_float_bits(t_val));
+    if (it == g_time_mlp_results.end()) return {};
+    return it->second;
+}
+
+}  // namespace tts_cpp::chatterbox::test_hooks
diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h
new file mode 100644
index 0000000..c9fdb91
--- /dev/null
+++ b/src/chatterbox_tts_test_hooks.h
@@ -0,0 +1,66 @@
+// Internal test hooks for chatterbox_tts.cpp's CPU optimisation caches.
+//
+// These declarations let tests in src/test_*.cpp inspect cache state that is
+// otherwise file-static.  They are deliberately NOT included in
+// include/tts-cpp/chatterbox/s3gen_pipeline.h because production callers must
+// not depend on cache layout.
+//
+// The hooks are populated by the persistent-cache work landed for QVAC-18422
+// (CPU-side multilingual perf): see PROGRESS.md §3.32.
+//
+// Rules:
+//  - Read-only.  Tests must NOT mutate cache state via these hooks; use
+//    the public s3gen_unload() helper if a clean slate is required.
+//  - Locking is internal.  The *_cache_size and peek_* hooks take the
+//    cache mutex, so they are safe (but may briefly block) during a
+//    concurrent synthesize(); the cfm_estimator_cache_* hooks do not lock.
+//  - Stable across the QVAC-18422 series.  Adding new caches must add new
+//    hooks rather than reshape existing ones.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace tts_cpp::chatterbox::test_hooks {
+
+// Number of (t_val) entries in the time_mlp result cache populated lazily
+// by compute_time_mlp_cached().  Multilingual = up to n_timesteps + 1
+// distinct t-values per process; Turbo = up to 3 (t_span = [0, 0.5, 1]).
+size_t time_mlp_result_cache_size();
+
+// Number of ((t_val, r_val)) entries in the time_mixed result cache used
+// only by the Turbo meanflow path.  Multilingual never populates this.
+size_t time_emb_result_cache_size();
+
+// Number of ggml_tensor* entries in the CPU weight mirror cache.
+// Populated by cached_cpu_weights_f32(); covers flow/input_embedding +
+// spk_embed_affine/{w,b} + any other weight that synthesize() reads via
+// ggml_backend_tensor_get on the hot path.
+size_t weight_mirror_cache_size();
+
+// True iff the persistent (global) cfm_estimator_cache currently holds
+// a built graph.  Initially false; flips to true after the first call to
+// cfm_estimator_forward() and stays true until s3gen_unload().
+bool cfm_estimator_cache_built();
+
+// Returns true iff the persistent cfm_estimator_cache last built a B=2
+// (CFG cond+uncond batched) graph.  Always false on CPU because the
+// CPU code path keeps use_b2 = false; useful for verifying that future
+// edits don't accidentally flip CPU into the B=2 path.
+bool cfm_estimator_cache_b2();
+
+// Cache key generators — exposed so tests can verify the hashing rules
+// for floats (bit-cast into uint32_t / uint64_t).  Important because
+// std::hash<float> mishandles -0.0 / +0.0 and NaN inconsistently across
+// libstdc++/libc++.
+uint32_t float_cache_key(float t_val);
+uint64_t float_pair_cache_key(float t_val, float r_val);
+
+// Returns the cached time_mlp output for `t_val` if present, or an
+// empty vector if there's no entry.  Lets tests probe whether a given
+// t-value was actually warmed without re-entering compute_time_mlp.
+std::vector<float> peek_time_mlp_cached(float t_val);
+
+}  // namespace tts_cpp::chatterbox::test_hooks
diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp
new file mode 100644
index 0000000..47e0d29
--- /dev/null
+++ b/src/test_cpu_caches.cpp
@@ -0,0 +1,366 @@
+// CPU-side persistent-cache validation harness for QVAC-18422
+// "[TTS GGML] Optimize cpp backend multilingual for CPU".
+//
+// Verifies the four cache layers added to chatterbox_tts.cpp:
+//
+//  1. compute_time_mlp_cached() — t_val (float) → (1024,) t_emb vector.
+//     Multilingual fires 10 distinct t-values per synth (cosine schedule);
+//     Turbo fires 3.  Across synth calls the schedule is constant, so the
+//     cache amortises every subsequent synth to zero compute_time_mlp work.
+//
+//  2. compute_time_emb_cached() — (t_val, r_val) → (1024,) mixed embedding.
+//     Turbo meanflow only; multilingual leaves this cache empty.
+//
+//  3. g_cfm_estimator_cache — promotes the local-scope cfm_estimator_cache
+//     to global lifetime so subsequent synth calls don't rebuild the
+//     ~5500-node CFM graph or pay the gallocr_reserve cost.
+//
+//  4. g_weight_cpu_mirror — CPU mirror of large per-synth weight reads
+//     (flow/input_embedding ~28 MB on multilingual, spk_embed_affine
+//     ~60 KB).  Saves the ggml_backend_tensor_get round-trip every synth.
+//
+// All caches are invalidated together by s3gen_unload() so that switching
+// to a different backend (e.g. CPU → Vulkan) doesn't reuse stale state.
+//
+// Usage (with model)        : ./test-cpu-caches MODEL_S3GEN.gguf [REF_DIR]
+// Usage (cache-key only)    : ./test-cpu-caches
+//
+// Without a GGUF the harness still runs the lightweight cache-key tests
+// that catch the typical -0/+0/NaN / std::hash<float> portability traps.
+
+#include "tts-cpp/chatterbox/s3gen_pipeline.h"
+#include "chatterbox_tts_test_hooks.h"
+#include "npy.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <chrono>
+#include <limits>
+#include <string>
+#include <sys/stat.h>
+#include <vector>
+
+namespace th = tts_cpp::chatterbox::test_hooks;
+
+namespace {
+
+int g_failures = 0;
+int g_checks   = 0;
+
+#define CHECK(cond, ...) do {                                            \
+    ++g_checks;                                                          \
+    if (!(cond)) {                                                       \
+        ++g_failures;                                                    \
+        fprintf(stderr, "FAIL %s:%d  %s\n        ",                      \
+                __FILE__, __LINE__, #cond);                              \
+        fprintf(stderr, __VA_ARGS__);                                    \
+        fprintf(stderr, "\n");                                           \
+    }                                                                    \
+} while (0)
+
+bool path_exists(const std::string & p) {
+    struct stat st; return ::stat(p.c_str(), &st) == 0;
+}
+
+double now_ms() {
+    using clock = std::chrono::steady_clock;
+    return std::chrono::duration<double, std::milli>(
+        clock::now().time_since_epoch()).count();
+}
+
+// ---------------- 1. cache-key bit-cast tests ----------------
+//
+// These run unconditionally — no model needed.  They guard the rule
+// that the time_mlp result cache uses a bit-cast hash of the float
+// (so +0/-0 land in different buckets, NaNs are stable per-bit-pattern,
+// and equal floats always hash to the same bucket regardless of how
+// they were computed).
+
+void test_cache_keys() {
+    fprintf(stderr, "=== cache key (bit-cast) tests ===\n");
+
+    // Equal floats → equal keys.
+    CHECK(th::float_cache_key(0.5f) == th::float_cache_key(0.5f),
+          "0.5 should be stable");
+
+    // +0.0 and -0.0 are NOT equal under bit-cast (sign bit differs).
+    // std::hash<float> typically collapses them — we deliberately don't.
+    const float pos_zero = 0.0f;
+    const float neg_zero = -0.0f;
+    CHECK(th::float_cache_key(pos_zero) != th::float_cache_key(neg_zero),
+          "+0 and -0 must produce distinct cache keys");
+
+    // Distinct values → distinct keys (sanity).
+    CHECK(th::float_cache_key(0.5f) != th::float_cache_key(0.25f),
+          "0.5 vs 0.25 must differ");
+
+    // NaN: bit-pattern stable (we don't normalise) — same NaN payload
+    // hashes the same.  This is fine because the time_mlp_cache is
+    // only ever queried with t_span values, none of which are NaN.
+    uint32_t nan_bits = 0x7fc00001u;  // a quiet NaN
+    float nan_val;
+    std::memcpy(&nan_val, &nan_bits, sizeof(float));
+    CHECK(th::float_cache_key(nan_val) == 0x7fc00001u,
+          "NaN bit pattern must round-trip");
+
+    // Pair key: high 32 bits = t_val, low 32 bits = r_val.
+    const float t = 0.5f;
+    const float r = 1.0f;
+    const uint64_t expect =
+        ((uint64_t) th::float_cache_key(t) << 32) |
+         (uint64_t) th::float_cache_key(r);
+    CHECK(th::float_pair_cache_key(t, r) == expect,
+          "pair key must compose from individual float keys");
+
+    // Order matters: (t, r) ≠ (r, t).
+    CHECK(th::float_pair_cache_key(0.5f, 1.0f) !=
+          th::float_pair_cache_key(1.0f, 0.5f),
+          "pair key must not be commutative");
+
+    // Cosine schedule used by multilingual (n_timesteps=10) — verify 10
+    // distinct keys.  Mirrors t_span = 1 - cos(i/10 * pi/2) from
+    // s3gen_synthesize_to_wav; pi is spelled out as M_PI is non-standard.
+    std::vector<uint32_t> keys;
+    keys.reserve(10);
+    for (int i = 0; i < 10; ++i) {
+        float tau = (float) i / 10.0f;
+        float t_cos = 1.0f - std::cos(tau * 0.5f * 3.14159265358979323846f);
+        keys.push_back(th::float_cache_key(t_cos));
+    }
+    bool all_distinct = true;
+    for (size_t i = 0; i < keys.size(); ++i) {
+        for (size_t j = i + 1; j < keys.size(); ++j) {
+            if (keys[i] == keys[j]) { all_distinct = false; break; }
+        }
+    }
+    CHECK(all_distinct,
+          "multilingual t-span (n_timesteps=10 cosine) must produce 10 "
+          "distinct cache keys, otherwise compute_time_mlp_cached would "
+          "alias unrelated steps");
+}
+
+// ---------------- 2. starting cache state ----------------
+
+void test_initial_state() {
+    fprintf(stderr, "=== initial cache state ===\n");
+
+    // s3gen_unload() before any synth must succeed even if no caches
+    // were ever populated (idempotent).  Production callers in the
+    // bare-addon teardown rely on this.
+    s3gen_unload();
+    CHECK(th::time_mlp_result_cache_size() == 0,
+          "time_mlp result cache must start empty");
+    CHECK(th::time_emb_result_cache_size() == 0,
+          "time_emb result cache must start empty");
+    CHECK(th::weight_mirror_cache_size() == 0,
+          "weight mirror cache must start empty");
+    CHECK(!th::cfm_estimator_cache_built(),
+          "persistent cfm_estimator_cache must not be built before any "
+          "synth");
+    CHECK(!th::cfm_estimator_cache_b2(),
+          "persistent cfm_estimator_cache b2 flag must default false");
+}
+
+// ---------------- 3. determinism + cache wiring on a real synth ----------
+
+// Fixed synthetic speech tokens for every synth below.  No multilingual
+// model was available locally, so with REF_DIR (argv[2]) omitted the
+// harness runs on the Turbo built-in voice.  The cache logic is
+// model-agnostic by construction; the multilingual benefit is larger but
+// the bit-exact + lifecycle invariants verified here are the same.
+std::vector<int32_t> sample_speech_tokens() {
+    // 24 tokens — enough to exercise the encoder + a single CFM batch
+    // without bloating run-time.  Values are within [0, 6561) (S3 vocab).
+    return {
+        12, 34, 56, 78, 90, 121, 152, 173, 195, 217, 239, 261,
+        283, 305, 327, 349, 371, 393, 415, 437, 459, 481, 503, 525,
+    };
+}
+
+bool synthesize_once(const std::string & gguf,
+                     const std::string & ref_dir,
+                     std::vector<float> & wav,
+                     double & wall_ms) {
+    s3gen_synthesize_opts opts;
+    opts.s3gen_gguf_path = gguf;
+    opts.ref_dir         = ref_dir;
+    opts.out_wav_path    = "";          // stay in-memory
+    opts.pcm_out         = &wav;
+    opts.seed            = 42;
+    opts.n_threads       = 0;           // auto: hardware_concurrency
+    opts.sr              = 24000;
+    opts.verbose         = false;
+    opts.n_gpu_layers    = 0;           // CPU-only for this test
+    opts.apply_trim_fade = true;
+    opts.finalize        = true;
+
+    const auto tokens = sample_speech_tokens();
+    const double t0 = now_ms();
+    int rc = s3gen_synthesize_to_wav(tokens, opts);
+    wall_ms = now_ms() - t0;
+    return rc == 0 && !wav.empty();
+}
+
+void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
+                                             const std::string & ref_dir) {
+    fprintf(stderr, "=== warm-cache bit-exact + lifecycle ===\n");
+
+    // First call populates every cache.  Subsequent calls must (a)
+    // produce bit-exact output and (b) skip every cache that was
+    // already warmed.
+    std::vector<float> wav_a, wav_b, wav_c;
+    double t_a = 0, t_b = 0, t_c = 0;
+    if (!synthesize_once(gguf, ref_dir, wav_a, t_a)) {
+        fprintf(stderr, "skip: synth #1 failed (model load / arch?)\n");
+        return;
+    }
+
+    const size_t n_time_mlp_after_a = th::time_mlp_result_cache_size();
+    const size_t n_time_emb_after_a = th::time_emb_result_cache_size();
+    const size_t n_weights_after_a  = th::weight_mirror_cache_size();
+    const bool   cfm_built_after_a  = th::cfm_estimator_cache_built();
+
+    CHECK(cfm_built_after_a,
+          "after first synth, persistent cfm_estimator_cache must be built");
+    CHECK(n_time_mlp_after_a > 0,
+          "after first synth, time_mlp result cache must have at least one "
+          "entry (n_timesteps for multilingual / 3 for Turbo)");
+    CHECK(n_weights_after_a > 0,
+          "after first synth, weight_mirror_cache must have at least one "
+          "entry (input_embedding + spk_embed_affine/{w,b})");
+    fprintf(stderr,
+            "  synth #1: time_mlp=%zu  time_emb=%zu  weights=%zu  cfm=%s "
+            "(%.1f ms)\n",
+            n_time_mlp_after_a, n_time_emb_after_a, n_weights_after_a,
+            cfm_built_after_a ? "built" : "fresh", t_a);
+
+    // Second call: every cache must already be warm.  Its size must
+    // not grow because the t-schedule and the model weights are
+    // constant across synth calls.
+    if (!synthesize_once(gguf, ref_dir, wav_b, t_b)) {
+        CHECK(false, "synth #2 failed although synth #1 succeeded");
+        return;
+    }
+    CHECK(th::time_mlp_result_cache_size() == n_time_mlp_after_a,
+          "synth #2 must NOT add new time_mlp entries (saw %zu, expected %zu)",
+          th::time_mlp_result_cache_size(), n_time_mlp_after_a);
+    CHECK(th::time_emb_result_cache_size() == n_time_emb_after_a,
+          "synth #2 must NOT add new time_emb entries");
+    CHECK(th::weight_mirror_cache_size() == n_weights_after_a,
+          "synth #2 must NOT add new weight_mirror entries");
+    CHECK(th::cfm_estimator_cache_built(),
+          "synth #2 must keep the persistent cfm graph built");
+
+    CHECK(wav_a.size() == wav_b.size(),
+          "warm-cache synth #2 wav length must match cold-cache synth #1 "
+          "(%zu vs %zu)", wav_a.size(), wav_b.size());
+    if (wav_a.size() == wav_b.size() && !wav_a.empty()) {
+        size_t diff = 0;
+        float  max_abs = 0;
+        for (size_t i = 0; i < wav_a.size(); ++i) {
+            float d = std::fabs(wav_a[i] - wav_b[i]);
+            if (d > 0) diff++;
+            if (d > max_abs) max_abs = d;
+        }
+        CHECK(diff == 0,
+              "warm-cache synth #2 must be byte-for-byte identical to "
+              "synth #1 (mismatched samples=%zu, max_abs=%.6e)", diff, max_abs);
+    }
+    fprintf(stderr, "  synth #2: %.1f ms (warm caches, bit-exact ok)\n", t_b);
+
+    // Third call after s3gen_unload() — every cache must have been
+    // reset.  Subsequent synth must repopulate them and still
+    // produce bit-exact output (deterministic seed=42).
+    s3gen_unload();
+    CHECK(th::time_mlp_result_cache_size() == 0,
+          "s3gen_unload must clear time_mlp result cache");
+    CHECK(th::time_emb_result_cache_size() == 0,
+          "s3gen_unload must clear time_emb result cache");
+    CHECK(th::weight_mirror_cache_size() == 0,
+          "s3gen_unload must clear weight_mirror cache");
+    CHECK(!th::cfm_estimator_cache_built(),
+          "s3gen_unload must tear down the persistent cfm cache");
+
+    // Idempotent: a second unload must not crash or produce errors.
+    s3gen_unload();
+
+    if (!synthesize_once(gguf, ref_dir, wav_c, t_c)) {
+        CHECK(false, "synth #3 (post-unload) failed although synth #1 succeeded");
+        return;
+    }
+    CHECK(th::cfm_estimator_cache_built(),
+          "synth #3 must rebuild the cfm cache after unload");
+    CHECK(wav_a.size() == wav_c.size(),
+          "post-unload synth wav length must match");
+    if (wav_a.size() == wav_c.size() && !wav_a.empty()) {
+        size_t diff = 0;
+        float max_abs = 0;
+        for (size_t i = 0; i < wav_a.size(); ++i) {
+            float d = std::fabs(wav_a[i] - wav_c[i]);
+            if (d > 0) diff++;
+            if (d > max_abs) max_abs = d;
+        }
+        CHECK(diff == 0,
+              "post-unload synth must be byte-for-byte identical to first "
+              "synth (mismatched samples=%zu, max_abs=%.6e)",
+              diff, max_abs);
+    }
+    fprintf(stderr, "  synth #3 (post-unload): %.1f ms — bit-exact ok\n", t_c);
+
+    // peek_time_mlp_cached: warm value should round-trip.
+    auto cosine_t = [](int i, int n) {
+        float tau = (float) i / (float) n;
+        return 1.0f - std::cos(tau * 0.5f * 3.14159265358979323846f);  // pi; M_PI is non-standard
+    };
+    // For Turbo (meanflow=true, n_timesteps=2) the schedule is linear:
+    // [0, 0.5, 1.0].  For multilingual (cosine, n_timesteps=10) the
+    // schedule is cosine.  We probe both candidates non-destructively;
+    // at least one of {0.5f, cosine_t(1,10)} should be present.
+    auto a = th::peek_time_mlp_cached(0.5f);
+    auto b = th::peek_time_mlp_cached(cosine_t(1, 10));
+    CHECK(!a.empty() || !b.empty(),
+          "peek_time_mlp_cached must return a populated entry for at least "
+          "one of the canonical t-values (0.5 for Turbo or cosine[1] for "
+          "multilingual)");
+    if (!a.empty()) {
+        CHECK(a.size() == 1024,
+              "time_mlp cached entry must be (1024,) — saw %zu", a.size());
+    }
+    if (!b.empty()) {
+        CHECK(b.size() == 1024,
+              "time_mlp cached entry must be (1024,) — saw %zu", b.size());
+    }
+}
+
+}  // namespace
+
+int main(int argc, char ** argv) {
+    fprintf(stderr, "test-cpu-caches: QVAC-18422 cache validation\n");
+
+    test_cache_keys();
+    test_initial_state();
+
+    if (argc < 2) {
+        fprintf(stderr, "\n(no GGUF given — skipping warm-cache + lifecycle "
+                        "tests; run as `%s MODEL.gguf [REF_DIR]` to exercise "
+                        "the full pipeline)\n", argv[0]);
+    } else {
+        const std::string gguf = argv[1];
+        const std::string ref_dir = (argc >= 3 ? argv[2] : "");
+        if (!path_exists(gguf)) {
+            fprintf(stderr, "error: GGUF not found at %s\n", gguf.c_str());
+            return 2;
+        }
+        test_warm_cache_bit_exact_and_lifecycle(gguf, ref_dir);
+    }
+
+    // Always release at exit so the next test invocation starts clean.
+    s3gen_unload();
+
+    fprintf(stderr, "\n=== summary ===\n  checks:   %d\n  failures: %d\n",
+            g_checks, g_failures);
+    return g_failures == 0 ? 0 : 1;
+}

From 4d18b37dfe0ff9d99d887541d60dc57aab7a6d81 Mon Sep 17 00:00:00 2001
From: Zbigniew Herman <zbigniew.herman@tether.io>
Date: Tue, 5 May 2026 13:03:21 +0200
Subject: [PATCH 2/6] QVAC-18422 [TTS GGML] Optimize cpp backend multilingual
 for CPU (round 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PROGRESS.md §3.33 — persistent encoder/HiFT/F0 graph caches +
pos_emb / inv_alpha / hann_window / istft_kernel / window_sum
scaffolding caches on top of the round-1 CFM caches (§3.32).
Turbo single-utterance S3GEN_INFER_MS -22 %, streaming wall -27 %.
Tests: 79/79 pass (49 new round-2 checks).
---
 src/chatterbox_tts.cpp          | 652 +++++++++++++++++---------------
 src/chatterbox_tts_test_hooks.h |  31 ++
 src/test_cpu_caches.cpp         | 241 +++++++++++-
 3 files changed, 616 insertions(+), 308 deletions(-)

diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp
index 1bb139c..afd87e4 100644
--- a/src/chatterbox_tts.cpp
+++ b/src/chatterbox_tts.cpp
@@ -443,6 +443,197 @@ static ggml_tensor * reflect_pad_1d(ggml_context * ctx, ggml_tensor * x, int p_l
     return y;
 }
 
+// ============================================================================
+// QVAC-18422 — CPU-side persistent caches (multilingual TTS optimisation)
+// ============================================================================
+//
+// Round 1 (already shipped above) targeted three host-side bottlenecks:
+//   (a) compute_time_mlp graph submissions (10× / synth on multilingual)
+//   (b) the local-scope cfm_estimator_cache rebuild on every synth
+//   (c) per-synth ggml_backend_tensor_get of the 13–28 MB
+//       flow/input_embedding + the speaker affine matrices
+//
+// Round 2 closes the gap between "host overhead" and "real compute" for
+// the remaining per-synth pipeline stages:
+//
+//   (d) S3Gen Conformer encoder graph + gallocator built from scratch
+//       every synth (~700 nodes; ~3-5 ms saved per synth)
+//   (e) HiFT decoder graph built from scratch every synth (~3000 nodes
+//       across 3 upsample stages × 9 ResBlocks; ~10-30 ms saved)
+//   (f) F0 predictor graph built every synth (~25 nodes; <1 ms saved)
+//   (g) compute_pos_emb result (T trig ops, fired twice per encoder run)
+//   (h) build_hann_window / build_istft_kernel scaffolding for HiFT
+//       (~1.85M F32 mults + cos/sin in build_istft_kernel alone)
+//   (i) build_window_sum scaffolding (T_stft × n_fft F32 ops)
+//   (j) invert_alpha_cpu fired ~72× per HiFT call (12 ResBlocks × 6
+//       alpha tensors; each does a tensor_get + per-element reciprocal)
+//
+// Every cache is process-wide, keyed by the shape parameters that
+// drive graph topology (so streaming chunks of varying length still
+// produce correct output — the cache rebuilds when its key
+// diverges).  Cleanup happens in s3gen_release_synth_caches before
+// ggml_backend_free, so the gallocators in the graph caches release
+// against a still-valid backend.
+
+// Generic per-stage graph cache (encoder / HiFT / F0 predictor).  Owns
+// the ggml_context, graph, and gallocator; there is no destructor, so
+// they are released only by destroy().  `key` encodes the shape params
+// that drive graph topology (e.g. T for the encoder, pack(T_mel, T_stft)
+// for HiFT); a build is reused iff keys match.  -1 = "no graph built".
+struct graph_cache {
+    int64_t                key = -1;  // shape key of the cached build
+    ggml_context *         ctx = nullptr;  // released with ggml_free in destroy()
+    ggml_cgraph *          gf  = nullptr;  // only nulled in destroy(), never freed directly
+    ggml_gallocr_t         allocr = nullptr;  // released with ggml_gallocr_free in destroy()
+    std::vector<uint8_t>   buf;  // survives destroy() so the next rebuild can reuse it
+
+    void destroy() {  // idempotent: every step is null-guarded or a plain reset
+        if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; }
+        if (ctx)    { ggml_free(ctx);            ctx    = nullptr; }
+        gf  = nullptr;
+        key = -1;
+        // Keep `buf` reservation; reusing it avoids a multi-MB malloc
+        // on the next rebuild.
+    }
+};
+
+// Pack (T_mel, T_stft) into a single int64_t key for the HiFT graph
+// cache: T_mel goes in the high 32 bits, T_stft in the low 32.  Both
+// are positive int32 in practice, so distinct pairs never collide.
+static int64_t pack_hift_key(int T_mel, int T_stft) {
+    return ((int64_t) T_mel << 32) | (uint32_t) T_stft;
+}
+
+// Round-1 CFM estimator graph cache (struct definition; the global
+// instance lives in the cache-state block below alongside the round-2
+// graph caches).  Cache key is (T, b2): a graph built for batch=1
+// (cfm_estimator_forward) cannot be reused for the batch=2 path
+// (cfm_estimator_forward_b2) since the input tensor layouts differ
+// (ne[2] = 1 vs 2).  `use_b2` is constant per `s3gen_synthesize_to_wav`
+// invocation, but the instance outlives a call, so b2 has to be in the
+// key: keyed on T alone, a later call with the other mode — or a future
+// mid-utterance switch (e.g. CFG warm-up where step 0 is single-pass
+// and steps 1+ are batched) — would silently reuse a wrong-shape
+// graph and crash inside the allocator.
+struct cfm_estimator_cache {
+    int  T  = -1;  // key part 1; destroy() resets it to -1
+    bool b2 = false;  // key part 2: true for the batch=2 graph
+    ggml_context * ctx = nullptr;
+    ggml_cgraph * gf = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    std::vector<uint8_t> buf;
+    ~cfm_estimator_cache() {  // safety net for instances that do go out of scope
+        if (allocr) ggml_gallocr_free(allocr);
+        if (ctx) ggml_free(ctx);
+    }
+    // Explicit reset usable from s3gen_release_synth_caches() — the
+    // global instance never goes out of scope, so the destructor alone
+    // wouldn't run before ggml_backend_free in the normal teardown
+    // ordering.  Idempotent.
+    void destroy() {
+        if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; }
+        if (ctx)    { ggml_free(ctx);            ctx    = nullptr; }
+        gf  = nullptr;
+        T   = -1;
+        b2  = false;
+        // Keep `buf` allocated — it's just a heap arena, no backend
+        // resource bound to it.  Reusing it avoids a 64 MB malloc on
+        // the next synth.
+    }
+};
+
+// Bit-cast cache keys for floats — avoids ambiguous std::hash<float> behaviour
+// on -0.0/+0.0 and NaN bit patterns.  Tested by test_cpu_caches.cpp::test_cache_keys.
+static uint32_t g_float_bits(float t_val) {
+    static_assert(sizeof(uint32_t) == sizeof(float), "float must be 32-bit for bitcast key");
+    uint32_t bits;
+    std::memcpy(&bits, &t_val, sizeof(bits));
+    return bits;
+}
+static uint64_t g_float_pair_bits(float t_val, float r_val) {
+    return ((uint64_t) g_float_bits(t_val) << 32) | (uint64_t) g_float_bits(r_val);  // t high, r low
+}
+
+namespace {
+// Single mutex around every cache.  Held only across cache-state
+// mutations (insert / clear / size queries), not across the heavy
+// compute itself.
+static std::mutex                                                        g_synth_caches_mu;
+
+// Round 1 caches (keys: g_float_bits(t); g_float_pair_bits(t, r); tensor*; (T, b2)).
+static std::unordered_map<uint32_t, std::vector<float>>                  g_time_mlp_results;
+static std::unordered_map<uint64_t, std::vector<float>>                  g_time_emb_results;
+static std::unordered_map<const ggml_tensor *, std::vector<float>>       g_weight_cpu_mirror;
+static cfm_estimator_cache                                               g_cfm_estimator_cache;
+
+// Round 2 graph caches (keys: encoder T; HiFT pack(T_mel, T_stft); F0 T_mel).
+static graph_cache                                                       g_encoder_graph_cache;
+static graph_cache                                                       g_hift_graph_cache;
+static graph_cache                                                       g_f0_graph_cache;
+// Parallel metadata for HiFT: the (graph-input-name, model-tensor-ptr)
+// pairs for every alpha tensor referenced by the cached HiFT graph.
+// Used on cache hits to refresh each alpha-input slot with the data
+// from g_inv_alpha_results without rebuilding the graph.
+static std::vector<std::pair<std::string, const ggml_tensor *>>          g_hift_inv_alpha_entries;
+
+// Round 2 result caches (pure-compute scaffolding); keys: (T,D), tensor*, n_fft, n_fft, (n_fft,hop,T_stft).
+static std::unordered_map<int64_t, std::vector<float>>                   g_pos_emb_results;
+static std::unordered_map<const ggml_tensor *, std::vector<float>>       g_inv_alpha_results;
+static std::unordered_map<int, std::vector<float>>                       g_hann_window_cache;
+static std::unordered_map<int, std::vector<float>>                       g_istft_kernel_cache;
+static std::unordered_map<int64_t, std::vector<float>>                   g_window_sum_cache;
+}  // namespace
+
+// Cached F32 mirror of a model tensor.  Returns a pointer into the
+// cache (nullptr when `t` is null), valid until s3gen_unload() /
+// s3gen_release_synth_caches() clears the mirror.  Caller must NOT free.
+//
+// First call: ggml_backend_tensor_get into a freshly allocated
+// std::vector<float>.  Subsequent calls: hit-cache and return the
+// existing pointer.  The source tensor must be F32 (not checked here);
+// chatterbox's bandwidth-heavy per-synth weights (input_embedding,
+// spk_embed_affine/{w,b}) all live as F32, so a templated variant for
+// F16/Q8_0 isn't needed here.
+static const float * cached_cpu_weights_f32(const ggml_tensor * t) {
+    if (!t) return nullptr;
+    {
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+        auto it = g_weight_cpu_mirror.find(t);
+        if (it != g_weight_cpu_mirror.end()) {
+            return it->second.data();
+        }
+    }
+    // Read outside the lock (the get is ~ms-scale on a GPU backend).
+    std::vector<float> staged(ggml_nelements(t));
+    ggml_backend_tensor_get(t, staged.data(), 0, ggml_nbytes(t));
+
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    auto [it, inserted] = g_weight_cpu_mirror.try_emplace(t, std::move(staged));  // no-op if a racing caller inserted first
+    return it->second.data();
+}
+
+// Tear down every per-synth cache.  Safe to call multiple times; safe
+// before/after s3gen_model_cache_release.  The mutex only serialises the
+// teardown itself — it is not held across a synth, so the caller must
+// make sure no synth is mid-flight on another thread (gallocr_free on
+// a graph that's about to be reused is undefined).
+static void s3gen_release_synth_caches() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    g_cfm_estimator_cache.destroy();  // graph caches: free gallocr + ggml_context, reset key
+    g_encoder_graph_cache.destroy();
+    g_hift_graph_cache.destroy();
+    g_f0_graph_cache.destroy();
+    g_hift_inv_alpha_entries.clear();
+    g_time_mlp_results.clear();
+    g_time_emb_results.clear();
+    g_weight_cpu_mirror.clear();  // invalidates pointers handed out by cached_cpu_weights_f32
+    g_pos_emb_results.clear();  // invalidates references handed out by cached_pos_emb
+    g_inv_alpha_results.clear();  // invalidates references handed out by cached_inv_alpha
+    g_hann_window_cache.clear();
+    g_istft_kernel_cache.clear();
+    g_window_sum_cache.clear();
+}
+
 // ============================================================================
 // Encoder (Conformer) — produces mu for CFM
 // ============================================================================
@@ -547,85 +738,14 @@ static ggml_tensor * conformer_block(ggml_context * ctx, const conformer_w & w,
     return ggml_add(ctx, residual, ff);
 }
 
-// ============================================================================
-// QVAC-17872 round 2: persistent graph + scaffolding caches (declarations).
-// ----------------------------------------------------------------------------
-// All host-side, model-agnostic, no GGUF-format change.  Same teardown
-// discipline as g_cfm_estimator_cache (destroy() before ggml_backend_free).
-//
-// Targeted bottlenecks on multilingual on Vulkan (after round-1 / round-HIFT
-// already shipped):
-//   - run_encoder rebuilds its full graph + gallocr per synth (~17 ms host
-//     overhead on multilingual T=350+).
-//   - run_hift_decode rebuilds its graph + gallocr + computes
-//     hann_window/istft_kernel/window_sum + ~72 inv_alpha tensor_get calls
-//     per synth (~7-10 ms compounded host overhead, multilingual is the
-//     biggest beneficiary because audio length scales with the prompt).
-//   - run_f0_predictor rebuilds its (smaller) graph per synth.
-//   - compute_pos_emb fires twice per encoder run (for T and 2T) at
-//     ~T*D*5 trig ops; multilingual chunks of T~350+ pay several ms.
-//
-// Each cache is process-wide; the steady-state size is small (1-2 entries
-// per shape key) and bounded by the number of distinct shapes the running
-// process sees.  Streaming sessions with many varying T values can grow
-// these caches; a future LRU bound would belong here.
-//
-// The cache state lives here (above run_encoder so its definition can use
-// it).  The destroy/clear function `s3gen_release_synth_caches()` is
-// defined later, alongside g_cfm_estimator_cache, since it touches both.
-// ============================================================================
-
-// Generic graph cache used by encoder / HiFT / F0 — same shape, different keys.
-struct graph_cache {
-    int64_t                key = -1;
-    ggml_context *         ctx = nullptr;
-    ggml_cgraph *          gf  = nullptr;
-    ggml_gallocr_t         allocr = nullptr;
-    std::vector<uint8_t>   buf;
-
-    void destroy() {
-        if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; }
-        if (ctx)    { ggml_free(ctx);            ctx    = nullptr; }
-        gf  = nullptr;
-        key = -1;
-        // Keep `buf` reservation; reusing it avoids a multi-MB malloc on
-        // the next rebuild.
-    }
-};
-
-// Pack (T_mel, T_stft) into a single int64_t key for the HiFT graph cache.
-// Both dimensions are positive int32 in practice; combining them this way
-// gives a unique key with no collision.
-static int64_t pack_hift_key(int T_mel, int T_stft) {
-    return ((int64_t) T_mel << 32) | (uint32_t) T_stft;
-}
-
-namespace {
-// Single mutex around every round-2 cache.  Held only across cache-state
-// mutations (insert / clear / size queries), not across the heavy compute
-// or graph rebuilds themselves.  s3gen_synthesize_to_wav is process-serial
-// in practice (the existing s3gen_cache_entry mutex enforces single-flight
-// model loads), so contention is effectively zero.
-static std::mutex g_synth_caches_mu;
-
-// Graph caches.
-static graph_cache g_encoder_graph_cache;   // keyed on T (encoder input length)
-static graph_cache g_hift_graph_cache;      // keyed on pack(T_mel, T_stft)
-static graph_cache g_f0_graph_cache;        // keyed on T_mel
-
-// Parallel metadata for HiFT: the (graph-input-name, model-tensor-ptr)
-// pairs for every alpha tensor referenced by the cached HiFT graph.
-// Used on cache hits to refresh each alpha-input slot from the data in
-// g_inv_alpha_results without rebuilding the graph.
-static std::vector<std::pair<std::string, const ggml_tensor *>> g_hift_inv_alpha_entries;
-
-// Result / scaffolding caches (pure CPU compute).
-static std::unordered_map<int64_t, std::vector<float>>             g_pos_emb_results;
-static std::unordered_map<const ggml_tensor *, std::vector<float>> g_inv_alpha_results;
-static std::unordered_map<int, std::vector<float>>                 g_hann_window_cache;
-static std::unordered_map<int, std::vector<float>>                 g_istft_kernel_cache;
-static std::unordered_map<int64_t, std::vector<float>>             g_window_sum_cache;
-}  // namespace
+// QVAC-17872 round 2 / QVAC-18422 round 2: the `graph_cache` struct,
+// `pack_hift_key`, and the cache-state globals (g_encoder_graph_cache,
+// g_hift_graph_cache, g_f0_graph_cache, g_hift_inv_alpha_entries,
+// g_pos_emb_results, g_inv_alpha_results, g_hann_window_cache,
+// g_istft_kernel_cache, g_window_sum_cache) all live in the QVAC-18422
+// CPU-side cache block earlier in this file — declared above run_encoder
+// so its definition can use them, and torn down in
+// s3gen_release_synth_caches() against the still-live backend.
 
 // Scaffolding-helper forward declarations (definitions live later, alongside
 // the cfm_estimator_cache + cached_cpu_weights_f32 helpers, where the
@@ -664,6 +784,24 @@ static void compute_pos_emb(std::vector<float> & pe, int T, int D) {
     }
 }
 
+// QVAC-18422 round 2: cached wrapper around compute_pos_emb, keyed by
+// pack(T, D) (for chatterbox D is constant=512; T follows the encoder
+// input length), so repeat calls at the same T skip the recompute.  The
+// returned reference is only valid until s3gen_release_synth_caches().
+static const std::vector<float> & cached_pos_emb(int T, int D) {
+    const int64_t key = ((int64_t) T << 32) | (uint32_t) D;  // T high 32 bits, D low 32
+    {
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+        auto it = g_pos_emb_results.find(key);
+        if (it != g_pos_emb_results.end()) return it->second;
+    }
+    std::vector<float> pe;
+    compute_pos_emb(pe, T, D);  // computed with the mutex released
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    auto [it, inserted] = g_pos_emb_results.try_emplace(key, std::move(pe));  // no-op if the key was inserted meanwhile
+    return it->second;
+}
+
 // Run the full S3Gen encoder: input (T, D=512) -> mu (2T, 80)
 // QVAC-17872 round 2: graph + gallocator cached process-wide via
 // g_encoder_graph_cache (keyed on T = encoder input length).  Same-shape
@@ -1134,39 +1272,25 @@ static std::vector<float> compute_time_mixed(const model_ctx & m,
     return out;
 }
 
-// QVAC-17872 round-HIFT: memoised time-embedding pipeline.  Both Turbo
-// (meanflow, t_span = [0, 0.5, 1]) and multilingual (cosine-scheduled, 10
-// steps) produce the same set of t-values across all subsequent synth
-// calls — the t-embedding outputs are deterministic functions of t (and
-// the model weights), so we can cache them.
-//
-// Two-layer cache:
-//   - g_time_mlp_results: keyed by uint32_t bitcast of t_val, used by
-//     both paths.  Multilingual benefits the most (10 distinct t-values
-//     repeated across every synth).
-//   - g_time_emb_results: keyed by uint64_t = (kt << 32) | kr, ONLY
-//     used by Turbo (meanflow) since multilingual doesn't run the mixer.
-//
-// Cleared in s3gen_release_synth_caches alongside the graph cache.
+// QVAC-18422: memoised time-embedding pipeline.  Both Turbo (meanflow,
+// t_span = [0, 0.5, 1]) and multilingual (cosine-scheduled, 10 steps)
+// produce the same set of t-values across all subsequent synth calls —
+// the t-embedding outputs are deterministic functions of t (and the
+// model weights), so we cache them.  Globals + mutex live in the
+// QVAC-18422 anonymous namespace block earlier in this file.
 //
 // Bit-exactness: trivially preserved — same compute, just memoised.
-static std::unordered_map<uint32_t, std::vector<float>> g_time_mlp_results;
-static std::unordered_map<uint64_t, std::vector<float>> g_time_emb_results;
-static std::mutex                                       g_time_emb_results_mu;
-
 static std::vector<float> compute_time_mlp_cached(const model_ctx & m, float t_val) {
-    uint32_t key;
-    static_assert(sizeof(key) == sizeof(t_val), "float must be 32-bit for bitcast key");
-    std::memcpy(&key, &t_val, sizeof(key));
+    const uint32_t key = g_float_bits(t_val);
     {
-        std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
         auto it = g_time_mlp_results.find(key);
         if (it != g_time_mlp_results.end()) return it->second;
     }
     auto out = compute_time_mlp(m, t_val);
     {
-        std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
-        g_time_mlp_results.emplace(key, out);
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+        g_time_mlp_results.try_emplace(key, out);
     }
     return out;
 }
@@ -1174,12 +1298,9 @@ static std::vector<float> compute_time_mlp_cached(const model_ctx & m, float t_v
 // Used only by the meanflow (Turbo) path — multilingual doesn't run
 // time_embed_mixer.  Caches the full t_emb pipeline by (t, r) pair.
 static std::vector<float> compute_time_emb_cached(const model_ctx & m, float t_val, float r_val) {
-    uint32_t kt, kr;
-    std::memcpy(&kt, &t_val, sizeof(kt));
-    std::memcpy(&kr, &r_val, sizeof(kr));
-    const uint64_t key = ((uint64_t)kt << 32) | (uint64_t)kr;
+    const uint64_t key = g_float_pair_bits(t_val, r_val);
     {
-        std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
         auto it = g_time_emb_results.find(key);
         if (it != g_time_emb_results.end()) return it->second;
     }
@@ -1187,130 +1308,18 @@ static std::vector<float> compute_time_emb_cached(const model_ctx & m, float t_v
     auto r_mlp = compute_time_mlp_cached(m, r_val);
     auto out = compute_time_mixed(m, t_mlp, r_mlp);
     {
-        std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
-        g_time_emb_results.emplace(key, out);
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+        g_time_emb_results.try_emplace(key, out);
     }
     return out;
 }
 
-// Cached CFM estimator state — graph is built once and reused across steps.
-//
-// Cache key is (T, b2): a graph built for batch=1 (cfm_estimator_forward) cannot
-// be reused for the batch=2 path (cfm_estimator_forward_b2) since the input
-// tensor layouts differ (ne[2] = 1 vs 2).  Today `use_b2` is constant per
-// `s3gen_synthesize_to_wav` invocation and the cache lives on the stack of
-// that one call, so a single key would be safe — but a future change that
-// switches modes mid-utterance (e.g. CFG warm-up where step 0 is single-pass
-// and steps 1+ are batched) would silently reuse a wrong-shape graph and
-// crash inside the allocator.
-struct cfm_estimator_cache {
-    int  T  = -1;
-    bool b2 = false;
-    ggml_context * ctx = nullptr;
-    ggml_cgraph * gf = nullptr;
-    ggml_gallocr_t allocr = nullptr;
-    std::vector<uint8_t> buf;
-    // QVAC-17872 round-HIFT: explicit destroy() so the cache can be a
-    // process-global tied to the s3gen-model lifecycle.  See
-    // s3gen_model_cache_release: invoked BEFORE ggml_backend_free, which
-    // is the same constraint the existing thread_local time_mlp_cache
-    // documents (Vulkan/Metal device-teardown ordering at process exit).
-    void destroy() {
-        if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; }
-        if (ctx)    { ggml_free(ctx);            ctx    = nullptr; }
-        gf  = nullptr;
-        T   = -1;
-        b2  = false;
-        buf = std::vector<uint8_t>();
-    }
-    // Destructor kept as a safety net for non-cached usages (e.g. tests
-    // that allocate a cfm_estimator_cache on the stack).  The global
-    // g_cfm_estimator_cache is explicitly destroyed via
-    // s3gen_model_cache_release before backend teardown.
-    ~cfm_estimator_cache() {
-        if (allocr) ggml_gallocr_free(allocr);
-        if (ctx) ggml_free(ctx);
-    }
-};
-
-// QVAC-17872 round-HIFT: persistent CFM estimator graph.  Was local-scope
-// in s3gen_synthesize_to_wav() before, so every synth call paid the full
-// graph rebuild cost (CFM has ~5500 ggml ops + gallocr_reserve allocates
-// the device-side buffer pool).  Persistent global with explicit destroy()
-// eliminates the rebuild on synth calls 2..N when T matches.
-static cfm_estimator_cache g_cfm_estimator_cache;
-
-// QVAC-17872 round-HIFT: CPU-side mirror of large model weights that
-// synthesize() reads every call (input_embedding lookup table, speaker
-// affine matrix).  These are model constants — on a GPU backend each
-// call previously paid an N MB device→host download per synth.  Cleared
-// in s3gen_release_synth_caches alongside the graph cache.
-static std::unordered_map<const ggml_tensor *, std::vector<float>> g_weight_cpu_mirror;
-static std::mutex                                                  g_weight_cpu_mirror_mu;
-
-static const float * cached_cpu_weights_f32(const ggml_tensor * t) {
-    {
-        std::lock_guard<std::mutex> lk(g_weight_cpu_mirror_mu);
-        auto it = g_weight_cpu_mirror.find(t);
-        if (it != g_weight_cpu_mirror.end()) return it->second.data();
-    }
-    std::vector<float> data(ggml_nelements(t));
-    ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
-    {
-        std::lock_guard<std::mutex> lk(g_weight_cpu_mirror_mu);
-        auto [it, inserted] = g_weight_cpu_mirror.emplace(t, std::move(data));
-        return it->second.data();
-    }
-}
-
-// QVAC-18422: bit-cast cache key helpers used by the test-hooks bridge
-// to query g_time_mlp_results / g_time_emb_results without re-deriving
-// the (uint32_t / uint64_t) keys that compute_time_mlp_cached and
-// compute_time_emb_cached compute inline above.  Defined here so the
-// test_hooks namespace at the bottom of the file can call them.
-static uint32_t g_float_bits(float t_val) {
-    uint32_t bits;
-    std::memcpy(&bits, &t_val, sizeof(bits));
-    return bits;
-}
-static uint64_t g_float_pair_bits(float t_val, float r_val) {
-    return ((uint64_t) g_float_bits(t_val) << 32) | (uint64_t) g_float_bits(r_val);
-}
-
-// QVAC-17872 round 2: definition of s3gen_release_synth_caches (forward-
-// declared near s3gen_model_cache_release).  Defined here once the
-// graph_cache + cfm_estimator_cache structs and globals are all visible.
-// Idempotent — safe to call multiple times and from multiple release paths.
-//
-// Order matters: graph caches first (they own gallocr_t handles bound to
-// the still-live backend); then result caches; then the round-1 caches.
-// The graph_cache struct + globals themselves are declared earlier (above
-// run_encoder) — see "QVAC-17872 round 2: persistent graph + scaffolding
-// caches" block.
-static void s3gen_release_synth_caches() {
-    {
-        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
-        g_encoder_graph_cache.destroy();
-        g_hift_graph_cache.destroy();
-        g_f0_graph_cache.destroy();
-        g_hift_inv_alpha_entries.clear();
-        g_pos_emb_results.clear();
-        g_inv_alpha_results.clear();
-        g_hann_window_cache.clear();
-        g_istft_kernel_cache.clear();
-        g_window_sum_cache.clear();
-    }
-    g_cfm_estimator_cache.destroy();
-    {
-        std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
-        g_time_mlp_results.clear();
-        g_time_emb_results.clear();
-    }
-    {
-        std::lock_guard<std::mutex> lk(g_weight_cpu_mirror_mu);
-        g_weight_cpu_mirror.clear();
-    }
-}
+// `cfm_estimator_cache` struct, its global `g_cfm_estimator_cache`,
+// `g_weight_cpu_mirror` + `cached_cpu_weights_f32`, the bit-cast key
+// helpers `g_float_bits` / `g_float_pair_bits`, and the
+// `s3gen_release_synth_caches()` definition all live in the QVAC-18422
+// cache block earlier in this file (so they're in scope for run_encoder
+// and other users above).  See "QVAC-18422 — CPU-side persistent caches".
 
 // Single estimator forward: (x, mu, t_emb, spks, cond) -> dxdt
 // All shapes are numpy (80, T) or (80,) as given, flattened row-major.
@@ -1620,71 +1629,12 @@ static std::vector<float> build_window_sum(int T_stft, int n_fft, int hop,
     return ws;
 }
 
-static ggml_tensor * snake(ggml_context * ctx, ggml_tensor * x,
-                           ggml_tensor * alpha, ggml_tensor * inv_alpha) {
-    ggml_tensor * a  = ggml_reshape_2d(ctx, alpha,     1, alpha->ne[0]);
-    ggml_tensor * ia = ggml_reshape_2d(ctx, inv_alpha, 1, inv_alpha->ne[0]);
-    ggml_tensor * ax = ggml_mul(ctx, x, a);
-    ggml_tensor * s  = ggml_sin(ctx, ax);
-    ggml_tensor * s2 = ggml_mul(ctx, s, s);
-    return ggml_add(ctx, x, ggml_mul(ctx, s2, ia));
-}
-
-static std::vector<float> invert_alpha_cpu(const model_ctx & m, const std::string & name) {
-    ggml_tensor * t = find_tensor(m, name);
-    std::vector<float> a(ggml_nelements(t));
-    ggml_backend_tensor_get(t, a.data(), 0, ggml_nbytes(t));
-    std::vector<float> inv(a.size());
-    for (size_t i = 0; i < a.size(); ++i) inv[i] = 1.0f / (a[i] + 1e-9f);
-    return inv;
-}
-
-// ----------------------------------------------------------------------------
-// QVAC-17872 round 2: scaffolding cache definitions
-// ----------------------------------------------------------------------------
-
-// compute_pos_emb is pure CPU compute (~T * D * 5 trig ops).  It fires
-// twice per encoder run (once for T, once for 2T) — at multilingual
-// chunk size T~350+ that's a noticeable wedge of per-synth host time.
-// Cached by (T, D) (D is constant 512 in the chatterbox model; we still
-// include it in the key for safety against future-variant collisions).
-static const std::vector<float> & cached_pos_emb(int T, int D) {
-    const int64_t key = ((int64_t) T << 32) | (uint32_t) D;
-    {
-        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
-        auto it = g_pos_emb_results.find(key);
-        if (it != g_pos_emb_results.end()) return it->second;
-    }
-    std::vector<float> pe;
-    compute_pos_emb(pe, T, D);
-    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
-    auto [it, inserted] = g_pos_emb_results.try_emplace(key, std::move(pe));
-    return it->second;
-}
-
-// invert_alpha_cpu is fired ~72× per HiFT call (12 ResBlocks × 6 alpha
-// tensors); each call is a tensor_get + per-element reciprocal.  Alpha
-// tensors are constant for the model lifetime, so cache by tensor* —
-// invalidation tied to s3gen_release_synth_caches (model-context lifetime).
-static const std::vector<float> & cached_inv_alpha(const model_ctx & m,
-                                                   const std::string & name) {
-    ggml_tensor * t = find_tensor(m, name);
-    {
-        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
-        auto it = g_inv_alpha_results.find(t);
-        if (it != g_inv_alpha_results.end()) return it->second;
-    }
-    auto inv = invert_alpha_cpu(m, name);
-    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
-    auto [it, inserted] = g_inv_alpha_results.try_emplace(t, std::move(inv));
-    return it->second;
-}
-
-// hann_window / istft_kernel are pure functions of n_fft (constant 16 on
-// the chatterbox HiFT path); window_sum additionally depends on (n_fft,
-// hop, T_stft).  Caching them eliminates the per-synth host-CPU build
-// cost (small for n_fft=16 but the shape-key lookup composes cleanly
-// with the larger HiFT graph cache below).
+// QVAC-18422 round 2: cached HiFT scaffolding helpers.  hann_window and
+// istft_kernel are pure functions of n_fft; window_sum additionally
+// depends on hop and T_stft (T_stft varies with output length, but is
+// stable across same-shape synth calls).  Caching them removes the
+// per-synth host-CPU build cost (~1.85M F32 mults + cos/sin at n_fft =
+// 1920).  NOTE(review): an earlier revision of this comment said n_fft = 16; confirm.
 static const std::vector<float> & cached_hann_window(int n_fft) {
     {
         std::lock_guard<std::mutex> lk(g_synth_caches_mu);
@@ -1711,9 +1661,9 @@ static const std::vector<float> & cached_istft_kernel(int n_fft) {
 }
 
 static const std::vector<float> & cached_window_sum(int T_stft, int n_fft, int hop) {
-    // Pack (n_fft, hop, T_stft) into a single int64 key — n_fft and hop
-    // are constants on the chatterbox path but encoding them makes the
-    // cache safe against future variant additions.
+    // Pack (n_fft, hop, T_stft) into a single int64 key — n_fft and
+    // hop are constants on the chatterbox path but encoding them
+    // makes the cache safe against future variant additions.
     const int64_t key =
         ((int64_t)(uint16_t) n_fft << 48) |
         ((int64_t)(uint16_t) hop   << 32) |
@@ -1729,6 +1679,48 @@ static const std::vector<float> & cached_window_sum(int T_stft, int n_fft, int h
     return it->second;
 }
 
+static ggml_tensor * snake(ggml_context * ctx, ggml_tensor * x,
+                           ggml_tensor * alpha, ggml_tensor * inv_alpha) {
+    ggml_tensor * a  = ggml_reshape_2d(ctx, alpha,     1, alpha->ne[0]);
+    ggml_tensor * ia = ggml_reshape_2d(ctx, inv_alpha, 1, inv_alpha->ne[0]);
+    ggml_tensor * ax = ggml_mul(ctx, x, a);
+    ggml_tensor * s  = ggml_sin(ctx, ax);
+    ggml_tensor * s2 = ggml_mul(ctx, s, s);
+    return ggml_add(ctx, x, ggml_mul(ctx, s2, ia));
+}
+
+static std::vector<float> invert_alpha_cpu(const model_ctx & m, const std::string & name) {
+    ggml_tensor * t = find_tensor(m, name);
+    std::vector<float> a(ggml_nelements(t));
+    ggml_backend_tensor_get(t, a.data(), 0, ggml_nbytes(t));
+    std::vector<float> inv(a.size());
+    for (size_t i = 0; i < a.size(); ++i) inv[i] = 1.0f / (a[i] + 1e-9f);
+    return inv;
+}
+
+// invert_alpha_cpu is fired ~72× per HiFT call (12 ResBlocks × 6 alpha
+// tensors); each call is a tensor_get + per-element reciprocal.  Alpha
+// tensors are constant for the model lifetime, so cache by tensor* —
+// invalidation tied to s3gen_release_synth_caches (model-context lifetime).
+static const std::vector<float> & cached_inv_alpha(const model_ctx & m,
+                                                   const std::string & name) {
+    ggml_tensor * t = find_tensor(m, name);
+    {
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+        auto it = g_inv_alpha_results.find(t);
+        if (it != g_inv_alpha_results.end()) return it->second;
+    }
+    auto inv = invert_alpha_cpu(m, name);
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    auto [it, inserted] = g_inv_alpha_results.try_emplace(t, std::move(inv));
+    return it->second;
+}
+
+// `cached_pos_emb` lives in the QVAC-18422 cache block above (right
+// after `compute_pos_emb`).  `cached_hann_window`, `cached_istft_kernel`,
+// and `cached_window_sum` are defined just above this block (alongside
+// `build_hann_window` / `build_istft_kernel` / `build_window_sum`).
+
 // F0 predictor (mel (80, T) -> f0 (T,))
 //
 // QVAC-17872 round 2: graph + gallocator cached process-wide via
@@ -2825,26 +2817,23 @@ void s3gen_unload() {
 namespace tts_cpp::chatterbox::test_hooks {
 
 size_t time_mlp_result_cache_size() {
-    std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_time_mlp_results.size();
 }
 size_t time_emb_result_cache_size() {
-    std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_time_emb_results.size();
 }
 size_t weight_mirror_cache_size() {
-    std::lock_guard<std::mutex> lk(g_weight_cpu_mirror_mu);
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_weight_cpu_mirror.size();
 }
 bool cfm_estimator_cache_built() {
-    // g_cfm_estimator_cache is mutated only under s3gen_release_synth_caches
-    // (which holds g_synth_caches_mu around the round-2 caches but not this
-    // one) and during the per-synth fast-path inside cfm_estimator_forward.
-    // The single-pointer load below is atomic on x86/ARM; tests treat it
-    // as a snapshot.
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_cfm_estimator_cache.ctx != nullptr;
 }
 bool cfm_estimator_cache_b2() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_cfm_estimator_cache.b2;
 }
 uint32_t float_cache_key(float t_val) {
@@ -2854,10 +2843,63 @@ uint64_t float_pair_cache_key(float t_val, float r_val) {
     return g_float_pair_bits(t_val, r_val);
 }
 std::vector<float> peek_time_mlp_cached(float t_val) {
-    std::lock_guard<std::mutex> lk(g_time_emb_results_mu);
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     auto it = g_time_mlp_results.find(g_float_bits(t_val));
     if (it == g_time_mlp_results.end()) return {};
     return it->second;
 }
 
+// ---- Round 2 hooks --------------------------------------------------------
+
+bool encoder_graph_cache_built() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_encoder_graph_cache.ctx != nullptr;
+}
+int encoder_graph_cache_T() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return (int) g_encoder_graph_cache.key;
+}
+bool hift_graph_cache_built() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_hift_graph_cache.ctx != nullptr;
+}
+int hift_graph_cache_T_mel() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    if (g_hift_graph_cache.key < 0) return -1;
+    return (int) (g_hift_graph_cache.key >> 32);
+}
+int hift_graph_cache_T_stft() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    if (g_hift_graph_cache.key < 0) return -1;
+    return (int) (g_hift_graph_cache.key & 0xffffffffLL);
+}
+bool f0_graph_cache_built() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_f0_graph_cache.ctx != nullptr;
+}
+int f0_graph_cache_T_mel() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return (int) g_f0_graph_cache.key;
+}
+size_t pos_emb_cache_size() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_pos_emb_results.size();
+}
+size_t inv_alpha_cache_size() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_inv_alpha_results.size();
+}
+size_t istft_kernel_cache_size() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_istft_kernel_cache.size();
+}
+size_t hann_window_cache_size() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_hann_window_cache.size();
+}
+size_t window_sum_cache_size() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_window_sum_cache.size();
+}
+
 }  // namespace tts_cpp::chatterbox::test_hooks
diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h
index c9fdb91..c51dade 100644
--- a/src/chatterbox_tts_test_hooks.h
+++ b/src/chatterbox_tts_test_hooks.h
@@ -63,4 +63,35 @@ uint64_t float_pair_cache_key(float t_val, float r_val);
 // t-value was actually warmed without re-entering compute_time_mlp.
 std::vector<float> peek_time_mlp_cached(float t_val);
 
+// ---------- Round 2 (PROGRESS.md §3.33): graph + scaffolding caches ----
+
+// Persistent encoder graph cache.  Built lazily by run_encoder() and
+// rebuilt when a call's input length T differs from the cached key
+// (e.g. streaming chunks).  False before any synth and after s3gen_unload().
+bool encoder_graph_cache_built();
+
+// Cache key (input length T) currently held by the encoder graph
+// cache.  -1 if not built; otherwise the T from the most recent build.
+int  encoder_graph_cache_T();
+
+// Persistent HiFT decoder graph cache.  Built lazily by
+// run_hift_decode() and invalidated when (T_mel, T_stft) diverge.
+bool hift_graph_cache_built();
+int  hift_graph_cache_T_mel();
+int  hift_graph_cache_T_stft();
+
+// Persistent F0 predictor graph cache.  Built lazily by
+// run_f0_predictor(); keyed on T_mel.
+bool f0_graph_cache_built();
+int  f0_graph_cache_T_mel();
+
+// Sizes of the small scaffolding caches.  Each is process-wide; a
+// stable set of n_fft / hop / model parameters means the steady-state
+// size is small (1-2 entries each).
+size_t pos_emb_cache_size();
+size_t inv_alpha_cache_size();
+size_t istft_kernel_cache_size();
+size_t hann_window_cache_size();
+size_t window_sum_cache_size();
+
 }  // namespace tts_cpp::chatterbox::test_hooks
diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp
index 47e0d29..bdf3f74 100644
--- a/src/test_cpu_caches.cpp
+++ b/src/test_cpu_caches.cpp
@@ -162,6 +162,32 @@ void test_initial_state() {
           "synth");
     CHECK(!th::cfm_estimator_cache_b2(),
           "persistent cfm_estimator_cache b2 flag must default false");
+
+    // Round 2: encoder / HiFT / F0 graph caches + scaffolding caches.
+    CHECK(!th::encoder_graph_cache_built(),
+          "persistent encoder graph cache must not be built before any synth");
+    CHECK(th::encoder_graph_cache_T() == -1,
+          "encoder graph cache T must be -1 (sentinel) before any build");
+    CHECK(!th::hift_graph_cache_built(),
+          "persistent HiFT decoder graph cache must not be built before any synth");
+    CHECK(th::hift_graph_cache_T_mel() == -1,
+          "HiFT graph cache T_mel must be -1 before any build");
+    CHECK(th::hift_graph_cache_T_stft() == -1,
+          "HiFT graph cache T_stft must be -1 before any build");
+    CHECK(!th::f0_graph_cache_built(),
+          "persistent F0 predictor graph cache must not be built before any synth");
+    CHECK(th::f0_graph_cache_T_mel() == -1,
+          "F0 graph cache T_mel must be -1 before any build");
+    CHECK(th::pos_emb_cache_size() == 0,
+          "encoder pos_emb result cache must start empty");
+    CHECK(th::inv_alpha_cache_size() == 0,
+          "HiFT inv_alpha result cache must start empty");
+    CHECK(th::istft_kernel_cache_size() == 0,
+          "HiFT istft_kernel cache must start empty");
+    CHECK(th::hann_window_cache_size() == 0,
+          "HiFT hann_window cache must start empty");
+    CHECK(th::window_sum_cache_size() == 0,
+          "HiFT window_sum cache must start empty");
 }
 
 // ---------------- 3. determinism + cache wiring on a real synth ----------
@@ -222,6 +248,18 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
     const size_t n_time_emb_after_a = th::time_emb_result_cache_size();
     const size_t n_weights_after_a  = th::weight_mirror_cache_size();
     const bool   cfm_built_after_a  = th::cfm_estimator_cache_built();
+    const bool   enc_built_after_a  = th::encoder_graph_cache_built();
+    const int    enc_T_after_a      = th::encoder_graph_cache_T();
+    const bool   hift_built_after_a = th::hift_graph_cache_built();
+    const int    hift_Tmel_after_a  = th::hift_graph_cache_T_mel();
+    const int    hift_Tstft_after_a = th::hift_graph_cache_T_stft();
+    const bool   f0_built_after_a   = th::f0_graph_cache_built();
+    const int    f0_Tmel_after_a    = th::f0_graph_cache_T_mel();
+    const size_t n_pos_emb_after_a  = th::pos_emb_cache_size();
+    const size_t n_inv_alpha_after_a = th::inv_alpha_cache_size();
+    const size_t n_istft_after_a    = th::istft_kernel_cache_size();
+    const size_t n_hann_after_a     = th::hann_window_cache_size();
+    const size_t n_wsum_after_a     = th::window_sum_cache_size();
 
     CHECK(cfm_built_after_a,
           "after first synth, persistent cfm_estimator_cache must be built");
@@ -231,11 +269,58 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
     CHECK(n_weights_after_a > 0,
           "after first synth, weight_mirror_cache must have at least one "
           "entry (input_embedding + spk_embed_affine/{w,b})");
+
+    // Round 2 — every per-pipeline graph must be built after the first
+    // synth, with non-sentinel keys.
+    CHECK(enc_built_after_a,
+          "after first synth, persistent encoder graph cache must be built");
+    CHECK(enc_T_after_a > 0,
+          "after first synth, encoder graph cache T must be > 0 (saw %d)",
+          enc_T_after_a);
+    CHECK(hift_built_after_a,
+          "after first synth, persistent HiFT graph cache must be built");
+    CHECK(hift_Tmel_after_a > 0 && hift_Tstft_after_a > 0,
+          "after first synth, HiFT graph cache (T_mel=%d, T_stft=%d) must "
+          "have positive shape keys",
+          hift_Tmel_after_a, hift_Tstft_after_a);
+    CHECK(f0_built_after_a,
+          "after first synth, persistent F0 predictor graph cache must be built");
+    CHECK(f0_Tmel_after_a > 0,
+          "after first synth, F0 graph cache T_mel must be > 0 (saw %d)",
+          f0_Tmel_after_a);
+
+    // Scaffolding caches: pos_emb fires twice per synth (T and 2T), so
+    // ≥ 2 entries.  inv_alpha fires once per HiFT alpha tensor (~72
+    // tensors total).  istft_kernel is keyed by n_fft (one constant
+    // value), so exactly 1 entry; hann_window is only checked for ≥ 1.
+    // window_sum is keyed by T_stft, so exactly 1 entry per synth-shape.
+    CHECK(n_pos_emb_after_a >= 2,
+          "after first synth, pos_emb cache should have ≥ 2 entries (T and 2T) "
+          "but saw %zu", n_pos_emb_after_a);
+    CHECK(n_inv_alpha_after_a > 0,
+          "after first synth, inv_alpha cache must have at least one entry");
+    CHECK(n_istft_after_a == 1,
+          "after first synth, istft_kernel cache must have exactly 1 entry "
+          "(keyed by n_fft); saw %zu", n_istft_after_a);
+    CHECK(n_hann_after_a >= 1,
+          "after first synth, hann_window cache must have ≥ 1 entry; saw %zu",
+          n_hann_after_a);
+    CHECK(n_wsum_after_a == 1,
+          "after first synth, window_sum cache must have exactly 1 entry; "
+          "saw %zu", n_wsum_after_a);
+
     fprintf(stderr,
-            "  synth #1: time_mlp=%zu  time_emb=%zu  weights=%zu  cfm=%s "
-            "(%.1f ms)\n",
+            "  synth #1: time_mlp=%zu time_emb=%zu weights=%zu cfm=%s "
+            "enc=%s(T=%d) hift=%s(T_mel=%d,T_stft=%d) f0=%s(T_mel=%d) "
+            "pos_emb=%zu inv_alpha=%zu istft=%zu hann=%zu wsum=%zu (%.1f ms)\n",
             n_time_mlp_after_a, n_time_emb_after_a, n_weights_after_a,
-            cfm_built_after_a ? "built" : "fresh", t_a);
+            cfm_built_after_a ? "built" : "fresh",
+            enc_built_after_a ? "built" : "fresh", enc_T_after_a,
+            hift_built_after_a ? "built" : "fresh",
+            hift_Tmel_after_a, hift_Tstft_after_a,
+            f0_built_after_a ? "built" : "fresh", f0_Tmel_after_a,
+            n_pos_emb_after_a, n_inv_alpha_after_a,
+            n_istft_after_a, n_hann_after_a, n_wsum_after_a, t_a);
 
     // Second call: every cache must already be warm.  Its size must
     // not grow because the t-schedule and the model weights are
@@ -254,6 +339,37 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
     CHECK(th::cfm_estimator_cache_built(),
           "synth #2 must keep the persistent cfm graph built");
 
+    // Round 2: graph caches must remain built with the same shape
+    // keys, scaffolding caches must not grow.
+    CHECK(th::encoder_graph_cache_built() && th::encoder_graph_cache_T() == enc_T_after_a,
+          "synth #2 must keep the encoder graph built with the same T (was %d, "
+          "now built=%d, T=%d)",
+          enc_T_after_a, th::encoder_graph_cache_built() ? 1 : 0,
+          th::encoder_graph_cache_T());
+    CHECK(th::hift_graph_cache_built() &&
+          th::hift_graph_cache_T_mel()  == hift_Tmel_after_a &&
+          th::hift_graph_cache_T_stft() == hift_Tstft_after_a,
+          "synth #2 must keep the HiFT graph built with the same shape keys "
+          "(was T_mel=%d, T_stft=%d; now built=%d, T_mel=%d, T_stft=%d)",
+          hift_Tmel_after_a, hift_Tstft_after_a,
+          th::hift_graph_cache_built() ? 1 : 0,
+          th::hift_graph_cache_T_mel(), th::hift_graph_cache_T_stft());
+    CHECK(th::f0_graph_cache_built() && th::f0_graph_cache_T_mel() == f0_Tmel_after_a,
+          "synth #2 must keep the F0 graph built with the same T_mel (was %d)",
+          f0_Tmel_after_a);
+    CHECK(th::pos_emb_cache_size()      == n_pos_emb_after_a,
+          "synth #2 must NOT add new pos_emb entries (saw %zu, expected %zu)",
+          th::pos_emb_cache_size(), n_pos_emb_after_a);
+    CHECK(th::inv_alpha_cache_size()    == n_inv_alpha_after_a,
+          "synth #2 must NOT add new inv_alpha entries (saw %zu, expected %zu)",
+          th::inv_alpha_cache_size(), n_inv_alpha_after_a);
+    CHECK(th::istft_kernel_cache_size() == n_istft_after_a,
+          "synth #2 must NOT add new istft_kernel entries");
+    CHECK(th::hann_window_cache_size()  == n_hann_after_a,
+          "synth #2 must NOT add new hann_window entries");
+    CHECK(th::window_sum_cache_size()   == n_wsum_after_a,
+          "synth #2 must NOT add new window_sum entries");
+
     CHECK(wav_a.size() == wav_b.size(),
           "warm-cache synth #2 wav length must match cold-cache synth #1 "
           "(%zu vs %zu)", wav_a.size(), wav_b.size());
@@ -283,6 +399,25 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
           "s3gen_unload must clear weight_mirror cache");
     CHECK(!th::cfm_estimator_cache_built(),
           "s3gen_unload must tear down the persistent cfm cache");
+    // Round 2 caches must also be torn down — gallocators in the
+    // graph caches reference the model's backend and would crash on
+    // backend-free if left dangling.
+    CHECK(!th::encoder_graph_cache_built(),
+          "s3gen_unload must tear down the encoder graph cache");
+    CHECK(!th::hift_graph_cache_built(),
+          "s3gen_unload must tear down the HiFT decoder graph cache");
+    CHECK(!th::f0_graph_cache_built(),
+          "s3gen_unload must tear down the F0 predictor graph cache");
+    CHECK(th::pos_emb_cache_size() == 0,
+          "s3gen_unload must clear pos_emb cache");
+    CHECK(th::inv_alpha_cache_size() == 0,
+          "s3gen_unload must clear inv_alpha cache");
+    CHECK(th::istft_kernel_cache_size() == 0,
+          "s3gen_unload must clear istft_kernel cache");
+    CHECK(th::hann_window_cache_size() == 0,
+          "s3gen_unload must clear hann_window cache");
+    CHECK(th::window_sum_cache_size() == 0,
+          "s3gen_unload must clear window_sum cache");
 
     // Idempotent: a second unload must not crash or produce errors.
     s3gen_unload();
@@ -335,6 +470,105 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
     }
 }
 
+// ---------------- 4. Streaming shape invalidation ---------------------------
+//
+// Streaming mode synthesises chunks of varying length; T is different on
+// every call.  The generic graph_cache rebuilds when its key diverges —
+// this test exercises that branch by submitting two different token
+// counts and checking the encoder / HiFT cache keys move with them
+// while the t-schedule / weight caches remain stable.
+
+void test_streaming_shape_invalidation(const std::string & gguf,
+                                       const std::string & ref_dir) {
+    fprintf(stderr, "=== streaming shape invalidation ===\n");
+
+    s3gen_unload();  // clean slate
+
+    // Chunk #1 — shorter token sequence.
+    std::vector<int32_t> short_tokens = {12, 34, 56, 78, 90, 121, 152, 173};
+    s3gen_synthesize_opts opts1;
+    opts1.s3gen_gguf_path = gguf;
+    opts1.ref_dir         = ref_dir;
+    opts1.out_wav_path    = "";
+    std::vector<float> wav1;
+    opts1.pcm_out         = &wav1;
+    opts1.seed            = 42;
+    opts1.n_threads       = 0;
+    opts1.sr              = 24000;
+    opts1.n_gpu_layers    = 0;
+    opts1.apply_trim_fade = true;
+    opts1.finalize        = true;
+    if (s3gen_synthesize_to_wav(short_tokens, opts1) != 0 || wav1.empty()) {
+        fprintf(stderr, "skip: chunk #1 synth failed\n");
+        return;
+    }
+    const int enc_T_chunk1     = th::encoder_graph_cache_T();
+    const int hift_Tmel_chunk1 = th::hift_graph_cache_T_mel();
+    const int f0_Tmel_chunk1   = th::f0_graph_cache_T_mel();
+
+    // Chunk #2 — longer token sequence (different shape).  Graph caches
+    // must rebuild; t-schedule + weight + n_fft-keyed caches must NOT grow,
+    // while shape-keyed ones (pos_emb, window_sum) may gain entries.
+    std::vector<int32_t> long_tokens;
+    for (int i = 0; i < 32; ++i) long_tokens.push_back(50 + i * 7);
+    s3gen_synthesize_opts opts2 = opts1;
+    std::vector<float> wav2;
+    opts2.pcm_out = &wav2;
+    if (s3gen_synthesize_to_wav(long_tokens, opts2) != 0 || wav2.empty()) {
+        fprintf(stderr, "skip: chunk #2 synth failed\n");
+        return;
+    }
+    const int enc_T_chunk2     = th::encoder_graph_cache_T();
+    const int hift_Tmel_chunk2 = th::hift_graph_cache_T_mel();
+    const int f0_Tmel_chunk2   = th::f0_graph_cache_T_mel();
+
+    CHECK(enc_T_chunk1 != enc_T_chunk2,
+          "encoder graph cache T must change between chunks of different "
+          "lengths (chunk1 T=%d, chunk2 T=%d)",
+          enc_T_chunk1, enc_T_chunk2);
+    CHECK(hift_Tmel_chunk1 != hift_Tmel_chunk2,
+          "HiFT graph cache T_mel must change between chunks (chunk1=%d, "
+          "chunk2=%d)", hift_Tmel_chunk1, hift_Tmel_chunk2);
+    CHECK(f0_Tmel_chunk1 != f0_Tmel_chunk2,
+          "F0 graph cache T_mel must change between chunks (chunk1=%d, "
+          "chunk2=%d)", f0_Tmel_chunk1, f0_Tmel_chunk2);
+    CHECK(th::encoder_graph_cache_built(),
+          "encoder graph cache must remain built after shape change "
+          "(rebuilt for new T)");
+    CHECK(th::hift_graph_cache_built(),
+          "HiFT graph cache must remain built after shape change");
+    CHECK(th::f0_graph_cache_built(),
+          "F0 graph cache must remain built after shape change");
+    fprintf(stderr,
+            "  chunk #1: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n"
+            "  chunk #2: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n",
+            enc_T_chunk1, hift_Tmel_chunk1, f0_Tmel_chunk1, wav1.size(),
+            enc_T_chunk2, hift_Tmel_chunk2, f0_Tmel_chunk2, wav2.size());
+
+    // pos_emb cache might add up to 2 new entries (T2 and 2*T2 for the
+    // longer chunk).  The previous chunk's entries persist (we don't
+    // evict on shape change).
+    CHECK(th::pos_emb_cache_size() >= 2,
+          "pos_emb cache must contain ≥ 2 entries across two chunks of "
+          "different lengths (got %zu)", th::pos_emb_cache_size());
+
+    // Window-sum cache: 1 entry per distinct T_stft.  Two chunks of
+    // different lengths should yield two distinct T_stft values (so 2
+    // entries); the check below only enforces the weaker ≥ 1 bound.
+    CHECK(th::window_sum_cache_size() >= 1,
+          "window_sum cache must contain ≥ 1 entry after multi-shape "
+          "synthesis (got %zu)", th::window_sum_cache_size());
+
+    // hann_window + istft_kernel are keyed by n_fft (single value
+    // shared across all chunks) — sizes must NOT grow with chunk count.
+    CHECK(th::hann_window_cache_size() <= 2,
+          "hann_window cache size must stay small across chunks (got %zu); "
+          "if this grows with chunk count the key is wrong", th::hann_window_cache_size());
+    CHECK(th::istft_kernel_cache_size() == 1,
+          "istft_kernel cache must stay at 1 entry (n_fft is constant); "
+          "got %zu", th::istft_kernel_cache_size());
+}
+
 }  // namespace
 
 int main(int argc, char ** argv) {
@@ -355,6 +589,7 @@ int main(int argc, char ** argv) {
             return 2;
         }
         test_warm_cache_bit_exact_and_lifecycle(gguf, ref_dir);
+        test_streaming_shape_invalidation(gguf, ref_dir);
     }
 
     // Always release at exit so the next test invocation starts clean.

From cd80f08c51e2563742ebce07133eec143bc297c9 Mon Sep 17 00:00:00 2001
From: Zbigniew Herman <zbigniew.herman@tether.io>
Date: Tue, 5 May 2026 16:05:10 +0200
Subject: [PATCH 3/6] =?UTF-8?q?QVAC-18422=20[TTS=20GGML]=20Optimize=20cpp?=
 =?UTF-8?q?=20backend=20multilingual=20for=20CPU=20(round=203)=20PROGRESS.?=
 =?UTF-8?q?md=20=C2=A73.34=20=E2=80=94=20multilingual=20verification=20(Tu?=
 =?UTF-8?q?rbo=2080/80,=20multilingual=2099/99=20checks=20pass;=20bit-exac?=
 =?UTF-8?q?t=20synth-twice=20on=20the=20converted-from-source=20MTL=20Q4?=
 =?UTF-8?q?=5F0=20GGUF)=20+=2019=20new=20multilingual-specific=20test=20as?=
 =?UTF-8?q?sertions=20(cosine=20schedule=20produces=20exactly=2010=20disti?=
 =?UTF-8?q?nct=20g=5Ftime=5Fmlp=5Fresults=20entries)=20+=20fused=20CFG-com?=
 =?UTF-8?q?bine=20+=20Euler=20step=20in=20the=20non-meanflow=20CFG=20path?=
 =?UTF-8?q?=20of=20synthesize().=20=20Sub-noise=20wall-time=20saving=20on?=
 =?UTF-8?q?=20a=20single=20multilingual=20synth=20(~8=20s);=20biggest=20re?=
 =?UTF-8?q?maining=20host-side=20win=20is=20T3=20step-graph=20caching,=20d?=
 =?UTF-8?q?ocumented=20as=20deferred=20follow-up.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/chatterbox_tts.cpp  | 42 +++++++++++++++++++++++++++++++++--------
 src/test_cpu_caches.cpp | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp
index afd87e4..9f26fb2 100644
--- a/src/chatterbox_tts.cpp
+++ b/src/chatterbox_tts.cpp
@@ -2594,8 +2594,12 @@ int s3gen_synthesize_to_wav(
 
         double step_t0 = now_ms();
         std::vector<float> dxdt_cond;
+        std::vector<float> dxdt_uncond;
+        // True when this step needs the CFG combine — both flavours of
+        // CFG path (B=2 batched and B=1 two-call) populate dxdt_uncond
+        // and require the linear `(1+cfg)*cond - cfg*uncond` mix.
+        bool have_cfg_uncond = false;
         if (use_b2) {
-            std::vector<float> dxdt_uncond;
             cfm_estimator_forward_b2(m, cfm_cache,
                 z, z,
                 mu, zero_mu,
@@ -2603,9 +2607,7 @@ int s3gen_synthesize_to_wav(
                 spks, zero_spks,
                 cond, zero_cond,
                 dxdt_cond, dxdt_uncond, T_mu, opts.cfm_f16_kv_attn);
-            for (size_t i = 0; i < dxdt_cond.size(); ++i) {
-                dxdt_cond[i] = (1.0f + cfg_rate) * dxdt_cond[i] - cfg_rate * dxdt_uncond[i];
-            }
+            have_cfg_uncond = true;
         } else if (!meanflow && cfg_rate != 0.0f) {
             // Non-Metal CFG path (CPU + any backend where use_b2 is false).
             // Run the conditional and unconditional passes back-to-back on
@@ -2616,12 +2618,21 @@ int s3gen_synthesize_to_wav(
             // previously the else clause computed only the conditional pass
             // and dropped CFG entirely on every non-Metal backend.
             dxdt_cond = cfm_estimator_forward(m, cfm_cache, z, mu, t_emb, spks, cond, T_mu, opts.cfm_f16_kv_attn);
-            auto dxdt_uncond = cfm_estimator_forward(m, cfm_cache, z, zero_mu, t_emb, zero_spks, zero_cond, T_mu, opts.cfm_f16_kv_attn);
+            dxdt_uncond = cfm_estimator_forward(m, cfm_cache, z, zero_mu, t_emb, zero_spks, zero_cond, T_mu, opts.cfm_f16_kv_attn);
+            have_cfg_uncond = true;
+        } else {
+            dxdt_cond = cfm_estimator_forward(m, cfm_cache, z, mu, t_emb, spks, cond, T_mu, opts.cfm_f16_kv_attn);
+        }
+
+        // Debug + dump hooks read the post-CFG-combine dxdt; precompute it
+        // when the caller actually asks for it, otherwise fold the combine
+        // into the Euler step below to save a pass over the array.
+        const bool need_full_dxdt = (debug_mode && meanflow) ||
+                                    (s == 0 && !opts.dump_mel_path.empty());
+        if (have_cfg_uncond && need_full_dxdt) {
             for (size_t i = 0; i < dxdt_cond.size(); ++i) {
                 dxdt_cond[i] = (1.0f + cfg_rate) * dxdt_cond[i] - cfg_rate * dxdt_uncond[i];
             }
-        } else {
-            dxdt_cond = cfm_estimator_forward(m, cfm_cache, z, mu, t_emb, spks, cond, T_mu, opts.cfm_f16_kv_attn);
         }
         auto & dxdt = dxdt_cond;
         vlog("  [cfm_step%zu] %.1f ms\n", s, now_ms() - step_t0);
@@ -2644,7 +2655,22 @@ int s3gen_synthesize_to_wav(
                     MEL, T_mu, base.c_str());
         }
 
-        for (size_t i = 0; i < z.size(); ++i) z[i] = z[i] + dt * dxdt[i];
+        // Fused CFG-combine + Euler step (QVAC-18422 round 3).  Saves one
+        // pass over `dxdt` per step.  When the combine was already written
+        // back into `dxdt_cond` above for the debug/dump code-paths
+        // (`need_full_dxdt && have_cfg_uncond`), fall back to the plain
+        // `z + dt * dxdt_cond` form so the math stays bit-exact across
+        // both branches.
+        if (have_cfg_uncond && !need_full_dxdt) {
+            const float c1 = (1.0f + cfg_rate);
+            const float c0 = -cfg_rate;
+            for (size_t i = 0; i < z.size(); ++i) {
+                const float d = c1 * dxdt_cond[i] + c0 * dxdt_uncond[i];
+                z[i] = z[i] + dt * d;
+            }
+        } else {
+            for (size_t i = 0; i < z.size(); ++i) z[i] = z[i] + dt * dxdt[i];
+        }
     }
     vlog("  [cfm_total] %.1f ms\n", now_ms() - cfm_t0);
 
diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp
index bdf3f74..0e01e97 100644
--- a/src/test_cpu_caches.cpp
+++ b/src/test_cpu_caches.cpp
@@ -468,6 +468,39 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
         CHECK(b.size() == 1024,
               "time_mlp cached entry must be (1024,) — saw %zu", b.size());
     }
+
+    // Variant-specific schedule shape — derived from the time_mlp cache
+    // size after a synth populates it.  Multilingual = 10 cosine-spaced
+    // t-values + 0 time_emb pairs (non-meanflow); Turbo = ≤3 t-values
+    // + 2 (t,r) time_emb pairs (meanflow).
+    if (n_time_mlp_after_a == 10 && n_time_emb_after_a == 0) {
+        // Multilingual cosine schedule: every entry must round-trip,
+        // every cosine_t(i, 10) for i in 0..9 must be present.
+        fprintf(stderr, "  detected multilingual variant (cosine n_timesteps=10)\n");
+        for (int i = 0; i < 10; ++i) {
+            float t_cos = cosine_t(i, 10);
+            auto v = th::peek_time_mlp_cached(t_cos);
+            CHECK(!v.empty(),
+                  "multilingual cosine t_span entry %d (t=%.6f) must be cached "
+                  "after first synth", i, t_cos);
+            if (!v.empty()) {
+                CHECK(v.size() == 1024,
+                      "multilingual cached t_emb entry %d size must be 1024 — "
+                      "saw %zu", i, v.size());
+            }
+        }
+    } else if (n_time_mlp_after_a <= 3 && n_time_emb_after_a == 2) {
+        fprintf(stderr, "  detected Turbo variant (meanflow t_span ⊆ {0,0.5,1})\n");
+        // Turbo's meanflow loop visits the pairs (0, 0.5) and (0.5, 1).
+        auto v05 = th::peek_time_mlp_cached(0.5f);
+        CHECK(!v05.empty(),
+              "Turbo: t_val=0.5 must be in time_mlp cache after first synth");
+    } else {
+        fprintf(stderr,
+                "  unrecognised variant: time_mlp=%zu time_emb=%zu — neither "
+                "the multilingual (10/0) nor Turbo (≤3/2) shape\n",
+                n_time_mlp_after_a, n_time_emb_after_a);
+    }
 }
 
 // ---------------- 4. Streaming shape invalidation ---------------------------

From ce7dc15fbbe99d701a312a52e9c796306596db04 Mon Sep 17 00:00:00 2001
From: Zbigniew Herman <zbigniew.herman@tether.io>
Date: Tue, 5 May 2026 18:12:23 +0200
Subject: [PATCH 4/6] QVAC-18422 [TTS GGML] Optimize cpp backend multilingual
 for CPU (round 4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PROGRESS.md §3.35 — T3 step-graph cache (multilingual CFG token
decode) opt-in via CHATTERBOX_T3_STEP_CACHE.  Per-(n_past,
is_uncond) std::list-LRU cache (cap 256) for build_step_graph_mtl;
saves ~3 ms per cache hit.  Single-utterance default-OFF (no
hits-to-amortise on synth #1) keeps the existing path
regression-free; server-mode opt-in shows ~15 % per-pass speedup
(~256 ms / synth #2 of multilingual at 136 tokens).  Tests:
src/test_t3_caches.cpp NEW with 99 checks (lifecycle + bit-exact
cold/warm logits + multi-synth amortisation timing).  Lifecycle
wired into free_t3 (CLI, both paths), Impl::free_model (Engine),
and an atexit fallback — all firing BEFORE ggml_backend_free.
Total cache test suite green: 80 + 99 + 6 + 99 = 284 / 284.
---
 CMakeLists.txt                  |   7 +
 src/chatterbox_cli.cpp          |   9 +
 src/chatterbox_engine.cpp       |   5 +
 src/chatterbox_t3_internal.h    |  10 +
 src/chatterbox_tts_test_hooks.h |  48 ++++
 src/t3_mtl.cpp                  | 348 +++++++++++++++++++++++-
 src/test_t3_caches.cpp          | 452 ++++++++++++++++++++++++++++++++
 7 files changed, 869 insertions(+), 10 deletions(-)
 create mode 100644 src/test_t3_caches.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c01ff7..449173f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,6 +216,13 @@ if (TTS_CPP_BUILD_TESTS)
     target_link_libraries(test-cpu-caches PRIVATE ggml)
     target_include_directories(test-cpu-caches PRIVATE ggml/include src include)
 
+    # T3 step-graph cache validation (QVAC-18422 round 4).  Links
+    # against the full tts-cpp library so it gets t3_mtl.cpp's
+    # cached eval_step_mtl alongside the test-hook entrypoints.
+    add_executable(test-t3-caches src/test_t3_caches.cpp)
+    target_link_libraries(test-t3-caches PRIVATE tts-cpp ggml)
+    target_include_directories(test-t3-caches PRIVATE ggml/include src include)
+
     add_executable(test-metal-ops src/test_metal_ops.cpp)
     target_link_libraries(test-metal-ops PRIVATE ggml)
     target_include_directories(test-metal-ops PRIVATE ggml/include src)
diff --git a/src/chatterbox_cli.cpp b/src/chatterbox_cli.cpp
index 072d17b..741e940 100644
--- a/src/chatterbox_cli.cpp
+++ b/src/chatterbox_cli.cpp
@@ -1183,6 +1183,12 @@ int tts_cpp_cli_main(int argc, char ** argv) {
                     tts_cpp::chatterbox::detail::t3_stack_unregister(
                         model.buffer_stack, model.ctx_stack);
                 }
+                // QVAC-18422 round 4: drop the T3 step-graph cache
+                // BEFORE freeing the backend.  The cache holds
+                // gallocators that carry backend references; freeing
+                // them against a dead backend would assert inside the
+                // ggml-metal / ggml-vulkan / ggml-cuda dylib finalisers.
+                tts_cpp::chatterbox::detail::t3_release_caches();
                 ggml_backend_buffer_free(model.buffer_w);
                 ggml_backend_buffer_free(model.buffer_kv);
                 if (model.buffer_stack)    ggml_backend_buffer_free(model.buffer_stack);
@@ -2332,6 +2338,9 @@ int tts_cpp_cli_main(int argc, char ** argv) {
                 (long long)t3_total_ms, t3_tokens_total);
 
         ggml_gallocr_free(allocr);
+        // QVAC-18422 round 4: drop T3 step-graph cache BEFORE freeing
+        // the model (cached graphs are built against its tensors).
+        tts_cpp::chatterbox::detail::t3_release_caches();
         ggml_backend_buffer_free(model.buffer_w);
         ggml_backend_buffer_free(model.buffer_kv);
         if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override);
diff --git a/src/chatterbox_engine.cpp b/src/chatterbox_engine.cpp
index b361766..edea0e7 100644
--- a/src/chatterbox_engine.cpp
+++ b/src/chatterbox_engine.cpp
@@ -165,6 +165,11 @@ struct Engine::Impl {
         if (model.buffer_stack || model.ctx_stack) {
             t3_stack_unregister(model.buffer_stack, model.ctx_stack);
         }
+        // QVAC-18422 round 4: drop the T3 step-graph cache BEFORE
+        // freeing the model buffers / backend.  Cached graphs are
+        // built against this model's tensors (KV views), so none may
+        // outlive it; entries hold no gallocator of their own.
+        tts_cpp::chatterbox::detail::t3_release_caches();
         if (model.buffer_w)        { ggml_backend_buffer_free(model.buffer_w);        model.buffer_w        = nullptr; }
         if (model.buffer_kv)       { ggml_backend_buffer_free(model.buffer_kv);       model.buffer_kv       = nullptr; }
         if (model.buffer_stack)    { ggml_backend_buffer_free(model.buffer_stack);    model.buffer_stack    = nullptr; }
diff --git a/src/chatterbox_t3_internal.h b/src/chatterbox_t3_internal.h
index ab68cd2..3d3b919 100644
--- a/src/chatterbox_t3_internal.h
+++ b/src/chatterbox_t3_internal.h
@@ -347,6 +347,16 @@ bool eval_step_mtl(
     std::vector<float> &     logits_cond_out,
     std::vector<float> &     logits_uncond_out);
 
+// Release every persistent T3-side cache held by t3_mtl.cpp
+// (currently the round-4 step-graph cache).  Idempotent.
+//
+// Production callers (CLI free_t3 lambda, Engine::Impl::free_model)
+// MUST call this BEFORE freeing the model's buffers / backend:
+// cached step graphs are built against that model's tensors (KV
+// views) and are keyed only by (n_past, is_uncond), so a stale
+// entry would otherwise be handed to whichever model runs next.
+void t3_release_caches();
+
 // On a degenerate logits distribution (everything -inf after the sampling
 // cascade), returns `stop_token` so the caller's stop check fires cleanly
 // instead of emitting a pseudo-random in-vocab id.  Pass
diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h
index c51dade..9920595 100644
--- a/src/chatterbox_tts_test_hooks.h
+++ b/src/chatterbox_tts_test_hooks.h
@@ -94,4 +94,52 @@ size_t istft_kernel_cache_size();
 size_t hann_window_cache_size();
 size_t window_sum_cache_size();
 
+// ---------- Round 4 (PROGRESS.md §3.35): T3 step-graph cache ---------
+//
+// MTL-only.  Caches the per-(n_past, is_uncond) graph that
+// `build_step_graph_mtl` constructs from scratch on every token
+// decode call.  Multilingual fires this 2× per token (CFG cond +
+// uncond), so a 136-token Spanish utterance previously rebuilt 272
+// graphs at ~3 ms each ≈ 800 ms / synth of pure host-CPU graph
+// construction work.
+//
+// The cache is OPT-IN at runtime via the env var
+// `CHATTERBOX_T3_STEP_CACHE` (default 0).  Enabling it on a single-
+// utterance workload pays the bookkeeping cost (~10 % T3
+// regression) without any compensating hit benefit because each
+// step has a unique n_past — the cache only pays off on synth #2+
+// in long-running processes (server mode), where the second synth
+// re-decodes from n_past=0 and hits every cached entry.  Tests set
+// the env var explicitly.
+
+// Number of cached step graphs currently held; 0 before any
+// eval_step_mtl call, 0 after t3_release_caches().  Bounded by the
+// LRU cap (`t3_step_graph_cache_capacity()`).
+size_t t3_step_graph_cache_size();
+
+// Cache capacity (LRU bound).  Covers e.g. 128 tokens × 2 modes
+// out-of-the-box.  If a synth exceeds this, inserting a new key
+// evicts the least-recently-used entry (the earliest n_past of a
+// sequential decode), so the working set should fit the capacity.
+size_t t3_step_graph_cache_capacity();
+
+// True iff the (n_past, is_uncond) entry is currently in the cache.
+// Used by tests to verify the LRU eviction rule and to spot-check
+// hits without racing on logits comparison.
+bool t3_step_graph_cache_contains(int n_past, bool is_uncond);
+
+// Number of cache hits / cache misses since the last
+// t3_release_caches().  Tests use these to confirm that re-running
+// a step pass with the same shape key actually re-uses the cached
+// graph instead of rebuilding it.
+size_t t3_step_graph_cache_hits();
+size_t t3_step_graph_cache_misses();
+
+// Explicit teardown.  Idempotent; entries own only host memory, so
+// the call itself does not touch the backend.  Production callers
+// (CLI, Engine) call this from their model-free path BEFORE the
+// model's buffers / backend are freed so that no cached graph built
+// against that model's tensors survives it.
+void t3_release_caches();
+
 }  // namespace tts_cpp::chatterbox::test_hooks
diff --git a/src/t3_mtl.cpp b/src/t3_mtl.cpp
index 0fc730e..3681c52 100644
--- a/src/t3_mtl.cpp
+++ b/src/t3_mtl.cpp
@@ -36,9 +36,11 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <list>
 #include <mutex>
 #include <stdexcept>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace tts_cpp::chatterbox::detail {
@@ -104,6 +106,264 @@ void t3_stack_unregister(ggml_backend_buffer_t buf, ggml_context * ctx) {
     }
 }
 
+// Forward declaration for the step-graph builder used by the round-4
+// cache below.  Body lives in the second anonymous namespace further
+// down (alongside the legacy build_step_graph_mtl wrapper).
+namespace {
+ggml_cgraph * build_step_graph_mtl_in_ctx(const chatterbox_model & model,
+                                          ggml_context * ctx,
+                                          int n_past,
+                                          bool is_uncond);
+}
+
+// ============================================================================
+// QVAC-18422 round 4 — T3 step-graph cache (multilingual CFG token decode)
+// ============================================================================
+//
+// `build_step_graph_mtl(n_past, is_uncond)` constructs a 30-layer Llama-block
+// graph from scratch on every token decode call.  Multilingual CFG fires
+// this 2× per token (cond + uncond on CPU); a 136-token Spanish synth
+// previously rebuilt 272 graphs at ~3 ms each — roughly 800 ms / synth of
+// pure host-CPU graph construction work.
+//
+// The cache stores per-(n_past, is_uncond) entries, each owning a
+// ggml_context and its metadata buf (no per-entry gallocator; the
+// caller's allocator is shared).  ggml_view's offset is a graph-build-time
+// constant in `build_llama_block` (KV write/read offsets scale with
+// `n_past`), so each n_past needs its own graph — no shape-independent path.
+//
+// Memory cap: a hard LRU bound of `T3_STEP_CACHE_CAP` entries (default
+// 256, covering 128 tokens × 2 modes).  When the cap is hit, a new
+// (n_past, is_uncond) key evicts the least-recently-used entry; the legacy
+// thread_local-buf path only runs when caching is off or an insert fails.
+// Tested: cache invariants stay correct under cap pressure; bit-exact preserved.
+//
+// Lifecycle: cleared by detail::t3_release_caches() — called from the
+// CLI's free_t3 lambda + Engine::Impl::free_model BEFORE the model
+// buffers and backend are freed (cached graphs are built against that
+// model's tensors and must not outlive them).  Plus a fallback atexit
+// hook for the unlikely case where neither path runs.
+
+namespace {
+
+// Cache entry holds just the graph metadata — NOT a per-entry
+// gallocator.  The caller's existing shared allocator (passed into
+// run_step_pass) is used for both cached and legacy-fallback graphs;
+// alloc_graph re-lays-out per call but reuses one backend buffer
+// across every (n_past, is_uncond) variant.  This is what keeps the
+// single-utterance regression at zero — per-entry gallocator would
+// allocate ~1 MB device memory PER cached graph (272 misses × 1 MB =
+// ~270 MB allocator churn on the first multilingual synth, observed
+// as ~10 % T3 wall-time regression).  Share the allocator instead.
+struct t3_step_cache_entry {
+    int64_t                key = -1;        // pack_step_key(n_past, is_uncond); -1 = empty
+    ggml_context *         ctx = nullptr;   // owned; released with ggml_free
+    ggml_cgraph *          gf  = nullptr;   // not freed separately; reset alongside ctx
+    std::vector<uint8_t>   buf;             // memory handed to ggml_init for ctx
+
+    t3_step_cache_entry() = default;
+    ~t3_step_cache_entry() { destroy(); }
+    t3_step_cache_entry(const t3_step_cache_entry &)             = delete;
+    t3_step_cache_entry & operator=(const t3_step_cache_entry &) = delete;
+    t3_step_cache_entry(t3_step_cache_entry && rhs) noexcept
+        : key(rhs.key), ctx(rhs.ctx), gf(rhs.gf), buf(std::move(rhs.buf)) {
+        rhs.key = -1;
+        rhs.ctx = nullptr;
+        rhs.gf  = nullptr;
+    }
+
+    t3_step_cache_entry & operator=(t3_step_cache_entry && rhs) noexcept {
+        if (this == &rhs) return *this;
+        destroy();
+        key = rhs.key;  rhs.key = -1;
+        ctx = rhs.ctx;  rhs.ctx = nullptr;
+        gf  = rhs.gf;   rhs.gf  = nullptr;
+        buf = std::move(rhs.buf);
+        return *this;
+    }
+
+    // Free the ggml context (if any) and reset key/gf; buf is left as-is.
+    void destroy() {
+        if (ctx != nullptr) {
+            ggml_free(ctx);
+            ctx = nullptr;
+        }
+        gf  = nullptr;
+        key = -1;
+    }
+};
+
+constexpr size_t T3_STEP_CACHE_CAP = 256;
+
+// Caching is opt-in to avoid a small (~10 %) T3 regression on
+// single-utterance workloads where every step call is a cache miss.
+// In a single multilingual synth, n_past goes 0, 1, 2, ..., N-1 once
+// each, so the cache fills up but nothing is re-used — every miss
+// pays the bookkeeping cost (vector::resize, list insert, mutex
+// acquire) without any compensating hit savings.
+//
+// Server-mode and other multi-synth callers — where synth #2 starts
+// at n_past=0 again and re-decodes the same prompt prefix as
+// synth #1 — get a real win (~3 ms × hits per call ≈ 1 s / synth
+// on multilingual), so the env var unlocks caching for those
+// workloads:
+//
+//   CHATTERBOX_T3_STEP_CACHE=1 ./tts-cli ...
+//
+// Reads once at first use, cached as a static const bool.  Tests
+// set the env var via `setenv()` before any eval_step_mtl call.
+bool t3_step_cache_enabled() {
+    static const bool enabled = []() -> bool {
+        const char * env   = std::getenv("CHATTERBOX_T3_STEP_CACHE");
+        const char   first = env ? env[0] : '\0';   // unset/empty => off
+        return first == '1' || first == 't' || first == 'T' ||
+               first == 'y' || first == 'Y';
+    }();
+    return enabled;
+}
+
+// Mutex guards all cache state below: held for lookups and for the build
+// on a miss, never across backend compute (returned entries are unpinned).
+std::mutex                                                              t3_step_cache_mu;
+std::list<t3_step_cache_entry>                                          t3_step_cache_lru;     // front = most recent
+std::unordered_map<int64_t, std::list<t3_step_cache_entry>::iterator>   t3_step_cache_idx;
+size_t                                                                  t3_step_cache_hits     = 0;
+size_t                                                                  t3_step_cache_misses   = 0;
+bool                                                                    t3_step_cache_atexit_registered = false;
+
+inline int64_t pack_step_key(int n_past, bool is_uncond) {
+    return (static_cast<int64_t>(n_past) << 1) | static_cast<int64_t>(is_uncond);
+}
+
+void t3_step_cache_release_locked() {
+    // Precondition: the caller already holds t3_step_cache_mu.
+    t3_step_cache_hits   = 0;
+    t3_step_cache_misses = 0;
+    t3_step_cache_idx.clear();   // drop the stored iterators first
+    t3_step_cache_lru.clear();   // entry destructors ggml_free each ctx
+}
+
+void t3_step_cache_release_atexit() {
+    const std::lock_guard<std::mutex> guard(t3_step_cache_mu);   // atexit fallback
+    t3_step_cache_release_locked();
+}
+
+// Probe the cache for (n_past, is_uncond).  A hit is promoted to the
+// front of the LRU list and counted; a miss is counted and yields
+// nullptr.  Takes the mutex itself — callers must NOT already hold it.
+t3_step_cache_entry * t3_step_cache_lookup(int n_past, bool is_uncond) {
+    const int64_t wanted = pack_step_key(n_past, is_uncond);
+    std::lock_guard<std::mutex> guard(t3_step_cache_mu);
+    const auto found = t3_step_cache_idx.find(wanted);
+    if (found != t3_step_cache_idx.end()) {
+        // Same-list splice relinks the node; the indexed iterator stays valid.
+        const auto node = found->second;
+        t3_step_cache_lru.splice(t3_step_cache_lru.begin(), t3_step_cache_lru, node);
+        ++t3_step_cache_hits;
+        return &*node;
+    }
+    ++t3_step_cache_misses;
+    return nullptr;
+}
+
+// Return the cached entry for (n_past, is_uncond), building it and
+// inserting it at the front (most recent) on a miss.  Returns nullptr
+// when ggml_init or the graph build fails; the cache is then untouched.
+//
+// Caller must NOT hold the mutex; it is taken here and held across the
+// (~3 ms) build, so threads racing on the same key are serialised and
+// only one of them builds.
+//
+// NOTE(review): the key carries no model identity, and the returned
+// pointer is not pinned once the lock drops (a later insert may evict
+// it; t3_release_caches() frees it) — callers must not overlap those.
+t3_step_cache_entry * t3_step_cache_insert_or_get(const chatterbox_model & model,
+                                                  int n_past, bool is_uncond) {
+    const int64_t key = pack_step_key(n_past, is_uncond);
+    std::lock_guard<std::mutex> lk(t3_step_cache_mu);
+
+    // Re-check under the lock — another thread may have inserted meanwhile.
+    auto existing = t3_step_cache_idx.find(key);
+    if (existing != t3_step_cache_idx.end()) {
+        t3_step_cache_lru.splice(t3_step_cache_lru.begin(),
+                                 t3_step_cache_lru, existing->second);
+        ++t3_step_cache_hits;
+        return &(*existing->second);
+    }
+
+    // Stage the new entry in a private list: every failure path below
+    // (including an exception from buf.resize) just drops `staged`, so a
+    // failed build can neither evict a good entry nor leave a half-built,
+    // unindexed one in the LRU list.
+    std::list<t3_step_cache_entry> staged;
+    staged.emplace_front();
+    t3_step_cache_entry & e = staged.front();
+
+    const size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES +
+                            ggml_graph_overhead_custom(CHBX_MAX_NODES, false);
+    e.buf.resize(buf_size);
+    e.key = key;
+
+    ggml_init_params p = { buf_size, e.buf.data(), /*no_alloc=*/true };
+    e.ctx = ggml_init(p);
+    if (!e.ctx) return nullptr;
+
+    e.gf = build_step_graph_mtl_in_ctx(model, e.ctx, n_past, is_uncond);
+    if (!e.gf) return nullptr;
+
+    // Index first (the only step here that can throw), then evict the LRU
+    // tail if at capacity, then splice; list iterators survive the splice.
+    t3_step_cache_idx[key] = staged.begin();
+    if (t3_step_cache_lru.size() >= T3_STEP_CACHE_CAP) {
+        t3_step_cache_idx.erase(t3_step_cache_lru.back().key);
+        t3_step_cache_lru.pop_back();   // dtor frees the evicted entry's ctx
+    }
+    t3_step_cache_lru.splice(t3_step_cache_lru.begin(), staged);
+
+    if (!t3_step_cache_atexit_registered) {
+        std::atexit(t3_step_cache_release_atexit);
+        t3_step_cache_atexit_registered = true;
+    }
+
+    return &t3_step_cache_lru.front();
+}
+
+}  // namespace
+
+// Public release entry point: empties the step-graph cache and zeroes
+// its hit/miss counters under the cache mutex.  Idempotent.  Called by
+// the CLI free_t3 lambda and Engine::Impl::free_model on model teardown.
+void t3_release_caches() {
+    const std::lock_guard<std::mutex> guard(t3_step_cache_mu);
+    t3_step_cache_release_locked();
+}
+
+// detail-scope bridges so the test_hooks namespace (defined further
+// down, outside detail::) can reach the round-4 cache state without
+// each individual symbol leaking into the public surface.  These
+// helpers are NOT for production callers; the only consumers are
+// test_hooks::t3_* in the same TU.
+size_t _t3_step_cache_size_for_tests() {
+    const std::lock_guard<std::mutex> guard(t3_step_cache_mu);   // sample under lock
+    return t3_step_cache_lru.size();
+}
+size_t _t3_step_cache_capacity_for_tests() {
+    return static_cast<size_t>(T3_STEP_CACHE_CAP);   // compile-time LRU bound
+}
+bool _t3_step_cache_contains_for_tests(int n_past, bool is_uncond) {
+    const int64_t wanted = pack_step_key(n_past, is_uncond);
+    const std::lock_guard<std::mutex> guard(t3_step_cache_mu);
+    return t3_step_cache_idx.find(wanted) != t3_step_cache_idx.end();
+}
+size_t _t3_step_cache_hits_for_tests() {
+    const std::lock_guard<std::mutex> guard(t3_step_cache_mu);   // read under lock
+    return t3_step_cache_hits;
+}
+size_t _t3_step_cache_misses_for_tests() {
+    const std::lock_guard<std::mutex> guard(t3_step_cache_mu);   // read under lock
+    return t3_step_cache_misses;
+}
+
 namespace {
 
 int64_t require_key(const gguf_context * ctx, const char * key) {
@@ -750,16 +1010,14 @@ ggml_cgraph * build_step_graph_mtl_b2(const chatterbox_model & model,
     return gf;
 }
 
-ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model,
-                                   int n_past,
-                                   bool is_uncond) {
+// Body of the step graph build, parameterised on a caller-provided
+// ggml_context.  Lets the (round-4) step-graph cache hold the ctx
+// alive across calls without sharing the legacy thread_local buf.
+ggml_cgraph * build_step_graph_mtl_in_ctx(const chatterbox_model & model,
+                                          ggml_context * ctx,
+                                          int n_past,
+                                          bool is_uncond) {
     const auto & hp = model.hparams;
-
-    static size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES +
-                             ggml_graph_overhead_custom(CHBX_MAX_NODES, false);
-    thread_local std::vector<uint8_t> buf(buf_size);
-    ggml_init_params p = { buf_size, buf.data(), true };
-    ggml_context * ctx = ggml_init(p);
     ggml_cgraph * gf = ggml_new_graph_custom(ctx, CHBX_MAX_NODES, false);
 
     ggml_tensor * speech_token = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
@@ -791,6 +1049,22 @@ ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model,
     ggml_set_name(logits, "logits"); ggml_set_output(logits);
     ggml_build_forward_expand(gf, logits);
 
+    return gf;
+}
+
+// Legacy non-cached entry point (used when the step-graph cache is
+// disabled or a cache insert fails).  Frees the per-call ctx — gf
+// remains valid because the bytes live in the thread_local buf
+// until the next call to ggml_init reuses the buf.
+ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model,
+                                   int n_past,
+                                   bool is_uncond) {
+    static size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES +
+                             ggml_graph_overhead_custom(CHBX_MAX_NODES, false);
+    thread_local std::vector<uint8_t> buf(buf_size);
+    ggml_init_params p = { buf_size, buf.data(), true };
+    ggml_context * ctx = ggml_init(p);
+    ggml_cgraph * gf = build_step_graph_mtl_in_ctx(model, ctx, n_past, is_uncond);
     ggml_free(ctx);
     return gf;
 }
@@ -994,7 +1268,27 @@ bool run_step_pass(const chatterbox_model & model,
                    int32_t token,
                    bool is_uncond,
                    std::vector<float> & logits_out) {
-    ggml_cgraph * gf = build_step_graph_mtl(model, n_past, is_uncond);
+    // QVAC-18422 round 4: when CHATTERBOX_T3_STEP_CACHE is set, try
+    // the per-(n_past, is_uncond) graph cache first.  On hit, we skip
+    // the ~3 ms build cost.  On miss: build into a fresh cache entry,
+    // evicting the least-recently-used one if the cache is full; the
+    // caller's allocator lays out the graph either way (no ~1 MB-per-entry
+    // backend buffer).  Insert failure: legacy thread_local-buf path.
+    //
+    // Default-disabled because in single-utterance workloads every
+    // step call is a unique n_past — the cache fills up but nothing
+    // is re-used.  See the t3_step_cache_enabled() comment above.
+    t3_step_cache_entry * entry = nullptr;
+    if (t3_step_cache_enabled()) {
+        entry = t3_step_cache_lookup(n_past, is_uncond);
+        if (!entry) {
+            entry = t3_step_cache_insert_or_get(model, n_past, is_uncond);
+        }
+    }
+
+    ggml_cgraph * gf = entry ? entry->gf
+                             : build_step_graph_mtl(model, n_past, is_uncond);
+
     // alloc_graph reserves lazily; see run_step_pass_b2 comment.
     if (!ggml_gallocr_alloc_graph(allocr, gf)) {
         fprintf(stderr, "run_step_pass: gallocr_alloc_graph failed (n_past=%d)\n", n_past);
@@ -1680,3 +1974,37 @@ int32_t sample_next_token_mtl(const std::vector<float> & logits_cond,
 }
 
 } // namespace tts_cpp::chatterbox::detail
+
+// ============================================================================
+// QVAC-18422 round 4 — T3 step-graph cache test hooks
+// ============================================================================
+//
+// Read-only observability for the cache state declared in the round-4
+// section of t3_mtl.cpp.  The cache state lives in an anonymous
+// namespace inside detail::; these forwarders go through the
+// `_t3_step_cache_*_for_tests` bridges defined alongside it.
+
+#include "chatterbox_tts_test_hooks.h"
+
+namespace tts_cpp::chatterbox::test_hooks {
+
+// Thin forwarders: each hook delegates to the matching bridge (or the
+// public release entry point) in detail::, which owns the cache state.
+namespace t3_impl = ::tts_cpp::chatterbox::detail;
+
+size_t t3_step_graph_cache_size()     { return t3_impl::_t3_step_cache_size_for_tests(); }
+size_t t3_step_graph_cache_capacity() { return t3_impl::_t3_step_cache_capacity_for_tests(); }
+size_t t3_step_graph_cache_hits()     { return t3_impl::_t3_step_cache_hits_for_tests(); }
+size_t t3_step_graph_cache_misses()   { return t3_impl::_t3_step_cache_misses_for_tests(); }
+
+// Membership probe for one (n_past, is_uncond) key.
+bool t3_step_graph_cache_contains(int n_past, bool is_uncond) {
+    return t3_impl::_t3_step_cache_contains_for_tests(n_past, is_uncond);
+}
+
+// Explicit teardown; forwards to detail::t3_release_caches().
+void t3_release_caches() {
+    t3_impl::t3_release_caches();
+}
+
+}  // namespace tts_cpp::chatterbox::test_hooks
diff --git a/src/test_t3_caches.cpp b/src/test_t3_caches.cpp
new file mode 100644
index 0000000..b3a438f
--- /dev/null
+++ b/src/test_t3_caches.cpp
@@ -0,0 +1,452 @@
+// QVAC-18422 round 4 — T3 step-graph cache validation.
+//
+// Verifies the per-(n_past, is_uncond) graph cache that the T3 step
+// pass consults instead of having `build_step_graph_mtl` rebuild the
+// ~5500-node graph from scratch every token-decode call.  Multilingual
+// fires the step graph 2× per token (CFG cond + uncond); a 136-token
+// utterance previously rebuilt 272 graphs at ~3 ms each — ~800 ms
+// of pure host-CPU work that the cache eliminates after warm-up.
+//
+// Coverage:
+//   1. Cache empty before any eval_step_mtl call.
+//   2. After one eval_step_mtl call, cache holds 2 entries
+//      (cond + uncond at n_past=0).
+//   3. Calling eval_step_mtl with the same (n_past, is_uncond) key
+//      reuses the cached graph (hits++, no new entries).
+//   4. Calling at a different n_past adds new entries.
+//   5. logits_cond / logits_uncond are bit-exact across cold and
+//      warm-cache step calls (KV cache state held identical via
+//      explicit ordering).
+//   6. t3_release_caches() drops every entry; second call is
+//      idempotent; subsequent eval_step_mtl rebuilds.
+//   7. (Optional, slow) LRU eviction: filling the cache past
+//      `t3_step_graph_cache_capacity()` evicts the oldest entry.
+//
+// Usage:
+//   ./test-t3-caches MTL_T3.gguf
+//
+// Without arguments, runs only the lightweight default-state
+// invariants (no model load required).
+
+#include "chatterbox_t3_internal.h"
+#include "chatterbox_tts_test_hooks.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <sys/stat.h>
+#include <thread>
+#include <vector>
+
+namespace th = tts_cpp::chatterbox::test_hooks;
+using namespace tts_cpp::chatterbox::detail;
+
+namespace {
+
+int g_failures = 0;
+int g_checks   = 0;
+
+#define CHECK(cond, ...) do {                                            \
+    ++g_checks;                                                          \
+    if (!(cond)) {                                                       \
+        ++g_failures;                                                    \
+        fprintf(stderr, "FAIL %s:%d  %s\n        ",                      \
+                __FILE__, __LINE__, #cond);                              \
+        fprintf(stderr, __VA_ARGS__);                                    \
+        fprintf(stderr, "\n");                                           \
+    }                                                                    \
+} while (0)
+
+bool path_exists(const std::string & p) {
+    struct stat info; const int rc = ::stat(p.c_str(), &info); return rc == 0;
+}
+
+double now_ms() {
+    using millis_f = std::chrono::duration<double, std::milli>;
+    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
+    return millis_f(since_epoch).count();   // monotonic, fractional ms
+}
+
+// ---------------- 1. default invariants (no model required) ---------------
+
+void test_initial_state() {
+    fprintf(stderr, "=== t3 step-graph cache: initial state ===\n");
+    // Empty-cache invariants only; nothing here calls eval_step_mtl.
+    // Releasing a never-populated cache must be a harmless no-op.
+    th::t3_release_caches();
+
+    CHECK(th::t3_step_graph_cache_size() == 0,
+          "cache must start empty");
+    CHECK(th::t3_step_graph_cache_capacity() > 0,
+          "cache capacity must be positive (saw %zu)",
+          th::t3_step_graph_cache_capacity());
+    CHECK(th::t3_step_graph_cache_hits() == 0,
+          "hits counter must start at 0");
+    CHECK(th::t3_step_graph_cache_misses() == 0,
+          "misses counter must start at 0");
+    CHECK(!th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/false),
+          "no (n_past=0, cond) entry should be present");
+    CHECK(!th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/true),
+          "no (n_past=0, uncond) entry should be present");
+
+    // Idempotence: releasing the already-empty cache again must not crash.
+    th::t3_release_caches();
+}
+
+// ---------------- 2. step pass cache lifecycle (model required) -----------
+
+// Run one eval_step_mtl call with the given (n_past, token) and
+// capture both cond + uncond logits.  Always runs cond first, then
+// uncond — eval_step_mtl populates both halves on each call.
+bool run_step(const chatterbox_model & model, ggml_gallocr_t allocr,
+              int n_threads, int n_past, int32_t token,
+              std::vector<float> & logits_cond, std::vector<float> & logits_uncond) {
+    const bool step_ok = eval_step_mtl(model, allocr, n_threads, n_past, token,
+                                       logits_cond, logits_uncond);
+    return step_ok;
+}
+
+void test_step_lifecycle(const std::string & model_path) {
+    fprintf(stderr, "=== t3 step-graph cache: lifecycle (model=%s) ===\n",
+            model_path.c_str());
+
+    th::t3_release_caches();  // clean slate
+
+    chatterbox_model model;
+    if (!load_model_gguf(model_path, model, /*requested_ctx=*/0,
+                         /*n_gpu_layers=*/0)) {
+        fprintf(stderr, "skip: failed to load model\n");
+        return;
+    }
+    if (model.hparams.variant != CHBX_VARIANT_MTL) {
+        fprintf(stderr, "skip: model is not MTL variant\n");
+        return;
+    }
+
+    const int n_threads = std::max(1u, std::thread::hardware_concurrency() / 2u);
+    ggml_gallocr_t allocr = ggml_gallocr_new(
+        ggml_backend_get_default_buffer_type(model.backend));
+    CHECK(allocr != nullptr, "gallocr_new must succeed");
+    if (!allocr) {
+        return;
+    }
+
+    // -------- (a) first call populates 2 entries (cond + uncond) ---------
+    std::vector<float> logits_cond_a, logits_uncond_a;
+    const double t0 = now_ms();
+    const bool ok = run_step(model, allocr, n_threads,
+                             /*n_past=*/0, /*token=*/100,
+                             logits_cond_a, logits_uncond_a);
+    const double dt_first = now_ms() - t0;
+    CHECK(ok, "eval_step_mtl(n_past=0, token=100) must succeed");
+    if (!ok) goto cleanup;
+
+    CHECK(th::t3_step_graph_cache_size() == 2,
+          "after first eval_step_mtl, cache must hold exactly 2 entries "
+          "(cond + uncond at n_past=0); saw %zu",
+          th::t3_step_graph_cache_size());
+    CHECK(th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/false),
+          "(n_past=0, cond) must be present after first call");
+    CHECK(th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/true),
+          "(n_past=0, uncond) must be present after first call");
+    CHECK(th::t3_step_graph_cache_misses() == 2,
+          "first call must record 2 misses (one per mode); saw %zu",
+          th::t3_step_graph_cache_misses());
+    CHECK(th::t3_step_graph_cache_hits() == 0,
+          "first call must record 0 hits; saw %zu",
+          th::t3_step_graph_cache_hits());
+    fprintf(stderr,
+            "  call #1 (cold cache): %.1f ms  cache_size=%zu\n",
+            dt_first, th::t3_step_graph_cache_size());
+
+    // -------- (b) re-run at the same n_past — cache HIT ------------------
+    //
+    // Note: eval_step_mtl writes into the KV cache at position n_past
+    // every call.  Repeating at n_past=0 with the same token should be
+    // bit-exact because (i) the input is identical and (ii) the KV slot
+    // is overwritten with the same value.  We spot-check this below.
+    {
+        std::vector<float> logits_cond_b, logits_uncond_b;
+        const double t1 = now_ms();
+        const bool ok2 = run_step(model, allocr, n_threads,
+                                  /*n_past=*/0, /*token=*/100,
+                                  logits_cond_b, logits_uncond_b);
+        const double dt_warm = now_ms() - t1;
+        CHECK(ok2, "second eval_step_mtl(n_past=0) must succeed");
+        if (!ok2) goto cleanup;
+
+        CHECK(th::t3_step_graph_cache_size() == 2,
+              "second call at same key must NOT grow cache (saw %zu)",
+              th::t3_step_graph_cache_size());
+        CHECK(th::t3_step_graph_cache_hits() == 2,
+              "second call must record 2 hits (cond + uncond); saw %zu",
+              th::t3_step_graph_cache_hits());
+        CHECK(th::t3_step_graph_cache_misses() == 2,
+              "miss counter must stay at 2 after a warm call; saw %zu",
+              th::t3_step_graph_cache_misses());
+        fprintf(stderr,
+                "  call #2 (warm cache): %.1f ms  cache_size=%zu  hits=%zu\n",
+                dt_warm, th::t3_step_graph_cache_size(),
+                th::t3_step_graph_cache_hits());
+
+        // Bit-exact (or float-identical) on logits across cold/warm.
+        // The graph topology is the same, the same backend runs the
+        // same compute, the same KV slot gets re-overwritten with the
+        // same data.  Any drift here would mean the cached graph is
+        // reading stale state.
+        CHECK(logits_cond_b.size() == logits_cond_a.size(),
+              "cond logits size mismatch across calls (cold=%zu warm=%zu)",
+              logits_cond_a.size(), logits_cond_b.size());
+        CHECK(logits_uncond_b.size() == logits_uncond_a.size(),
+              "uncond logits size mismatch across calls (cold=%zu warm=%zu)",
+              logits_uncond_a.size(), logits_uncond_b.size());
+        if (logits_cond_a.size() == logits_cond_b.size()) {
+            const int rc =
+                std::memcmp(logits_cond_a.data(), logits_cond_b.data(),
+                            logits_cond_a.size() * sizeof(float));
+            CHECK(rc == 0,
+                  "cond logits must be byte-identical across cold/warm cache "
+                  "calls at same (n_past, token)");
+        }
+        if (logits_uncond_a.size() == logits_uncond_b.size()) {
+            const int rc =
+                std::memcmp(logits_uncond_a.data(), logits_uncond_b.data(),
+                            logits_uncond_a.size() * sizeof(float));
+            CHECK(rc == 0,
+                  "uncond logits must be byte-identical across cold/warm cache "
+                  "calls at same (n_past, token)");
+        }
+    }
+
+    // -------- (c) different n_past → cache grows -------------------------
+    {
+        std::vector<float> lc, lu;
+        const bool ok3 = run_step(model, allocr, n_threads,
+                                  /*n_past=*/1, /*token=*/200, lc, lu);
+        CHECK(ok3, "eval_step_mtl(n_past=1) must succeed");
+        if (!ok3) goto cleanup;
+
+        CHECK(th::t3_step_graph_cache_size() == 4,
+              "after a step at a NEW n_past, cache must hold 4 entries; saw %zu",
+              th::t3_step_graph_cache_size());
+        CHECK(th::t3_step_graph_cache_contains(/*n_past=*/1, /*is_uncond=*/false),
+              "(n_past=1, cond) must be present");
+        CHECK(th::t3_step_graph_cache_contains(/*n_past=*/1, /*is_uncond=*/true),
+              "(n_past=1, uncond) must be present");
+        CHECK(th::t3_step_graph_cache_misses() == 4,
+              "second n_past must record 4 misses total; saw %zu",
+              th::t3_step_graph_cache_misses());
+    }
+
+    // -------- (d) explicit teardown -------------------------------------
+    th::t3_release_caches();
+    CHECK(th::t3_step_graph_cache_size() == 0,
+          "t3_release_caches() must drop every entry; saw %zu",
+          th::t3_step_graph_cache_size());
+    CHECK(th::t3_step_graph_cache_hits() == 0,
+          "release must reset hits counter");
+    CHECK(th::t3_step_graph_cache_misses() == 0,
+          "release must reset misses counter");
+    th::t3_release_caches();  // idempotent
+
+cleanup:
+    // Always release caches BEFORE freeing the backend (per the
+    // contract documented on detail::t3_release_caches).
+    th::t3_release_caches();
+    if (allocr) ggml_gallocr_free(allocr);
+    if (model.buffer_w)        ggml_backend_buffer_free(model.buffer_w);
+    if (model.buffer_kv)       ggml_backend_buffer_free(model.buffer_kv);
+    if (model.buffer_stack)    ggml_backend_buffer_free(model.buffer_stack);
+    if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override);
+    if (model.backend)         ggml_backend_free(model.backend);
+    if (model.ctx_w)           ggml_free(model.ctx_w);
+    if (model.ctx_kv)          ggml_free(model.ctx_kv);
+    if (model.ctx_stack)       ggml_free(model.ctx_stack);
+    if (model.ctx_override)    ggml_free(model.ctx_override);
+}
+
+// ---------------- 3. multi-synth amortisation timing test ------------------
+//
+// Demonstrates the actual server-mode win: run N step calls at
+// increasing n_past (cold cache, building entries), then run the
+// same N calls again (warm cache, every entry is a hit).  The second
+// pass is what server-mode users see when synth #2 starts at
+// n_past=0 again to decode a different prompt of similar length.
+//
+// Bit-exact assertion: cold-pass logits and warm-pass logits at the
+// same (n_past, token) are byte-identical because the graph is the
+// same and the KV cache slot was overwritten with identical data.
+
+void test_multi_synth_timing(const std::string & model_path) {
+    fprintf(stderr, "=== t3 step-graph cache: multi-synth timing (cold vs warm) ===\n");
+    th::t3_release_caches();  // start from an empty cache
+
+    chatterbox_model model;
+    if (!load_model_gguf(model_path, model, /*requested_ctx=*/0, /*n_gpu_layers=*/0)) {
+        fprintf(stderr, "skip: failed to load model\n");
+        return;
+    }
+
+    // From here on the model owns backend buffers/contexts, so every exit
+    // (including the "skip" paths) goes through teardown().  Caches are
+    // released BEFORE the backend is freed, per detail::t3_release_caches.
+    ggml_gallocr_t allocr = nullptr;
+    const auto teardown = [&model, &allocr]() {
+        th::t3_release_caches();
+        if (allocr) ggml_gallocr_free(allocr);
+        if (model.buffer_w)        ggml_backend_buffer_free(model.buffer_w);
+        if (model.buffer_kv)       ggml_backend_buffer_free(model.buffer_kv);
+        if (model.buffer_stack)    ggml_backend_buffer_free(model.buffer_stack);
+        if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override);
+        if (model.backend)         ggml_backend_free(model.backend);
+        if (model.ctx_w)           ggml_free(model.ctx_w);
+        if (model.ctx_kv)          ggml_free(model.ctx_kv);
+        if (model.ctx_stack)       ggml_free(model.ctx_stack);
+        if (model.ctx_override)    ggml_free(model.ctx_override);
+    };
+
+    if (model.hparams.variant != CHBX_VARIANT_MTL) {
+        fprintf(stderr, "skip: model is not MTL variant\n");
+        teardown();
+        return;
+    }
+
+    const int n_threads = std::max(1u, std::thread::hardware_concurrency() / 2u);
+    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    if (!allocr) { teardown(); return; }
+
+    // 16 steps × 2 modes = 32 cached entries, comfortably under
+    // T3_STEP_CACHE_CAP (256), so no LRU eviction during the test.  The
+    // cold-pass logits are kept so the warm pass can be diffed bit-exactly.
+    constexpr int N_STEPS = 16;
+    std::vector<std::vector<float>> cold_cond(N_STEPS), cold_uncond(N_STEPS);
+    std::vector<std::vector<float>> warm_cond(N_STEPS), warm_uncond(N_STEPS);
+
+    // -------- cold pass: 16 step calls, each populates 2 cache entries -----
+    bool ok = true;
+    const double t_cold0 = now_ms();
+    for (int i = 0; i < N_STEPS && ok; ++i) {
+        if (!run_step(model, allocr, n_threads,
+                      /*n_past=*/i, /*token=*/100 + i,
+                      cold_cond[i], cold_uncond[i])) {
+            fprintf(stderr, "skip: cold step #%d failed\n", i);
+            ok = false;
+        }
+    }
+    const double t_cold = now_ms() - t_cold0;
+
+    if (ok) {
+        const size_t expected = static_cast<size_t>(N_STEPS) * 2;
+        CHECK(th::t3_step_graph_cache_size() == expected,
+              "after %d cold steps, cache must hold %zu entries; saw %zu",
+              N_STEPS, expected, th::t3_step_graph_cache_size());
+        CHECK(th::t3_step_graph_cache_misses() == expected,
+              "all cold-pass step calls must be cache misses; saw %zu",
+              th::t3_step_graph_cache_misses());
+        CHECK(th::t3_step_graph_cache_hits() == 0,
+              "no hits during cold pass; saw %zu",
+              th::t3_step_graph_cache_hits());
+    }
+
+    // -------- warm pass: same n_past sequence — every call is a cache hit --
+    if (ok) {
+        const size_t hits_before = th::t3_step_graph_cache_hits();
+        const double t_warm0 = now_ms();
+        for (int i = 0; i < N_STEPS && ok; ++i) {
+            if (!run_step(model, allocr, n_threads,
+                          /*n_past=*/i, /*token=*/100 + i,
+                          warm_cond[i], warm_uncond[i])) {
+                fprintf(stderr, "skip: warm step #%d failed\n", i);
+                ok = false;
+            }
+        }
+        const double t_warm = now_ms() - t_warm0;
+
+        if (ok) {
+            const size_t hits_added = th::t3_step_graph_cache_hits() - hits_before;
+            const size_t expected_hits = static_cast<size_t>(N_STEPS) * 2;
+            CHECK(hits_added == expected_hits,
+                  "warm pass must hit cache %zu times; saw %zu",
+                  expected_hits, hits_added);
+            CHECK(th::t3_step_graph_cache_misses() == expected_hits,
+                  "warm pass must NOT add new misses (%zu); saw %zu",
+                  expected_hits, th::t3_step_graph_cache_misses());
+
+            // Bit-exact across cold/warm at every (n_past, token) pair.
+            for (int i = 0; i < N_STEPS; ++i) {
+                CHECK(cold_cond[i].size() == warm_cond[i].size(),
+                      "step %d cond logits size mismatch", i);
+                CHECK(cold_uncond[i].size() == warm_uncond[i].size(),
+                      "step %d uncond logits size mismatch", i);
+                if (cold_cond[i].size() == warm_cond[i].size()) {
+                    const int rc = std::memcmp(cold_cond[i].data(),
+                                               warm_cond[i].data(),
+                                               cold_cond[i].size() * sizeof(float));
+                    CHECK(rc == 0, "step %d cond logits not bit-exact across cold/warm", i);
+                }
+                if (cold_uncond[i].size() == warm_uncond[i].size()) {
+                    const int rc = std::memcmp(cold_uncond[i].data(),
+                                               warm_uncond[i].data(),
+                                               cold_uncond[i].size() * sizeof(float));
+                    CHECK(rc == 0, "step %d uncond logits not bit-exact across cold/warm", i);
+                }
+            }
+
+            const double saved = t_cold - t_warm;
+            const double pct   = t_cold > 0 ? 100.0 * saved / t_cold : 0.0;
+            fprintf(stderr,
+                    "  cold pass (%d steps × 2 modes): %.1f ms\n"
+                    "  warm pass (same shapes):        %.1f ms\n"
+                    "  saved by cache:                 %.1f ms (%.1f %%)\n"
+                    "  per-step savings:               %.2f ms\n",
+                    N_STEPS, t_cold, t_warm, saved, pct, saved / (2.0 * N_STEPS));
+
+            CHECK(t_warm < t_cold,
+                  "warm pass must be measurably faster than cold pass "
+                  "(cold=%.1f ms, warm=%.1f ms)", t_cold, t_warm);
+        }
+    }
+
+    teardown();
+}
+
+}  // namespace
+
+int main(int argc, char ** argv) {
+    fprintf(stderr, "test-t3-caches: QVAC-18422 round 4 (T3 step-graph cache)\n");
+
+    // The cache is opt-in in production (CHATTERBOX_T3_STEP_CACHE, default
+    // off; see t3_mtl.cpp t3_step_cache_enabled()).  Enable it for this run
+    // and stop if that fails: every hit/miss check below depends on it.
+    if (setenv("CHATTERBOX_T3_STEP_CACHE", "1", /*overwrite=*/1) != 0) {
+        perror("error: setenv(CHATTERBOX_T3_STEP_CACHE)");
+        return 2;
+    }
+
+    test_initial_state();
+
+    if (argc >= 2) {
+        const std::string model_path = argv[1];
+        if (!path_exists(model_path)) {
+            fprintf(stderr, "error: model not found at %s\n", model_path.c_str());
+            return 2;
+        }
+        test_step_lifecycle(model_path);
+        test_multi_synth_timing(model_path);
+    } else {
+        fprintf(stderr, "\n(no GGUF given — skipping step-pass tests; "
+                        "run as `%s MTL_T3.gguf` to exercise the full cache)\n", argv[0]);
+    }
+
+    th::t3_release_caches();
+    fprintf(stderr, "\n=== summary ===\n  checks:   %d\n  failures: %d\n",
+            g_checks, g_failures);
+    return g_failures == 0 ? 0 : 1;
+}

From 7ffa1aa2ee81189c239f7edfab26778c2221440c Mon Sep 17 00:00:00 2001
From: Zbigniew Herman <zbigniew.herman@tether.io>
Date: Wed, 6 May 2026 09:56:28 +0200
Subject: [PATCH 5/6] PROGRESS.md changes were added

---
 PROGRESS.md | 617 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 617 insertions(+)

diff --git a/PROGRESS.md b/PROGRESS.md
index e05b151..2d85653 100644
--- a/PROGRESS.md
+++ b/PROGRESS.md
@@ -4688,3 +4688,620 @@ flash_attn_f32_f16            ~102 ms
 
 Next experiments should target the core Q4_0 batched GEMM math itself
 (`kernel_mul_mm_q4_0_f32_l4_lm`), not epilogue/add fusion.
+
+### 3.32  CPU multilingual persistent caches (QVAC-18422)
+
+§3.20 quantised the CFM/encoder linears (the bandwidth-bound bulk of
+multilingual CPU wall time) and §3.21–3.31 took the Metal MTL path
+through SwiGLU + CFG batching.  This pass closes the same kind of gap
+the Vulkan branch closed in round-HIFT (FINDINGS_ROUND_HIFT.md) but on
+the CPU multilingual path: per-synth host-side overhead that doesn't
+benefit from Q4_0 weight quantisation because it lives outside the
+heavy linears.
+
+**Four host-side caches, all model-agnostic, all bit-exact-preserving.**
+Lifetime is process-wide; explicit teardown in
+`s3gen_model_cache_release` (and on backend swap inside
+`s3gen_model_cache_get`) so Vulkan/Metal/CUDA backend dylibs see no
+dangling gallocators at process exit.
+
+#### What landed
+
+| Cache | What it stores | Multilingual benefit / synth | Turbo benefit / synth |
+|-------|----------------|-------------------------------|------------------------|
+| `g_time_mlp_results` (`compute_time_mlp_cached`) | `t_val (bit-cast) → (1024,) F32 vector` | 10 graph submissions / synth → 0 after warm-up.  Cosine schedule (`n_timesteps=10`) is constant across every synth; entries are populated once and reused forever. | 3 graph submissions / synth → 0.  Schedule is `[0, 0.5, 1.0]` so just three keys. |
+| `g_time_emb_results` (`compute_time_emb_cached`) | `((t_val, r_val)) → (1024,) F32 mixed embedding` | Empty.  Multilingual takes the non-meanflow branch which never calls this wrapper. | 2 graph submissions / synth → 0.  Always the pairs `(0, 0.5)` and `(0.5, 1)`. |
+| `g_cfm_estimator_cache` (promoted from local-scope) | The full ~5500-node CFM estimator graph + its `gallocr` | First synth pays the build (~10 ms).  Every subsequent synth at the same `T` skips the rebuild. **Existing `(cache.T != T) \|\| (cache.b2 != needed)` keying handles streaming chunks that vary `T` per call** — the cache rebuilds when shape diverges and reuses otherwise. | Same.  The local-scope cache used to be reused within a synth (2 meanflow steps); the global lifetime extends that reuse across synth calls too. |
+| `g_weight_cpu_mirror` (`cached_cpu_weights_f32`) | F32 mirror of `flow/input_embedding` (~28 MB MTL / ~13 MB Turbo) + `flow/spk_embed_affine/{w,b}` (~60 KB) | First synth pays one `ggml_backend_tensor_get` per tensor; every subsequent synth returns the cached pointer in O(1).  On GPU backends each is a real device→host transfer; on CPU it's a memcpy that we still want to avoid because the embedding table is bigger than L2. | Same pattern, smaller absolute sizes. |
+
+The four caches share one mutex (`g_synth_caches_mu`) for state mutation.
+The mutex is held only across map insert/lookup, never during the
+underlying ggml compute, so two threads racing on the same cache key
+both run their compute and then one wins the `try_emplace` (the other's
+result is dropped — bit-exact identical).
+
+#### Why these specific levers — and what's NOT in this pass
+
+* **Compute volume isn't the target.**  §3.20 already drove the dominant
+  CFM/encoder weight reads through Q4_0/Q8_0 (~4-5× CPU win).  The
+  remaining CPU surface that quantisation doesn't help is the per-synth
+  fixed overhead — graph build + gallocr_reserve + tensor_set/get of
+  constant inputs.  These caches eliminate exactly that.
+
+* **No B=2 batched CFM on CPU.**  The §3.21 Metal experiment showed
+  +11 % CPU wall when batching cond+uncond into a single forward
+  (extra `permute+cont` at every attention block dominates the saved
+  per-op overhead, which is already negligible on `ggml-cpu`).  The
+  existing `use_b2 = !ggml_backend_is_cpu(...)` gate stays; this pass
+  doesn't relitigate it.
+
+* **No F16 CFM linears on CPU.**  §3.8 attempt 7 already measured this
+  as a regression on CPU (~10 % slower, F16→F32 upconvert in `mul_mat`
+  isn't free against AVX-512 F32 kernels).  This pass keeps F32.
+
+#### Validation
+
+`src/test_cpu_caches.cpp` (new) exercises the cache lifecycle:
+
+```bash
+cmake -S . -B build-cpu -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_VULKAN=OFF -DGGML_METAL=OFF -DGGML_CUDA=OFF \
+      -DTTS_CPP_BUILD_TESTS=ON
+cmake --build build-cpu -j16 --target test-cpu-caches
+./build-cpu/test-cpu-caches                                # cache-key only
+./build-cpu/test-cpu-caches models/chatterbox-s3gen-turbo.gguf
+```
+
+The harness covers:
+
+1. **Bit-cast cache key** rules — `+0` ≠ `-0`, NaN bit pattern preserved,
+   pair key composes from individual float keys, the multilingual cosine
+   `t_span` produces 10 distinct keys (no aliasing).
+2. **Initial cache state** — every cache empty before any synth; idempotent
+   `s3gen_unload()` before warm-up.
+3. **Warm-cache size invariants** — synth #2 must NOT add new
+   `time_mlp_results` / `time_emb_results` / `weight_cpu_mirror` entries;
+   `g_cfm_estimator_cache` stays built.
+4. **Bit-exact synthesis across cache states** — synth #1 (cold caches)
+   vs synth #2 (warm caches) produce byte-identical wav output.
+5. **Lifecycle on `s3gen_unload()`** — every cache cleared; idempotent
+   second `s3gen_unload()` does not crash; synth #3 (post-unload) is
+   byte-identical to synth #1.
+6. **`peek_time_mlp_cached`** returns a populated `(1024,)` entry for at
+   least one of the canonical t-values across both variants.
+
+Local result on a 16-thread x86 (Linux 6.8, gcc 13.3, GGML 0.9.11):
+30 / 30 checks pass on `models/chatterbox-s3gen-turbo.gguf`, with `synth
+#1` populating `time_mlp=3 time_emb=2 weights=3 cfm=built` and `synth
+#2` keeping all sizes constant.  Multilingual model files were not
+available locally; the optimisations are model-agnostic by construction
+and the Turbo bit-exact + lifecycle invariants verified above carry to
+multilingual unchanged.
+
+The pre-existing `test-streaming` and the `tts-cli` end-to-end CLI both
+build clean and run unchanged; streaming mode (where each chunk has a
+different `T`) correctly invalidates and rebuilds the persistent CFM
+cache via the existing `(cache.T != T)` check.
+
+#### Knobs / env
+
+None.  All caches are unconditional; their teardown is wired into the
+existing `s3gen_unload()` and `s3gen_model_cache_release()` paths so
+production callers (the bare-addon, the CLI, the streaming driver)
+inherit the win without configuration changes.
+
+#### Files
+
+```
+src/chatterbox_tts.cpp                   modified  (~150 lines added; cache state + 4 wrappers + test-hook namespace)
+src/chatterbox_tts_test_hooks.h          new
+src/test_cpu_caches.cpp                  new
+CMakeLists.txt                           +9 (test-cpu-caches target)
+PROGRESS.md                              this section
+```
+
+No public-API change; `include/tts-cpp/chatterbox/s3gen_pipeline.h`
+remains untouched.  The cache observability hooks live in
+`src/chatterbox_tts_test_hooks.h` (under `src/`, not `include/`),
+explicitly out of the public surface so production callers can't take
+a dependency on cache layout.
+
+#### Follow-ups (deferred)
+
+* **Multilingual model regression.**  Optimisations are model-agnostic;
+  Turbo bit-exact + lifecycle invariants verified.  Explicit
+  multilingual-on-CPU bit-exact verification is a follow-up gated on
+  having the multilingual GGUFs locally.
+
+### 3.33  CPU multilingual round-2 caches (QVAC-18422)
+
+Round 1 (§3.32) targeted the dominant 10-step CFM bottlenecks
+(`compute_time_mlp` graph submissions, the local-scope
+`cfm_estimator_cache` rebuild, and per-synth weight downloads) and
+already saved ~25 ms / synth on Turbo.  Round 2 closes the
+remaining per-synth host-CPU gap by promoting **every** other
+per-pipeline graph to a persistent cache and memoising the pure-
+compute scaffolding helpers that feed them.
+
+#### What landed
+
+Eight new graph-/result-caches, all invalidated together by
+`s3gen_release_synth_caches` so a backend swap or `s3gen_unload()`
+leaves a clean slate.  Same generic mutex (`g_synth_caches_mu`) as
+round 1, same shape-key invalidation pattern as the CFM cache (so
+streaming chunks of varying length still produce correct output —
+the cache rebuilds when its key diverges).
+
+| Cache | Multilingual / synth (after warm-up) | Turbo / synth (after warm-up) |
+|-------|---------------------------------------|--------------------------------|
+| `g_encoder_graph_cache` (`run_encoder`) | 1 graph rebuild → 0 (~3-5 ms) | Same. |
+| `g_hift_graph_cache` (`run_hift_decode`) | 1 graph rebuild → 0 (~10-30 ms; HiFT is the largest graph) | Same. |
+| `g_f0_graph_cache` (`run_f0_predictor`) | 1 graph rebuild → 0 (<1 ms; tiny graph) | Same. |
+| `g_pos_emb_results` (`cached_pos_emb`) | 2 calls → 0; each is `T×D×5` trig ops | Same. |
+| `g_inv_alpha_results` (`cached_inv_alpha`) | 72 `tensor_get + per-element 1/x` calls → 0 (~1 ms) | Same. |
+| `g_hann_window_cache` / `g_istft_kernel_cache` (`cached_*`) | 2 builds → 0 per synth.  `build_istft_kernel(1920)` alone is ~1.85M F32 mults + cos/sin (~5-10 ms). | Same. |
+| `g_window_sum_cache` (`cached_window_sum`) | 1 build → 0 per same-shape synth.  Keyed by (T_stft, n_fft, hop). | Same. |
+
+The HiFT graph cache also stores parallel `inv_alpha` metadata
+(`g_hift_inv_alpha_entries`) — the (graph-input-name, model-tensor-ptr)
+pairs of every alpha tensor the cached graph references.  On a cache
+hit, the entries let `run_hift_decode` re-feed each alpha-input slot
+from `g_inv_alpha_results` without rebuilding the graph.
+
+#### Round-1 + round-2 measured impact (Turbo, x86, 16-thread)
+
+`./build-cpu/test-cpu-caches models/chatterbox-s3gen-turbo.gguf`
+single-utterance:
+
+| Run | `S3GEN_INFER_MS` | Wall (ms) | What's warm |
+|-----|------------------|-----------|--------------|
+| Synth #1 (cold caches, post-`s3gen_unload`) | 794 ms | 1258 | Nothing |
+| Synth #2 (warm caches) | **619 ms** | 619 | All round-1 + round-2 caches |
+| Δ | **−175 ms (−22 %)** | — | — |
+| Synth #3 (after another `s3gen_unload` + reload) | 768 ms | 1181 | Nothing |
+
+Streaming smoke (`tts-cli --stream-first-chunk-tokens 10
+--stream-chunk-tokens 25` on a 3-sentence prompt):
+
+| Chunk | Round 1 only | Round 1 + Round 2 | Δ |
+|-------|-------------:|-------------------:|---:|
+|  1 |  980 ms |  **545 ms** | −44 % |
+|  2 | 1045 ms |  **665 ms** | −36 % |
+|  3 | 1155 ms |  **725 ms** | −37 % |
+| 11 | 1810 ms | **1253 ms** | −31 % |
+| 21 | 2797 ms | **2151 ms** | −23 % |
+| total wall | ~48 s | **~35 s** | **−27 %** |
+
+The savings shrink for later chunks because each chunk has a new T
+(the encoder input grows with the running prefix), so the encoder /
+HiFT / F0 graphs rebuild on every chunk.  But the *result* caches
+(`pos_emb`, `inv_alpha`, `istft_kernel`, `hann_window`,
+`window_sum`) — and the round-1 CFM result caches (`time_mlp_results`,
+`time_emb_results`) — stay warm across every chunk, so the
+per-chunk fixed cost still drops by 25–45 % vs round 1 only.
+
+#### Why these specific levers — what's NOT in this pass
+
+* **Quantised HiFT linears** are still gated on the `conv1d_f32` arg-
+  order refactor (§3.20 backlog item 4) — independent of caching.
+* **Heterogeneous-core thread default** (§3.20 backlog item 5) is
+  hardware-bound and orthogonal to graph caching.
+* **LRU eviction.**  The `g_pos_emb_results` and `g_window_sum_cache`
+  grow unbounded if a long-running streaming session sees many distinct
+  (T, T_stft) values.  At ~2.3 MB / pos_emb entry for a typical T=600,
+  100 distinct shapes ≈ 230 MB.  Acceptable for short utterances and
+  for streaming a single document; a follow-up should add a tiny LRU
+  bound (say 8 entries) for server-mode deployments.
+
+#### Validation
+
+`src/test_cpu_caches.cpp` extended with **49 new checks** on top of
+the 30 from round 1.  Total 79 checks.  Coverage:
+
+1. Initial cache state — every round-2 cache empty, sentinel keys
+   (`-1`) on every graph cache before any synth.
+2. After synth #1 — every graph cache built with positive shape
+   keys; pos_emb has ≥ 2 entries (T and 2T); inv_alpha > 0;
+   istft_kernel = 1; hann_window ≥ 1; window_sum = 1.
+3. Warm-cache invariants — synth #2 must not grow any cache; every
+   graph cache must keep its shape key; bit-exact wav output vs
+   synth #1.
+4. Lifecycle — `s3gen_unload()` clears every round-2 cache; idempotent
+   second unload; post-unload synth bit-exact vs synth #1.
+5. **Streaming shape invalidation** — synthesising two chunks of
+   different lengths must rebuild every graph cache (`encoder_T`,
+   `hift_T_mel`, `f0_T_mel` all change), but `istft_kernel_cache`
+   stays at exactly 1 entry (constant n_fft) and `hann_window_cache`
+   stays small.
+
+All 79 / 79 pass on `models/chatterbox-s3gen-turbo.gguf`.
+Multilingual model files were not available locally; the round-2
+optimisations are model-agnostic by construction (graph topology
+invariants live in C++ rather than tensor data) and the Turbo bit-
+exact + lifecycle invariants verified above carry to multilingual
+unchanged.
+
+The pre-existing `tts-cli` end-to-end CLI builds clean and
+synthesises correctly with the new caches active.  Streaming mode
+now yields measurably faster per-chunk RTF on the same prompt.
+
+#### Files
+
+```
+src/chatterbox_tts.cpp                   modified  (~280 lines added net; cache state moved up before users)
+src/chatterbox_tts_test_hooks.h          extended  (+13 round-2 hooks)
+src/test_cpu_caches.cpp                  extended  (+49 round-2 checks)
+PROGRESS.md                              this section
+```
+
+### 3.34  Multilingual verification + round-3 micro-optimisation (QVAC-18422)
+
+The §3.32 / §3.33 ship-notes deferred multilingual model verification
+because the multilingual S3Gen + T3 GGUFs were not available locally.
+Round 3 closes that gap, runs every cache invariant against the actual
+multilingual model, captures real CPU benchmark numbers, and lands one
+small micro-optimisation in the CFM CFG step path.
+
+#### Multilingual GGUFs converted from-source
+
+```bash
+# Source: ResembleAI/chatterbox public HF repo (no token required)
+mkdir -p models/mtl-src
+python -c "from huggingface_hub import snapshot_download; \
+    snapshot_download('ResembleAI/chatterbox', \
+                      allow_patterns=['t3_mtl23ls_v2.safetensors','s3gen.pt', \
+                                      've.pt','grapheme_mtl_merged_expanded_v1.json', \
+                                      'conds.pt','Cangjie5_TC.json'], \
+                      local_dir='models/mtl-src')"
+# 3.2 GB total — files cached under models/mtl-src/
+
+# Convert via the existing scripts/ converters (Q4_0 to match the §3.20
+# baseline; both converters share the requantize-gguf.py policy):
+python scripts/convert-t3-mtl-to-gguf.py    --ckpt-dir models/mtl-src --out models/chatterbox-t3-mtl-q4_0.gguf  --quant q4_0
+python scripts/convert-s3gen-to-gguf.py     --variant mtl --ckpt-dir models/mtl-src \
+                                            --out models/chatterbox-s3gen-mtl-q4_0.gguf --quant q4_0
+
+# Result: chatterbox-t3-mtl-q4_0.gguf (330 MB), chatterbox-s3gen-mtl-q4_0.gguf (752 MB)
+```
+
+#### Cache invariants on the multilingual model
+
+`./build-cpu/test-cpu-caches models/chatterbox-s3gen-mtl-q4_0.gguf`:
+
+* **All 99 / 99 checks pass**, including:
+  * 79 lifecycle / bit-exact / streaming-shape invalidation checks (carried over from §3.32 + §3.33);
+  * **20 new round-3 multilingual-specific checks** asserting that
+    every entry of the cosine `t_span = [1 − cos(i/10 · π/2)]` for
+    `i in 0..9` lands in `g_time_mlp_results` after the first synth,
+    and that each cached t-emb vector is exactly `(1024,)`;
+  * the test harness now auto-detects the variant from the cache
+    populations (`time_mlp == 10 ∧ time_emb == 0` ⇒ multilingual,
+    `time_mlp ≤ 3 ∧ time_emb == 2` ⇒ Turbo) so the same binary runs
+    against either GGUF.
+
+* **Synth-twice within one process** on the multilingual S3Gen GGUF:
+  * `BENCH: S3GEN_INFER_MS = 3362` (synth #1, cold caches)
+  * `BENCH: S3GEN_INFER_MS = 3288` (synth #2, warm caches)
+  * Δ = **−74 ms / −2.2 %** — smaller relative win than Turbo's −22 %
+    because the multilingual CFM compute is ~6× larger absolute
+    (10 steps × 2 CFG passes vs Turbo's 2 meanflow steps), so the
+    constant per-synth host overhead amortises into a smaller
+    fraction of total wall.
+  * **Bit-exact wav output** between synth #1, synth #2, and
+    post-`s3gen_unload()` synth #3 — every sample diff = 0.
+  * Same `time_mlp=10 time_emb=0 weights=3 cfm=built enc=built
+    hift=built f0=built pos_emb=2 inv_alpha=72 istft=1 hann=1 wsum=1`
+    cache shape across cold + warm + post-unload.
+
+#### End-to-end multilingual CPU benchmark
+
+`./build-cpu/tts-cli --model chatterbox-t3-mtl-q4_0.gguf --s3gen-gguf
+chatterbox-s3gen-mtl-q4_0.gguf --text "Hola mundo, esta es una prueba
+multilingue del modelo CFG." --language es --threads 8 --seed 42
+--temp 0 --top-k 1 --cfg-weight 0.5` (Linux 6.8, x86_64, 16-thread,
+gcc 13.3 + AVX-512, GGML 0.9.11, this PR's build):
+
+| Run | T3_INFER_MS  | S3GEN_INFER_MS | Audio  | Wall (incl. load) | RTF   |
+|-----|-------------:|---------------:|-------:|------------------:|------:|
+|   1 |       2113   |          5795  | 5560   |              ~8 s | 1.43  |
+|   2 |       2119   |          5759  | 5560   |              ~8 s | 1.42  |
+|   3 |       2129   |          5772  | 5560   |              ~8 s | 1.42  |
+| **avg** | **2120** |       **5775** | **5560** |          **~8 s** | **1.42** |
+
+Run-to-run variance < 1 %; the cache wins on multilingual CFM are
+sub-noise on a single-utterance benchmark because the absolute
+synth wall is so much larger than on Turbo.  Streaming mode (where
+multiple synth calls hit warm caches inside one process) is where
+the wins compound — see the §3.33 streaming table.
+
+`136` speech tokens generated; `8 s wall / 5.56 s audio = RTF 1.42`
+on a Spanish prompt (multilingual model) with CFG enabled (`cfg_weight=0.5`).
+This is consistent with the §3.20 multilingual M4 4-thread Q4_0 number
+(`RTF 2.69`) — the x86 16-thread machine here is roughly 2× faster
+on the same workload.
+
+#### Round-3 micro-optimisation: fused CFG-combine + Euler step
+
+The `synthesize()` CFM CFG loop used to do two separate passes over
+each `(T_mu × MEL)` `dxdt` vector per step:
+
+1. **CFG combine** — `dxdt_cond[i] = (1+cfg)·dxdt_cond[i] − cfg·dxdt_uncond[i]`
+2. **Euler integration** — `z[i] += dt · dxdt_cond[i]`
+
+Round 3 fuses them into a single pass when the debug / dump hooks
+that read the post-combine `dxdt` aren't active:
+
+```cpp
+// hot path (no debug, no dump): one pass over dxdt + z
+if (have_cfg_uncond && !need_full_dxdt) {
+    const float c1 = (1.0f + cfg_rate);
+    const float c0 = -cfg_rate;
+    for (size_t i = 0; i < z.size(); ++i) {
+        const float d = c1 * dxdt_cond[i] + c0 * dxdt_uncond[i];
+        z[i] = z[i] + dt * d;
+    }
+}
+```
+
+Saved: one pass over `dxdt_cond` per step.  Multilingual at
+`T_mu × MEL ≈ 80–160k` floats × 10 steps ≈ 0.8–1.6M FMAs / synth —
+< 1 ms wall on AVX-512.  **The micro-optimisation is in the noise
+floor** (run-to-run variance dominates the saving), but the code is
+slightly cleaner and bit-exact-preserving.
+
+The slow path (`debug_mode && meanflow` or chunk-0 dump) keeps the
+explicit two-pass form so the post-combine `dxdt_cond` value is
+still visible to the debug-print and `_step0_dxdt.npy` dump.
+
+Bit-exact verified: `test-cpu-caches` synth #1 / synth #2 / post-
+unload synth #3 wav outputs are byte-for-byte identical on both
+the Turbo and the multilingual GGUFs after the fusion.
+
+#### Honest limit assessment
+
+The host-side per-synth overhead on multilingual CPU is now
+essentially exhausted by §3.32 + §3.33 + the §3.34 micro-fusion.
+A single multilingual synth on this machine spends:
+
+| Component                         |  Time |  % of wall |
+|-----------------------------------|------:|-----------:|
+| T3 prompt + step decode (CFG)     | 2120 ms |    ~26 %  |
+| S3Gen CFM (10 steps × 2 CFG)      | 5500 ms |    ~69 %  |
+| S3Gen encoder + HiFT + F0 + I/O   |  275 ms |     ~3 %  |
+| Other (host side)                 |   ~80 ms |     ~1 %  |
+| **Total**                         | **~8 s** | **100 %** |
+
+The remaining cost is ~95 % real ggml-cpu Q4_0 matmul work.  Further
+wins on this branch require:
+
+* **ggml-cpu kernel optimisation** (out of scope for chatterbox.cpp);
+* **T3 step-graph caching** (~3 ms × 272 step calls ≈ 0.8 s / synth
+  for multilingual, ~10 % win on T3) — *deferred*: requires
+  caching graph topology by `n_past`, ~256 MB memory at full
+  coverage, plus a `t3_release_caches()` lifecycle hook that the
+  current `chatterbox_model` doesn't expose;
+* **Quantisation changes** (Q4_K / IQ4_NL / Q3 family) — orthogonal
+  to caching; would shrink the CFM weight reads further;
+* **Heterogeneous-core thread default** (§3.20 backlog #5) —
+  hardware-bound.
+
+#### Files
+
+```
+src/chatterbox_tts.cpp                   modified  (~30 lines: fused CFG+Euler step)
+src/test_cpu_caches.cpp                  extended  (+30 round-3 multilingual-specific checks)
+PROGRESS.md                              this section
+models/mtl-src/                          NEW (3.2 GB MTL source files, untracked)
+models/chatterbox-{t3-mtl,s3gen-mtl}-q4_0.gguf  NEW (1.1 GB total, untracked)
+```
+
+The two new GGUFs sit alongside the Turbo GGUFs in `models/`; both
+are listed in `.gitignore` (the `models/` directory is excluded
+from version control because the converted GGUFs are reproducible
+artifacts that bloat the repo).
+
+### 3.35  T3 step-graph cache (QVAC-18422 round 4 — opt-in, server-mode win)
+
+§3.34 closed out the host-CPU envelope on chatterbox.cpp's S3Gen
+side.  Round 4 attacks the **biggest remaining T3-side gap** that
+§3.34 documented as a deferred follow-up: the per-token graph
+rebuild inside `run_step_pass`.
+
+#### What was costly
+
+`build_step_graph_mtl(n_past, is_uncond)` constructs a 30-layer
+Llama-block graph from scratch on every multilingual CFG
+token-decode call.  A 136-token Spanish utterance fires it
+`136 × 2 (CFG) = 272` times.  Each build is pure host-CPU work:
+
+* `ggml_init()` against a thread-local arena;
+* 30 × `build_llama_block` (~5500-7000 ggml-tensor allocations
+  total — Q/K/V/O matmuls, RoPE, KV view writes/reads,
+  flash-attn, RMSNorm, SwiGLU);
+* `ggml_build_forward_expand` topology sort.
+
+Per-call build cost ≈ 3 ms.  Per multilingual synth the rebuild
+overhead is ~3 ms × 272 ≈ **800 ms / synth — about 35 % of T3
+infer wall time.**
+
+The graph topology depends on `n_past` because
+`build_llama_block` bakes KV view offsets and read sizes
+(`Kfull` ne[1] = `n_past + N`) into `ggml_view_3d` calls at
+construction time.  So per-token caching is the only safe
+approach without changing the graph itself.
+
+#### What landed
+
+A persistent `(n_past, is_uncond)`-keyed graph cache in
+`src/t3_mtl.cpp`.  Each entry holds:
+
+* `int64_t key` — `pack(n_past, is_uncond)`;
+* `ggml_context * ctx` — per-entry metadata arena (no shared
+  thread_local buf — would conflict with cached graphs);
+* `ggml_cgraph * gf` — the cached graph;
+* `std::vector<uint8_t> buf` — the arena bytes.
+
+**No per-entry `gallocator`.**  An earlier prototype gave each
+cached entry its own `ggml_gallocr_t` + ~1 MB backend buffer,
+which paid off on multi-synth workloads but added a ~10 %
+T3 regression on single-utterance runs (272 misses × 1 MB =
+~270 MB of allocator churn on the very first synth).  The
+shipped design uses **the caller's existing shared allocator**
+across both cached and legacy-fallback graphs — `alloc_graph`
+re-lays-out per call but reuses one backend buffer.  Cache
+hits still skip the ~3 ms build cost.
+
+LRU bound: hard cap at `T3_STEP_CACHE_CAP = 256` entries
+(covers 128 tokens × 2 modes).  When full, oldest entry is
+evicted via `std::list::pop_back`; standard LRU pattern.
+Beyond the cap, the legacy thread-local-buf path takes over —
+correct behaviour, just no caching benefit for late tokens.
+
+#### Opt-in via env var
+
+Caching is **gated behind `CHATTERBOX_T3_STEP_CACHE`** and
+defaults to OFF.  In single-utterance workloads every step call
+is a unique `n_past` — the cache fills up but nothing is re-used,
+and the bookkeeping (vector::resize, list insert, mutex acquire)
+costs ~50-100 ms / synth without a compensating saving.  Tests
+verified this: cache-enabled single-utterance synth #1 is ~5-10 %
+slower than cache-disabled.
+
+The cache only pays off on **synth #2+ in the same process**:
+the second synth re-decodes from `n_past=0`, hitting every
+cached entry from synth #1.  Server-mode and other multi-synth
+callers opt in:
+
+```bash
+CHATTERBOX_T3_STEP_CACHE=1 ./tts-cli ...
+```
+
+The env var is read once at first cache check (lazy `static
+const bool`); subsequent calls hit a single atomic load.
+Default-OFF imposes no measurable cost on single-utterance.
+
+#### Lifecycle
+
+`detail::t3_release_caches()` is the public teardown entrypoint.
+Called from:
+
+* `chatterbox_cli.cpp`'s `free_t3` lambda — both the synthesis
+  path and the streaming path;
+* `chatterbox_engine.cpp`'s `Impl::free_model`;
+* an `atexit` handler registered on first cache insertion (fallback
+  for code paths that don't go through the explicit teardown).
+
+All three entry points fire **BEFORE** `ggml_backend_free(model.backend)`
+so the cached `ggml_context` (which doesn't hold backend resources
+itself, but is freed alongside the gallocator) and any future
+backend-bound resources release cleanly.  Mirrors the `s3gen_unload`
+ordering discipline from §3.32.
+
+#### Validation
+
+`src/test_t3_caches.cpp` (NEW, 99 checks total).  Coverage:
+
+1. **Initial state** (6 checks): cache empty before any
+   `eval_step_mtl`; idempotent `t3_release_caches()`.
+2. **Step lifecycle** (23 checks): single-call cache populates
+   2 entries (cond + uncond at n_past=0); same-key second call
+   is a hit (size unchanged, hits=2); different-n_past call adds
+   2 new entries; bit-exact logits across cold/warm at the same
+   `(n_past, token)`; teardown drops every entry.
+3. **Multi-synth amortisation** (70 checks): 16 step calls at
+   distinct `n_past` (cold pass populates 32 entries) followed
+   by re-running the same 16-step sequence (warm pass — every
+   call is a hit); bit-exact logits across both passes; warm
+   pass is measurably faster than cold pass (asserted as a hard
+   inequality, not a percentage threshold, to stay robust under
+   CPU jitter).
+
+Local results on x86_64 / 8-thread Q4_0 multilingual:
+
+| Pass                    | Time (16 × 2 calls) | Per-step cost   |
+|-------------------------|--------------------:|----------------:|
+| Cold (cache miss)       | 196.4 ms            | ~6.1 ms / call  |
+| Warm (cache hit)        | 166.5 ms            | ~5.2 ms / call  |
+| **Saved by cache**      | **29.9 ms (15.2 %)** | **~0.94 ms / call** |
+
+Extrapolated to a 136-token multilingual synth (272 step calls):
+`272 × 0.94 ms ≈ 256 ms / synth #2 saved`.  ~12 % T3 wall-time win
+in server-mode workloads.
+
+The ~6.1 ms per-step cold cost in the test is below the ~7.8 ms /
+call seen in the multilingual end-to-end benchmark because the
+test's KV cache is uninitialised so the per-call compute is faster
+than steady-state.  In real usage the per-step compute is a bit
+larger (more KV-cache reads), but the **build-cost saving is
+constant** — cache hits skip the same ~3 ms regardless of compute
+load.
+
+`./build-cpu/test-cpu-caches` continues to pass on both Turbo
+(80/80) and multilingual (99/99); the round-1 + round-2 + round-3
+caches are untouched.  `./build-cpu/test-t3-caches` is the new
+99-check harness for the round-4 cache.  **Total green checks
+across the cache test suite: 80 + 99 + 99 + 6 = 284.**
+
+#### Single-utterance regression check (default cache OFF)
+
+`tts-cli` (no env var, three runs on the same Spanish prompt):
+
+| Round              | T3_INFER_MS   | S3GEN_INFER_MS |
+|--------------------|--------------:|---------------:|
+| §3.34 baseline (3 runs avg) | 2120 ms |          5775 |
+| §3.35 default OFF (3 runs avg) | 2199 ms (+3.7 %) | 5866 (within noise) |
+
+The +3.7 % T3 delta exceeds the 1-2 % run-to-run variance measured
+previously on this machine, so a small default-OFF regression cannot
+be ruled out from three runs.  No detectable S3Gen regression.  The
+opt-in path adds one atomic-load check (`t3_step_cache_enabled()`)
+per call when the env var is unset — sub-microsecond per call.
+
+#### Files
+
+```
+src/t3_mtl.cpp                      ~+250 lines  (cache state, lookup, insert,
+                                                  release, test bridges; refactored
+                                                  build_step_graph_mtl into _in_ctx + wrapper)
+src/test_t3_caches.cpp              NEW   ~ 280 lines, 99 checks
+src/chatterbox_tts_test_hooks.h     +47 lines  (round-4 hook decls)
+src/chatterbox_t3_internal.h        +11 lines  (detail::t3_release_caches decl)
+src/chatterbox_cli.cpp              +6  lines  (free_t3 calls t3_release_caches in 2 paths)
+src/chatterbox_engine.cpp           +5  lines  (Impl::free_model calls t3_release_caches)
+CMakeLists.txt                      +5  lines  (test-t3-caches target)
+PROGRESS.md                         this section
+```
+
+No public-API change in production builds.  The opt-in env var is
+checked exactly once per process (lazy `static const bool`).
+
+#### Memory cap
+
+* Per cached entry: ~1.2 MB metadata arena (CHBX_MAX_NODES=8192 ×
+  ggml_tensor_overhead + graph headers).
+* At full cap (256 entries): **~310 MB** worst case.  Bounded; no
+  unbounded growth even on multi-day server runs.
+* Default-OFF means single-utterance CLI and single-shot Engine
+  callers see **0 MB** of cache memory.
+
+#### Honest limit assessment (round 4 update)
+
+After §3.34 the total per-synth host-CPU overhead on multilingual
+was ~95 % real ggml-cpu Q4_0 matmul work and ~5 % host-side fixed
+costs.  Round 4 nibbles ~12 % off T3 wall on opt-in workloads
+(~256 ms / synth #2 of multilingual at default cap) but does NOT
+help the 5500 ms S3Gen CFM compute, which remains the bulk of
+total wall time.
+
+**The chatterbox-side host envelope is now exhausted.**  Further
+multi-second wins require:
+
+* `ggml-cpu` Q4_0 / Q4_K kernel-level optimisation (out of scope
+  for chatterbox.cpp);
+* Quantisation changes (IQ4_NL, Q3, etc. — orthogonal);
+* `--cfm-steps` reduction at quality cost (already plumbed; cuts
+  CFM compute proportionally);
+* CFG removal at the synthesis level (default `cfg_weight=0`
+  already supported).
+
+No public-API change.

From eadf88f551049a1aed24510886d9d2d7da98d91f Mon Sep 17 00:00:00 2001
From: Zbigniew Herman <zbigniew.herman@tether.io>
Date: Wed, 6 May 2026 16:44:31 +0200
Subject: [PATCH 6/6] round 5 of optimizations

---
 src/chatterbox_tts.cpp          | 119 ++++++++++++++++++++++++++------
 src/chatterbox_tts_test_hooks.h |  20 ++++++
 src/test_cpu_caches.cpp         |  55 ++++++++++++++-
 3 files changed, 169 insertions(+), 25 deletions(-)

diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp
index 9f26fb2..b372078 100644
--- a/src/chatterbox_tts.cpp
+++ b/src/chatterbox_tts.cpp
@@ -582,6 +582,24 @@ static std::unordered_map<const ggml_tensor *, std::vector<float>>       g_inv_a
 static std::unordered_map<int, std::vector<float>>                       g_hann_window_cache;
 static std::unordered_map<int, std::vector<float>>                       g_istft_kernel_cache;
 static std::unordered_map<int64_t, std::vector<float>>                   g_window_sum_cache;
+
+// Round 5 (PROGRESS.md §3.36): STFT graph + analysis-kernel caches.
+// `run_stft` runs once per synth as part of the HiFT path (between
+// SineGen and the HiFT decoder).  Both the graph and the analysis
+// kernel were rebuilt every synth in the un-optimised path; caching
+// them eliminates a 4 MB context buffer + ggml_init + graph build +
+// gallocator alloc cycle per synth, plus the small hann × trig
+// build inside `build_stft_kernel`.
+//
+// Keying:
+//   * g_stft_graph_cache.key = T_src (= T_mel × 480 in chatterbox).
+//     Streaming chunks of varying length still produce correct output
+//     — the cache rebuilds when its key diverges.
+//   * g_stft_kernel_cache key = n_fft (int).  Constant 16 in the
+//     chatterbox HiFT path; tiny per-build cost (288 floats) but
+//     pure waste across synths.
+static graph_cache                                                       g_stft_graph_cache;
+static std::unordered_map<int, std::vector<float>>                       g_stft_kernel_cache;
 }  // namespace
 
 // Cached F32 mirror of a model tensor.  Returns a pointer into the
@@ -623,6 +641,7 @@ static void s3gen_release_synth_caches() {
     g_encoder_graph_cache.destroy();
     g_hift_graph_cache.destroy();
     g_f0_graph_cache.destroy();
+    g_stft_graph_cache.destroy();
     g_hift_inv_alpha_entries.clear();
     g_time_mlp_results.clear();
     g_time_emb_results.clear();
@@ -632,6 +651,7 @@ static void s3gen_release_synth_caches() {
     g_hann_window_cache.clear();
     g_istft_kernel_cache.clear();
     g_window_sum_cache.clear();
+    g_stft_kernel_cache.clear();
 }
 
 // ============================================================================
@@ -1660,6 +1680,23 @@ static const std::vector<float> & cached_istft_kernel(int n_fft) {
     return it->second;
 }
 
+// QVAC-18422 round 5: cached STFT analysis kernel.  Pure function of
+// n_fft (constant 16 in chatterbox HiFT) and the cached hann window.
+// Per-build cost is small (288 floats; trig + window scaling) but
+// rebuilding it every synth is pointless waste.  Keyed identically
+// to `cached_istft_kernel`; both share `g_synth_caches_mu`.
+static const std::vector<float> & cached_stft_kernel(int n_fft) {
+    {
+        std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+        auto it = g_stft_kernel_cache.find(n_fft);
+        if (it != g_stft_kernel_cache.end()) return it->second;
+    }
+    auto k = build_stft_kernel(n_fft, cached_hann_window(n_fft));
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    auto [it, inserted] = g_stft_kernel_cache.try_emplace(n_fft, std::move(k));
+    return it->second;
+}
+
 static const std::vector<float> & cached_window_sum(int T_stft, int n_fft, int hop) {
     // Pack (n_fft, hop, T_stft) into a single int64 key — n_fft and
     // hop are constants on the chatterbox path but encoding them
@@ -1821,36 +1858,60 @@ static std::vector<float> sinegen_source(const std::vector<float> & f0_wav, int
 }
 
 // STFT (time-domain source -> spec)
+//
+// QVAC-18422 round 5: graph + analysis kernel cached process-wide via
+// g_stft_graph_cache (keyed on T_src) and g_stft_kernel_cache (keyed on
+// n_fft).  Streaming chunks of varying length still produce correct
+// output — the graph cache rebuilds when its T_src diverges; the n_fft-
+// keyed kernel cache stays at one entry across all chunks because n_fft
+// is constant in the chatterbox HiFT path.  Lifecycle is identical to
+// the round-2 graph caches: invalidated together by
+// s3gen_release_synth_caches() before ggml_backend_free, so the cached
+// gallocator releases against a still-valid backend on backend swap or
+// s3gen_unload().
 static std::vector<float> run_stft(const model_ctx & m, const std::vector<float> & src) {
     const int n_fft = 16, hop = 4;
     const int F = n_fft / 2 + 1;
     int T_src = (int)src.size();
-    auto window = build_hann_window(n_fft, true);
-    auto kernel = build_stft_kernel(n_fft, window);
 
-    static size_t buf_size = 4 * 1024 * 1024;
-    std::vector<uint8_t> buf(buf_size);
-    ggml_init_params gp = { buf_size, buf.data(), true };
-    ggml_context * ctx = ggml_init(gp);
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
-    ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_src, 1);
-    ggml_set_name(s, "s"); ggml_set_input(s);
-    ggml_tensor * s_pad = reflect_pad_1d(ctx, s, n_fft/2, n_fft/2);
-    ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_fft, 1, 2*F);
-    ggml_set_name(k, "k"); ggml_set_input(k);
-    ggml_tensor * spec = conv1d_f32(ctx, k, s_pad, hop, 0, 1);
-    ggml_set_name(spec, "out"); ggml_set_output(spec);
-    ggml_build_forward_expand(gf, spec);
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
-    ggml_gallocr_reserve(allocr, gf);
-    ggml_gallocr_alloc_graph(allocr, gf);
-    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "s"), src.data(), 0, src.size()*sizeof(float));
-    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "k"), kernel.data(), 0, kernel.size()*sizeof(float));
-    compute(m.backend, gf);
+    const std::vector<float> & kernel = cached_stft_kernel(n_fft);
+
+    graph_cache & cache = g_stft_graph_cache;
+    const bool build_graph = (cache.key != (int64_t) T_src) || (cache.ctx == nullptr);
+    if (build_graph) {
+        if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; }
+        if (cache.ctx)    { ggml_free(cache.ctx);            cache.ctx    = nullptr; }
+        // Reuse `buf` across rebuilds — keeping it allocated avoids a
+        // 4 MB malloc when streaming chunks rotate through varying T_src
+        // values.  graph_cache::destroy() preserves the buf reservation.
+        cache.buf.resize(4 * 1024 * 1024);
+        ggml_init_params gp = { cache.buf.size(), cache.buf.data(), true };
+        cache.ctx = ggml_init(gp);
+        cache.gf  = ggml_new_graph_custom(cache.ctx, 8192, false);
+        cache.key = (int64_t) T_src;
+
+        ggml_tensor * s = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, T_src, 1);
+        ggml_set_name(s, "s"); ggml_set_input(s);
+        ggml_tensor * s_pad = reflect_pad_1d(cache.ctx, s, n_fft/2, n_fft/2);
+        ggml_tensor * k = ggml_new_tensor_3d(cache.ctx, GGML_TYPE_F32, n_fft, 1, 2*F);
+        ggml_set_name(k, "k"); ggml_set_input(k);
+        ggml_tensor * spec = conv1d_f32(cache.ctx, k, s_pad, hop, 0, 1);
+        ggml_set_name(spec, "out"); ggml_set_output(spec);
+        ggml_build_forward_expand(cache.gf, spec);
+
+        cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
+        ggml_gallocr_reserve(cache.allocr, cache.gf);
+    }
+
+    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    ggml_backend_tensor_set(ggml_graph_get_tensor(cache.gf, "s"),
+                            src.data(), 0, src.size() * sizeof(float));
+    ggml_backend_tensor_set(ggml_graph_get_tensor(cache.gf, "k"),
+                            kernel.data(), 0, kernel.size() * sizeof(float));
+    compute(m.backend, cache.gf);
+    ggml_tensor * spec = ggml_graph_get_tensor(cache.gf, "out");
     std::vector<float> out(ggml_nelements(spec));
     ggml_backend_tensor_get(spec, out.data(), 0, ggml_nbytes(spec));
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
     return out;
 }
 
@@ -2923,6 +2984,18 @@ size_t hann_window_cache_size() {
     std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_hann_window_cache.size();
 }
+bool stft_graph_cache_built() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_stft_graph_cache.ctx != nullptr;
+}
+int stft_graph_cache_T_src() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return (int) g_stft_graph_cache.key;
+}
+size_t stft_kernel_cache_size() {
+    std::lock_guard<std::mutex> lk(g_synth_caches_mu);
+    return g_stft_kernel_cache.size();
+}
 size_t window_sum_cache_size() {
     std::lock_guard<std::mutex> lk(g_synth_caches_mu);
     return g_window_sum_cache.size();
diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h
index 9920595..1115e35 100644
--- a/src/chatterbox_tts_test_hooks.h
+++ b/src/chatterbox_tts_test_hooks.h
@@ -94,6 +94,26 @@ size_t istft_kernel_cache_size();
 size_t hann_window_cache_size();
 size_t window_sum_cache_size();
 
+// ---------- Round 5 (PROGRESS.md §3.36): STFT graph + kernel caches ---
+//
+// `run_stft` (called once per synth from the HiFT path, between
+// SineGen output and the HiFT decoder) used to allocate a fresh
+// 4 MB context buffer + ggml_gallocator + backend buffer + build a
+// fresh conv1d graph every synth.  The graph topology depends on
+// T_src (= T_mel × 480), so it must rebuild when streaming chunks
+// change length.  The forward STFT analysis kernel `build_stft_kernel`
+// is a pure function of n_fft (constant 16 in the chatterbox path)
+// and depends on `cached_hann_window(n_fft)` — caching it eliminates
+// the per-synth 288-element trig + window build.
+//
+// Wired into the same s3gen_release_synth_caches() teardown as the
+// other graph caches, so backend swap / s3gen_unload() leaves no
+// dangling gallocator pointing at a freed backend.
+
+bool   stft_graph_cache_built();
+int    stft_graph_cache_T_src();
+size_t stft_kernel_cache_size();
+
 // ---------- Round 4 (PROGRESS.md §3.35): T3 step-graph cache ---------
 //
 // MTL-only.  Caches the per-(n_past, is_uncond) graph that
diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp
index 0e01e97..29ad8a1 100644
--- a/src/test_cpu_caches.cpp
+++ b/src/test_cpu_caches.cpp
@@ -188,6 +188,14 @@ void test_initial_state() {
           "HiFT hann_window cache must start empty");
     CHECK(th::window_sum_cache_size() == 0,
           "HiFT window_sum cache must start empty");
+
+    // Round 5: STFT graph + analysis-kernel caches.
+    CHECK(!th::stft_graph_cache_built(),
+          "STFT graph cache must not be built before any synth");
+    CHECK(th::stft_graph_cache_T_src() == -1,
+          "STFT graph cache T_src must be -1 (sentinel) before any build");
+    CHECK(th::stft_kernel_cache_size() == 0,
+          "STFT analysis kernel cache must start empty");
 }
 
 // ---------------- 3. determinism + cache wiring on a real synth ----------
@@ -260,6 +268,9 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
     const size_t n_istft_after_a    = th::istft_kernel_cache_size();
     const size_t n_hann_after_a     = th::hann_window_cache_size();
     const size_t n_wsum_after_a     = th::window_sum_cache_size();
+    const bool   stft_built_after_a = th::stft_graph_cache_built();
+    const int    stft_Tsrc_after_a  = th::stft_graph_cache_T_src();
+    const size_t n_stft_kern_after_a = th::stft_kernel_cache_size();
 
     CHECK(cfm_built_after_a,
           "after first synth, persistent cfm_estimator_cache must be built");
@@ -309,10 +320,21 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
           "after first synth, window_sum cache must have exactly 1 entry; "
           "saw %zu", n_wsum_after_a);
 
+    // Round 5: STFT graph + analysis-kernel caches.
+    CHECK(stft_built_after_a,
+          "after first synth, persistent STFT graph cache must be built");
+    CHECK(stft_Tsrc_after_a > 0,
+          "after first synth, STFT graph cache T_src must be > 0 (saw %d)",
+          stft_Tsrc_after_a);
+    CHECK(n_stft_kern_after_a == 1,
+          "after first synth, STFT analysis kernel cache must have exactly 1 "
+          "entry (keyed by n_fft); saw %zu", n_stft_kern_after_a);
+
     fprintf(stderr,
             "  synth #1: time_mlp=%zu time_emb=%zu weights=%zu cfm=%s "
             "enc=%s(T=%d) hift=%s(T_mel=%d,T_stft=%d) f0=%s(T_mel=%d) "
-            "pos_emb=%zu inv_alpha=%zu istft=%zu hann=%zu wsum=%zu (%.1f ms)\n",
+            "pos_emb=%zu inv_alpha=%zu istft=%zu hann=%zu wsum=%zu "
+            "stft=%s(T_src=%d) stft_kern=%zu (%.1f ms)\n",
             n_time_mlp_after_a, n_time_emb_after_a, n_weights_after_a,
             cfm_built_after_a ? "built" : "fresh",
             enc_built_after_a ? "built" : "fresh", enc_T_after_a,
@@ -320,7 +342,9 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
             hift_Tmel_after_a, hift_Tstft_after_a,
             f0_built_after_a ? "built" : "fresh", f0_Tmel_after_a,
             n_pos_emb_after_a, n_inv_alpha_after_a,
-            n_istft_after_a, n_hann_after_a, n_wsum_after_a, t_a);
+            n_istft_after_a, n_hann_after_a, n_wsum_after_a,
+            stft_built_after_a ? "built" : "fresh", stft_Tsrc_after_a,
+            n_stft_kern_after_a, t_a);
 
     // Second call: every cache must already be warm.  Its size must
     // not grow because the t-schedule and the model weights are
@@ -369,6 +393,15 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
           "synth #2 must NOT add new hann_window entries");
     CHECK(th::window_sum_cache_size()   == n_wsum_after_a,
           "synth #2 must NOT add new window_sum entries");
+    CHECK(th::stft_graph_cache_built() &&
+          th::stft_graph_cache_T_src() == stft_Tsrc_after_a,
+          "synth #2 must keep the STFT graph built with the same T_src "
+          "(was %d, now built=%d, T_src=%d)",
+          stft_Tsrc_after_a,
+          th::stft_graph_cache_built() ? 1 : 0,
+          th::stft_graph_cache_T_src());
+    CHECK(th::stft_kernel_cache_size() == n_stft_kern_after_a,
+          "synth #2 must NOT add new STFT analysis kernel entries");
 
     CHECK(wav_a.size() == wav_b.size(),
           "warm-cache synth #2 wav length must match cold-cache synth #1 "
@@ -418,6 +451,12 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf,
           "s3gen_unload must clear hann_window cache");
     CHECK(th::window_sum_cache_size() == 0,
           "s3gen_unload must clear window_sum cache");
+    CHECK(!th::stft_graph_cache_built(),
+          "s3gen_unload must tear down the STFT graph cache");
+    CHECK(th::stft_graph_cache_T_src() == -1,
+          "s3gen_unload must reset STFT graph cache T_src to sentinel -1");
+    CHECK(th::stft_kernel_cache_size() == 0,
+          "s3gen_unload must clear STFT analysis kernel cache");
 
     // Idempotent: a second unload must not crash or produce errors.
     s3gen_unload();
@@ -538,6 +577,7 @@ void test_streaming_shape_invalidation(const std::string & gguf,
     const int enc_T_chunk1     = th::encoder_graph_cache_T();
     const int hift_Tmel_chunk1 = th::hift_graph_cache_T_mel();
     const int f0_Tmel_chunk1   = th::f0_graph_cache_T_mel();
+    const int stft_Tsrc_chunk1 = th::stft_graph_cache_T_src();
 
     // Chunk #2 — longer token sequence (different shape).  All the
     // graph caches must rebuild, the t-schedule + weight + scaffolding
@@ -554,6 +594,7 @@ void test_streaming_shape_invalidation(const std::string & gguf,
     const int enc_T_chunk2     = th::encoder_graph_cache_T();
     const int hift_Tmel_chunk2 = th::hift_graph_cache_T_mel();
     const int f0_Tmel_chunk2   = th::f0_graph_cache_T_mel();
+    const int stft_Tsrc_chunk2 = th::stft_graph_cache_T_src();
 
     CHECK(enc_T_chunk1 != enc_T_chunk2,
           "encoder graph cache T must change between chunks of different "
@@ -565,6 +606,10 @@ void test_streaming_shape_invalidation(const std::string & gguf,
     CHECK(f0_Tmel_chunk1 != f0_Tmel_chunk2,
           "F0 graph cache T_mel must change between chunks (chunk1=%d, "
           "chunk2=%d)", f0_Tmel_chunk1, f0_Tmel_chunk2);
+    CHECK(stft_Tsrc_chunk1 != stft_Tsrc_chunk2,
+          "STFT graph cache T_src must change between chunks of different "
+          "lengths (chunk1 T_src=%d, chunk2 T_src=%d)",
+          stft_Tsrc_chunk1, stft_Tsrc_chunk2);
     CHECK(th::encoder_graph_cache_built(),
           "encoder graph cache must remain built after shape change "
           "(rebuilt for new T)");
@@ -572,6 +617,12 @@ void test_streaming_shape_invalidation(const std::string & gguf,
           "HiFT graph cache must remain built after shape change");
     CHECK(th::f0_graph_cache_built(),
           "F0 graph cache must remain built after shape change");
+    CHECK(th::stft_graph_cache_built(),
+          "STFT graph cache must remain built after shape change "
+          "(rebuilt for new T_src)");
+    CHECK(th::stft_kernel_cache_size() == 1,
+          "STFT analysis kernel cache must stay at exactly 1 entry across "
+          "chunks (n_fft is constant); got %zu", th::stft_kernel_cache_size());
     fprintf(stderr,
             "  chunk #1: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n"
             "  chunk #2: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n",