From 2ead7edcf53d6e52b12231eabbd092eb2d42200d Mon Sep 17 00:00:00 2001 From: Zbigniew Herman Date: Tue, 5 May 2026 10:08:22 +0200 Subject: [PATCH 1/6] QVAC-18422 [TTS GGML] Optimize cpp backend multilingual for CPU --- CMakeLists.txt | 11 + src/chatterbox_tts.cpp | 100 ++++++++- src/chatterbox_tts_test_hooks.h | 66 ++++++ src/test_cpu_caches.cpp | 366 ++++++++++++++++++++++++++++++++ 4 files changed, 534 insertions(+), 9 deletions(-) create mode 100644 src/chatterbox_tts_test_hooks.h create mode 100644 src/test_cpu_caches.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6401476..8c01ff7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,6 +205,17 @@ if (TTS_CPP_BUILD_TESTS) target_link_libraries(test-streaming PRIVATE ggml) target_include_directories(test-streaming PRIVATE ggml/include src include) + # CPU-side persistent-cache validation (QVAC-18422). + # Exercises the time_mlp / time_emb / cfm_estimator / weight_mirror + # caches that amortise per-synth overhead on the multilingual CPU + # path. Links the chatterbox_tts.cpp directly so it can reach the + # internal test-hook entrypoints. 
+ add_executable(test-cpu-caches + src/test_cpu_caches.cpp + src/chatterbox_tts.cpp) + target_link_libraries(test-cpu-caches PRIVATE ggml) + target_include_directories(test-cpu-caches PRIVATE ggml/include src include) + add_executable(test-metal-ops src/test_metal_ops.cpp) target_link_libraries(test-metal-ops PRIVATE ggml) target_include_directories(test-metal-ops PRIVATE ggml/include src) diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp index 22c00f6..1bb139c 100644 --- a/src/chatterbox_tts.cpp +++ b/src/chatterbox_tts.cpp @@ -27,6 +27,7 @@ #include "ggml-cpu.h" #include "gguf.h" #include "npy.h" +#include "chatterbox_tts_test_hooks.h" #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -179,6 +180,15 @@ static std::unique_ptr g_s3gen_cache_entry; static double g_s3gen_cache_last_load_ms = 0.0; } // namespace +// Forward declaration: clear all per-synth caches. The persistent +// graph caches (cfm_estimator + time_mlp scaffolding) and the CPU +// weight mirrors are tied to the model's backend, so they must be +// torn down BEFORE ggml_backend_free or the gallocators / backend +// buffers freed there would be released against a dead device. +// +// Defined further down (after cfm_estimator_cache is in scope). +static void s3gen_release_synth_caches(); + // Release any cached model_ctx (frees its backend buffer, ggml context and // backend). Must run before the ggml-metal / ggml-cuda / ggml-vulkan dylib // tears down its static device list; otherwise their static destructors hit @@ -186,6 +196,13 @@ static double g_s3gen_cache_last_load_ms = 0.0; // orphan backend buffer). We register it with atexit() on first cache // insertion so it runs before process-exit dylib finalisers. static void s3gen_model_cache_release() { + // Tear down the per-synth caches first so any gallocrs they hold + // (cfm_estimator_cache::allocr) are freed against the still-alive + // backend, then drop the model. 
Reverse order would crash on + // Vulkan/Metal/CUDA where ggml_gallocr_free against a freed + // backend asserts. + s3gen_release_synth_caches(); + std::lock_guard lk(g_s3gen_cache_mu); // QVAC-17872 round-HIFT + round 2: tear down every persistent host-side // cache BEFORE freeing the backend. The graph caches own @@ -206,16 +223,18 @@ static void s3gen_model_cache_release() { } static model_ctx * s3gen_model_cache_get(const std::string & path, int n_gpu_layers, bool verbose) { - std::lock_guard lk(g_s3gen_cache_mu); - if (g_s3gen_cache_entry && - g_s3gen_cache_entry->path == path && - g_s3gen_cache_entry->gpu == n_gpu_layers) { - if (verbose) { - fprintf(stderr, " %zu tensors (cached — skip GGUF load)\n", - g_s3gen_cache_entry->m->tensors.size()); + { + std::lock_guard lk(g_s3gen_cache_mu); + if (g_s3gen_cache_entry && + g_s3gen_cache_entry->path == path && + g_s3gen_cache_entry->gpu == n_gpu_layers) { + if (verbose) { + fprintf(stderr, " %zu tensors (cached — skip GGUF load)\n", + g_s3gen_cache_entry->m->tensors.size()); + } + g_s3gen_cache_last_load_ms = 0.0; + return g_s3gen_cache_entry->m.get(); } - g_s3gen_cache_last_load_ms = 0.0; - return g_s3gen_cache_entry->m.get(); } // QVAC-17872 round-HIFT + round 2: backend swap (different path or // n_gpu_layers). Tear down every persistent cache against the OLD @@ -1244,6 +1263,20 @@ static const float * cached_cpu_weights_f32(const ggml_tensor * t) { } } +// QVAC-18422: bit-cast cache key helpers used by the test-hooks bridge +// to query g_time_mlp_results / g_time_emb_results without re-deriving +// the (uint32_t / uint64_t) keys that compute_time_mlp_cached and +// compute_time_emb_cached compute inline above. Defined here so the +// test_hooks namespace at the bottom of the file can call them. 
+static uint32_t g_float_bits(float t_val) { + uint32_t bits; + std::memcpy(&bits, &t_val, sizeof(bits)); + return bits; +} +static uint64_t g_float_pair_bits(float t_val, float r_val) { + return ((uint64_t) g_float_bits(t_val) << 32) | (uint64_t) g_float_bits(r_val); +} + // QVAC-17872 round 2: definition of s3gen_release_synth_caches (forward- // declared near s3gen_model_cache_release). Defined here once the // graph_cache + cfm_estimator_cache structs and globals are all visible. @@ -2779,3 +2812,52 @@ int s3gen_preload(const std::string & s3gen_gguf_path, int n_gpu_layers) { void s3gen_unload() { s3gen_model_cache_release(); } + +// ============================================================================ +// QVAC-18422 — internal test hooks +// ============================================================================ +// +// Implementations of the read-only cache-state queries declared in +// chatterbox_tts_test_hooks.h. Defined here so they sit in the same +// translation unit as the caches themselves and don't need any extra +// linkage gymnastics. + +namespace tts_cpp::chatterbox::test_hooks { + +size_t time_mlp_result_cache_size() { + std::lock_guard lk(g_time_emb_results_mu); + return g_time_mlp_results.size(); +} +size_t time_emb_result_cache_size() { + std::lock_guard lk(g_time_emb_results_mu); + return g_time_emb_results.size(); +} +size_t weight_mirror_cache_size() { + std::lock_guard lk(g_weight_cpu_mirror_mu); + return g_weight_cpu_mirror.size(); +} +bool cfm_estimator_cache_built() { + // g_cfm_estimator_cache is mutated only under s3gen_release_synth_caches + // (which holds g_synth_caches_mu around the round-2 caches but not this + // one) and during the per-synth fast-path inside cfm_estimator_forward. + // The load below takes no lock, so it is only a reliable snapshot + // while no synth or unload is running on another thread.
+ return g_cfm_estimator_cache.ctx != nullptr; +} +bool cfm_estimator_cache_b2() { + return g_cfm_estimator_cache.b2; +} +uint32_t float_cache_key(float t_val) { + return g_float_bits(t_val); +} +uint64_t float_pair_cache_key(float t_val, float r_val) { + return g_float_pair_bits(t_val, r_val); +} +std::vector peek_time_mlp_cached(float t_val) { + std::lock_guard lk(g_time_emb_results_mu); + auto it = g_time_mlp_results.find(g_float_bits(t_val)); + if (it == g_time_mlp_results.end()) return {}; + return it->second; +} + +} // namespace tts_cpp::chatterbox::test_hooks diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h new file mode 100644 index 0000000..c9fdb91 --- /dev/null +++ b/src/chatterbox_tts_test_hooks.h @@ -0,0 +1,66 @@ +// Internal test hooks for chatterbox_tts.cpp's CPU optimisation caches. +// +// These declarations let tests in src/test_*.cpp inspect cache state that is +// otherwise file-static. They are deliberately NOT included in +// include/tts-cpp/chatterbox/s3gen_pipeline.h because production callers must +// not depend on cache layout. +// +// The hooks are populated by the persistent-cache work landed for QVAC-18422 +// (CPU-side multilingual perf): see PROGRESS.md §3.32. +// +// Rules: +// - Read-only. Tests must NOT mutate cache state via these hooks; use +// the public s3gen_unload() helper if a clean slate is required. +// - Locking is internal. The size/peek hooks acquire the same mutex used by +// the cache writers and may briefly block; the cfm_estimator_cache_*() +// hooks read without a lock, so call those only while no synth is running. +// - Stable across the QVAC-18422 series. Adding new caches must add new +// hooks rather than reshape existing ones. + +#pragma once + +#include +#include +#include + +namespace tts_cpp::chatterbox::test_hooks { + +// Number of (t_val) entries in the time_mlp result cache populated lazily +// by compute_time_mlp_cached().
Multilingual = up to n_timesteps + 1 +// distinct t-values per process; Turbo = up to 3 (t_span = [0, 0.5, 1]). +size_t time_mlp_result_cache_size(); + +// Number of ((t_val, r_val)) entries in the time_mixed result cache used +// only by the Turbo meanflow path. Multilingual never populates this. +size_t time_emb_result_cache_size(); + +// Number of ggml_tensor* entries in the CPU weight mirror cache. +// Populated by cached_cpu_weights_f32(); covers flow/input_embedding + +// spk_embed_affine/{w,b} + any other weight that synthesize() reads via +// ggml_backend_tensor_get on the hot path. +size_t weight_mirror_cache_size(); + +// True iff the persistent (global) cfm_estimator_cache currently holds +// a built graph. Initially false; flips to true after the first call to +// cfm_estimator_forward() and stays true until s3gen_unload(). +bool cfm_estimator_cache_built(); + +// Returns true iff the persistent cfm_estimator_cache last built a B=2 +// (CFG cond+uncond batched) graph. Always false on CPU because the +// CPU code path keeps use_b2 = false; useful for verifying that future +// edits don't accidentally flip CPU into the B=2 path. +bool cfm_estimator_cache_b2(); + +// Cache key generators — exposed so tests can verify the hashing rules +// for floats (bit-cast into uint32_t / uint64_t). Important because +// std::hash mishandles -0.0 / +0.0 and NaN inconsistently across +// libstdc++/libc++. +uint32_t float_cache_key(float t_val); +uint64_t float_pair_cache_key(float t_val, float r_val); + +// Returns the cached time_mlp output for `t_val` if present, or an +// empty vector if there's no entry. Lets tests probe whether a given +// t-value was actually warmed without re-entering compute_time_mlp. 
+std::vector peek_time_mlp_cached(float t_val); + +} // namespace tts_cpp::chatterbox::test_hooks diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp new file mode 100644 index 0000000..47e0d29 --- /dev/null +++ b/src/test_cpu_caches.cpp @@ -0,0 +1,366 @@ +// CPU-side persistent-cache validation harness for QVAC-18422 +// "[TTS GGML] Optimize cpp backend multilingual for CPU". +// +// Verifies the four cache layers added to chatterbox_tts.cpp: +// +// 1. compute_time_mlp_cached() — t_val (float) → (1024,) t_emb vector. +// Multilingual fires 10 distinct t-values per synth (cosine schedule); +// Turbo fires 3. Across synth calls the schedule is constant, so the +// cache amortises every subsequent synth to zero compute_time_mlp work. +// +// 2. compute_time_emb_cached() — (t_val, r_val) → (1024,) mixed embedding. +// Turbo meanflow only; multilingual leaves this cache empty. +// +// 3. g_cfm_estimator_cache — promotes the local-scope cfm_estimator_cache +// to global lifetime so subsequent synth calls don't rebuild the +// ~5500-node CFM graph or pay the gallocr_reserve cost. +// +// 4. g_weight_cpu_mirror — CPU mirror of large per-synth weight reads +// (flow/input_embedding ~28 MB on multilingual, spk_embed_affine +// ~60 KB). Saves the ggml_backend_tensor_get round-trip every synth. +// +// All caches are invalidated together by s3gen_unload() so that switching +// to a different backend (e.g. CPU → Vulkan) doesn't reuse stale state. +// +// Usage (with model) : ./test-cpu-caches MODEL_S3GEN.gguf [REF_DIR] +// Usage (cache-key only) : ./test-cpu-caches +// +// Without a GGUF the harness still runs the lightweight cache-key tests +// that catch the typical -0/+0/NaN / std::hash portability traps. 
+ +#include "tts-cpp/chatterbox/s3gen_pipeline.h" +#include "chatterbox_tts_test_hooks.h" +#include "npy.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace th = tts_cpp::chatterbox::test_hooks; + +namespace { + +int g_failures = 0; +int g_checks = 0; + +#define CHECK(cond, ...) do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_failures; \ + fprintf(stderr, "FAIL %s:%d %s\n ", \ + __FILE__, __LINE__, #cond); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ +} while (0) + +bool path_exists(const std::string & p) { + struct stat st; return ::stat(p.c_str(), &st) == 0; +} + +double now_ms() { + using clock = std::chrono::steady_clock; + return std::chrono::duration( + clock::now().time_since_epoch()).count(); +} + +// ---------------- 1. cache-key bit-cast tests ---------------- +// +// These run unconditionally — no model needed. They guard the rule +// that the time_mlp result cache uses a bit-cast hash of the float +// (so +0/-0 land in different buckets, NaNs are stable per-bit-pattern, +// and equal floats always hash to the same bucket regardless of how +// they were computed). + +void test_cache_keys() { + fprintf(stderr, "=== cache key (bit-cast) tests ===\n"); + + // Equal floats → equal keys. + CHECK(th::float_cache_key(0.5f) == th::float_cache_key(0.5f), + "0.5 should be stable"); + + // +0.0 and -0.0 are NOT equal under bit-cast (sign bit differs). + // std::hash typically collapses them — we deliberately don't. + const float pos_zero = 0.0f; + const float neg_zero = -0.0f; + CHECK(th::float_cache_key(pos_zero) != th::float_cache_key(neg_zero), + "+0 and -0 must produce distinct cache keys"); + + // Distinct values → distinct keys (sanity). + CHECK(th::float_cache_key(0.5f) != th::float_cache_key(0.25f), + "0.5 vs 0.25 must differ"); + + // NaN: bit-pattern stable (we don't normalise) — same NaN payload + // hashes the same. 
This is fine because the time_mlp_cache is + // only ever queried with t_span values, none of which are NaN. + uint32_t nan_bits = 0x7fc00001u; // a quiet NaN + float nan_val; + std::memcpy(&nan_val, &nan_bits, sizeof(float)); + CHECK(th::float_cache_key(nan_val) == 0x7fc00001u, + "NaN bit pattern must round-trip"); + + // Pair key: high 32 bits = t_val, low 32 bits = r_val. + const float t = 0.5f; + const float r = 1.0f; + const uint64_t expect = + ((uint64_t) th::float_cache_key(t) << 32) | + (uint64_t) th::float_cache_key(r); + CHECK(th::float_pair_cache_key(t, r) == expect, + "pair key must compose from individual float keys"); + + // Order matters: (t, r) ≠ (r, t). + CHECK(th::float_pair_cache_key(0.5f, 1.0f) != + th::float_pair_cache_key(1.0f, 0.5f), + "pair key must not be commutative"); + + // Cosine schedule used by multilingual (n_timesteps=10) — verify + // 10 distinct keys. Mirrors the t_span = 1 - cos(i/10 * pi/2) loop + // in s3gen_synthesize_to_wav. + std::vector keys; + keys.reserve(10); + for (int i = 0; i < 10; ++i) { + float tau = (float) i / 10.0f; + float t_cos = 1.0f - std::cos(tau * 0.5f * (float) M_PI); + keys.push_back(th::float_cache_key(t_cos)); + } + bool all_distinct = true; + for (size_t i = 0; i < keys.size(); ++i) { + for (size_t j = i + 1; j < keys.size(); ++j) { + if (keys[i] == keys[j]) { all_distinct = false; break; } + } + } + CHECK(all_distinct, + "multilingual t-span (n_timesteps=10 cosine) must produce 10 " + "distinct cache keys, otherwise compute_time_mlp_cached would " + "alias unrelated steps"); +} + +// ---------------- 2. starting cache state ---------------- + +void test_initial_state() { + fprintf(stderr, "=== initial cache state ===\n"); + + // s3gen_unload() before any synth must succeed even if no caches + // were ever populated (idempotent). Production callers in the + // bare-addon teardown rely on this. 
+ s3gen_unload(); + CHECK(th::time_mlp_result_cache_size() == 0, + "time_mlp result cache must start empty"); + CHECK(th::time_emb_result_cache_size() == 0, + "time_emb result cache must start empty"); + CHECK(th::weight_mirror_cache_size() == 0, + "weight mirror cache must start empty"); + CHECK(!th::cfm_estimator_cache_built(), + "persistent cfm_estimator_cache must not be built before any " + "synth"); + CHECK(!th::cfm_estimator_cache_b2(), + "persistent cfm_estimator_cache b2 flag must default false"); +} + +// ---------------- 3. determinism + cache wiring on a real synth ---------- + +// Hard-coded speech-token fixture. No multilingual model available locally, +// so the harness uses the Turbo built-in voice if REF_DIR wasn't +// passed. The cache logic is model-agnostic by construction; the +// multilingual benefit factor is larger but the bit-exact + lifecycle +// invariants this test verifies are identical across variants. +std::vector sample_speech_tokens() { + // 24 tokens — enough to exercise the encoder + a single CFM batch + // without bloating run-time. Values are within [0, 6561) (S3 vocab).
+ return { + 12, 34, 56, 78, 90, 121, 152, 173, 195, 217, 239, 261, + 283, 305, 327, 349, 371, 393, 415, 437, 459, 481, 503, 525, + }; +} + +bool synthesize_once(const std::string & gguf, + const std::string & ref_dir, + std::vector & wav, + double & wall_ms) { + s3gen_synthesize_opts opts; + opts.s3gen_gguf_path = gguf; + opts.ref_dir = ref_dir; + opts.out_wav_path = ""; // stay in-memory + opts.pcm_out = &wav; + opts.seed = 42; + opts.n_threads = 0; // auto: hardware_concurrency + opts.sr = 24000; + opts.verbose = false; + opts.n_gpu_layers = 0; // CPU-only for this test + opts.apply_trim_fade = true; + opts.finalize = true; + + const auto tokens = sample_speech_tokens(); + const double t0 = now_ms(); + int rc = s3gen_synthesize_to_wav(tokens, opts); + wall_ms = now_ms() - t0; + return rc == 0 && !wav.empty(); +} + +void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, + const std::string & ref_dir) { + fprintf(stderr, "=== warm-cache bit-exact + lifecycle ===\n"); + + // First call populates every cache. Subsequent calls must (a) + // produce bit-exact output and (b) skip every cache that was + // already warmed. 
+ std::vector wav_a, wav_b, wav_c; + double t_a = 0, t_b = 0, t_c = 0; + if (!synthesize_once(gguf, ref_dir, wav_a, t_a)) { + fprintf(stderr, "skip: synth #1 failed (model load / arch?)\n"); + return; + } + + const size_t n_time_mlp_after_a = th::time_mlp_result_cache_size(); + const size_t n_time_emb_after_a = th::time_emb_result_cache_size(); + const size_t n_weights_after_a = th::weight_mirror_cache_size(); + const bool cfm_built_after_a = th::cfm_estimator_cache_built(); + + CHECK(cfm_built_after_a, + "after first synth, persistent cfm_estimator_cache must be built"); + CHECK(n_time_mlp_after_a > 0, + "after first synth, time_mlp result cache must have at least one " + "entry (n_timesteps for multilingual / 3 for Turbo)"); + CHECK(n_weights_after_a > 0, + "after first synth, weight_mirror_cache must have at least one " + "entry (input_embedding + spk_embed_affine/{w,b})"); + fprintf(stderr, + " synth #1: time_mlp=%zu time_emb=%zu weights=%zu cfm=%s " + "(%.1f ms)\n", + n_time_mlp_after_a, n_time_emb_after_a, n_weights_after_a, + cfm_built_after_a ? "built" : "fresh", t_a); + + // Second call: every cache must already be warm. Its size must + // not grow because the t-schedule and the model weights are + // constant across synth calls. 
+ if (!synthesize_once(gguf, ref_dir, wav_b, t_b)) { + fprintf(stderr, "skip: synth #2 failed\n"); + return; + } + CHECK(th::time_mlp_result_cache_size() == n_time_mlp_after_a, + "synth #2 must NOT add new time_mlp entries (saw %zu, expected %zu)", + th::time_mlp_result_cache_size(), n_time_mlp_after_a); + CHECK(th::time_emb_result_cache_size() == n_time_emb_after_a, + "synth #2 must NOT add new time_emb entries"); + CHECK(th::weight_mirror_cache_size() == n_weights_after_a, + "synth #2 must NOT add new weight_mirror entries"); + CHECK(th::cfm_estimator_cache_built(), + "synth #2 must keep the persistent cfm graph built"); + + CHECK(wav_a.size() == wav_b.size(), + "warm-cache synth #2 wav length must match cold-cache synth #1 " + "(%zu vs %zu)", wav_a.size(), wav_b.size()); + if (wav_a.size() == wav_b.size() && !wav_a.empty()) { + size_t diff = 0; + float max_abs = 0; + for (size_t i = 0; i < wav_a.size(); ++i) { + float d = std::fabs(wav_a[i] - wav_b[i]); + if (d > 0) diff++; + if (d > max_abs) max_abs = d; + } + CHECK(diff == 0, + "warm-cache synth #2 must be byte-for-byte identical to " + "synth #1 (mismatched samples=%zu, max_abs=%.6e)", diff, max_abs); + } + fprintf(stderr, " synth #2: %.1f ms (warm caches, bit-exact ok)\n", t_b); + + // Third call after s3gen_unload() — every cache must have been + // reset. Subsequent synth must repopulate them and still + // produce bit-exact output (deterministic seed=42). + s3gen_unload(); + CHECK(th::time_mlp_result_cache_size() == 0, + "s3gen_unload must clear time_mlp result cache"); + CHECK(th::time_emb_result_cache_size() == 0, + "s3gen_unload must clear time_emb result cache"); + CHECK(th::weight_mirror_cache_size() == 0, + "s3gen_unload must clear weight_mirror cache"); + CHECK(!th::cfm_estimator_cache_built(), + "s3gen_unload must tear down the persistent cfm cache"); + + // Idempotent: a second unload must not crash or produce errors. 
+ s3gen_unload(); + + if (!synthesize_once(gguf, ref_dir, wav_c, t_c)) { + fprintf(stderr, "skip: synth #3 (post-unload) failed\n"); + return; + } + CHECK(th::cfm_estimator_cache_built(), + "synth #3 must rebuild the cfm cache after unload"); + CHECK(wav_a.size() == wav_c.size(), + "post-unload synth wav length must match"); + if (wav_a.size() == wav_c.size() && !wav_a.empty()) { + size_t diff = 0; + float max_abs = 0; + for (size_t i = 0; i < wav_a.size(); ++i) { + float d = std::fabs(wav_a[i] - wav_c[i]); + if (d > 0) diff++; + if (d > max_abs) max_abs = d; + } + CHECK(diff == 0, + "post-unload synth must be byte-for-byte identical to first " + "synth (mismatched samples=%zu, max_abs=%.6e)", + diff, max_abs); + } + fprintf(stderr, " synth #3 (post-unload): %.1f ms — bit-exact ok\n", t_c); + + // peek_time_mlp_cached: warm value should round-trip. + auto cosine_t = [](int i, int n) { + float tau = (float) i / (float) n; + return 1.0f - std::cos(tau * 0.5f * (float) M_PI); + }; + // For Turbo (meanflow=true, n_timesteps=2) the schedule is linear: + // [0, 0.5, 1.0]. For multilingual (cosine, n_timesteps=10) the + // schedule is cosine. We probe both candidates non-destructively; + // at least one of {0.5f, cosine_t(1,10)} should be present. 
+ auto a = th::peek_time_mlp_cached(0.5f); + auto b = th::peek_time_mlp_cached(cosine_t(1, 10)); + CHECK(!a.empty() || !b.empty(), + "peek_time_mlp_cached must return a populated entry for at least " + "one of the canonical t-values (0.5 for Turbo or cosine[1] for " + "multilingual)"); + if (!a.empty()) { + CHECK(a.size() == 1024, + "time_mlp cached entry must be (1024,) — saw %zu", a.size()); + } + if (!b.empty()) { + CHECK(b.size() == 1024, + "time_mlp cached entry must be (1024,) — saw %zu", b.size()); + } +} + +} // namespace + +int main(int argc, char ** argv) { + fprintf(stderr, "test-cpu-caches: QVAC-18422 cache validation\n"); + + test_cache_keys(); + test_initial_state(); + + if (argc < 2) { + fprintf(stderr, "\n(no GGUF given — skipping warm-cache + lifecycle " + "tests; run as `%s MODEL.gguf [REF_DIR]` to exercise " + "the full pipeline)\n", argv[0]); + } else { + const std::string gguf = argv[1]; + const std::string ref_dir = (argc >= 3 ? argv[2] : ""); + if (!path_exists(gguf)) { + fprintf(stderr, "error: GGUF not found at %s\n", gguf.c_str()); + return 2; + } + test_warm_cache_bit_exact_and_lifecycle(gguf, ref_dir); + } + + // Always release at exit so the next test invocation starts clean. + s3gen_unload(); + + fprintf(stderr, "\n=== summary ===\n checks: %d\n failures: %d\n", + g_checks, g_failures); + return g_failures == 0 ? 0 : 1; +} From 4d18b37dfe0ff9d99d887541d60dc57aab7a6d81 Mon Sep 17 00:00:00 2001 From: Zbigniew Herman Date: Tue, 5 May 2026 13:03:21 +0200 Subject: [PATCH 2/6] QVAC-18422 [TTS GGML] Optimize cpp backend multilingual for CPU (round 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PROGRESS.md §3.33 — persistent encoder/HiFT/F0 graph caches + pos_emb / inv_alpha / hann_window / istft_kernel / window_sum scaffolding caches on top of the round-1 CFM caches (§3.32). Turbo single-utterance S3GEN_INFER_MS -22 %, streaming wall -27 %. Tests: 79/79 pass (49 new round-2 checks). 
--- src/chatterbox_tts.cpp | 652 +++++++++++++++++--------------- src/chatterbox_tts_test_hooks.h | 31 ++ src/test_cpu_caches.cpp | 241 +++++++++++- 3 files changed, 616 insertions(+), 308 deletions(-) diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp index 1bb139c..afd87e4 100644 --- a/src/chatterbox_tts.cpp +++ b/src/chatterbox_tts.cpp @@ -443,6 +443,197 @@ static ggml_tensor * reflect_pad_1d(ggml_context * ctx, ggml_tensor * x, int p_l return y; } +// ============================================================================ +// QVAC-18422 — CPU-side persistent caches (multilingual TTS optimisation) +// ============================================================================ +// +// Round 1 (already shipped above) targeted three host-side bottlenecks: +// (a) compute_time_mlp graph submissions (10× / synth on multilingual) +// (b) the local-scope cfm_estimator_cache rebuild on every synth +// (c) per-synth ggml_backend_tensor_get of the 13–28 MB +// flow/input_embedding + the speaker affine matrices +// +// Round 2 closes the gap between "host overhead" and "real compute" for +// the remaining per-synth pipeline stages: +// +// (d) S3Gen Conformer encoder graph + gallocator built from scratch +// every synth (~700 nodes; ~3-5 ms saved per synth) +// (e) HiFT decoder graph built from scratch every synth (~3000 nodes +// across 3 upsample stages × 9 ResBlocks; ~10-30 ms saved) +// (f) F0 predictor graph built every synth (~25 nodes; <1 ms saved) +// (g) compute_pos_emb result (T trig ops, fired twice per encoder run) +// (h) build_hann_window / build_istft_kernel scaffolding for HiFT +// (~1.85M F32 mults + cos/sin in build_istft_kernel alone) +// (i) build_window_sum scaffolding (T_stft × n_fft F32 ops) +// (j) invert_alpha_cpu fired ~72× per HiFT call (12 ResBlocks × 6 +// alpha tensors; each does a tensor_get + per-element reciprocal) +// +// Every cache is process-wide, keyed by the shape parameters that +// drive graph topology (so streaming 
chunks of varying length still +// produce correct output — the cache rebuilds when its key +// diverges). Cleanup happens in s3gen_release_synth_caches before +// ggml_backend_free, so the gallocators in the graph caches release +// against a still-valid backend. + +// Generic per-stage graph cache (encoder / HiFT / F0 predictor). Owns +// the ggml_context, graph, and gallocator. `key` encodes the shape +// parameters that drive graph topology (e.g. T for the encoder, +// pack(T_mel, T_stft) for HiFT) — a build is reused iff the requested +// `key` matches the cached one. -1 means "no graph built". +struct graph_cache { + int64_t key = -1; + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector buf; + + void destroy() { + if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; } + if (ctx) { ggml_free(ctx); ctx = nullptr; } + gf = nullptr; + key = -1; + // Keep `buf` reservation; reusing it avoids a multi-MB malloc + // on the next rebuild. + } +}; + +// Pack (T_mel, T_stft) into a single int64_t key for the HiFT graph +// cache. Both dimensions are positive int32 in practice; combining +// them this way gives a unique key with no collision. +static int64_t pack_hift_key(int T_mel, int T_stft) { + return ((int64_t) T_mel << 32) | (uint32_t) T_stft; +} + +// Round-1 CFM estimator graph cache (struct definition; the global +// instance lives in the cache-state block below alongside the round-2 +// graph caches). Cache key is (T, b2): a graph built for batch=1 +// (cfm_estimator_forward) cannot be reused for the batch=2 path +// (cfm_estimator_forward_b2) since the input tensor layouts differ +// (ne[2] = 1 vs 2). Today `use_b2` is constant per +// `s3gen_synthesize_to_wav` invocation so the key disambiguation is +// belt-and-braces — but a future change that switches modes +// mid-utterance (e.g. 
CFG warm-up where step 0 is single-pass and +// steps 1+ are batched) would silently reuse a wrong-shape graph and +// crash inside the allocator. +struct cfm_estimator_cache { + int T = -1; + bool b2 = false; + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector buf; + ~cfm_estimator_cache() { + if (allocr) ggml_gallocr_free(allocr); + if (ctx) ggml_free(ctx); + } + // Explicit reset usable from s3gen_release_synth_caches() — the + // global instance never goes out of scope, so the destructor alone + // wouldn't run before ggml_backend_free in the normal teardown + // ordering. Idempotent. + void destroy() { + if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; } + if (ctx) { ggml_free(ctx); ctx = nullptr; } + gf = nullptr; + T = -1; + b2 = false; + // Keep `buf` allocated — it's just a heap arena, no backend + // resource bound to it. Reusing it avoids a 64 MB malloc on + // the next synth. + } +}; + +// Bit-cast cache key for floats — avoids ambiguous std::hash +// behaviour on -0.0/+0.0 and NaN bit patterns. Tested by +// test_cpu_caches.cpp::test_cache_keys. +static uint32_t g_float_bits(float t_val) { + uint32_t bits; + std::memcpy(&bits, &t_val, sizeof(bits)); + return bits; +} +static uint64_t g_float_pair_bits(float t_val, float r_val) { + return ((uint64_t) g_float_bits(t_val) << 32) | (uint64_t) g_float_bits(r_val); +} + +namespace { +// Single mutex around every cache. Held only across cache-state +// mutations (insert / clear / size queries), not across the heavy +// compute itself. +static std::mutex g_synth_caches_mu; + +// Round 1 result caches. +static std::unordered_map> g_time_mlp_results; +static std::unordered_map> g_time_emb_results; +static std::unordered_map> g_weight_cpu_mirror; +static cfm_estimator_cache g_cfm_estimator_cache; + +// Round 2 graph caches. 
+static graph_cache g_encoder_graph_cache; +static graph_cache g_hift_graph_cache; +static graph_cache g_f0_graph_cache; +// Parallel metadata for HiFT: the (graph-input-name, model-tensor-ptr) +// pairs for every alpha tensor referenced by the cached HiFT graph. +// Used on cache hits to refresh each alpha-input slot with the data +// from g_inv_alpha_results without rebuilding the graph. +static std::vector> g_hift_inv_alpha_entries; + +// Round 2 result caches (pure-compute scaffolding). +static std::unordered_map> g_pos_emb_results; +static std::unordered_map> g_inv_alpha_results; +static std::unordered_map> g_hann_window_cache; +static std::unordered_map> g_istft_kernel_cache; +static std::unordered_map> g_window_sum_cache; +} // namespace + +// Cached F32 mirror of a model tensor. Returns a pointer into the +// cache; valid until s3gen_unload(). Caller must NOT free. +// +// First call: ggml_backend_tensor_get into a freshly allocated +// std::vector. Subsequent calls: hit-cache and return the +// existing pointer. +// +// Requires the source tensor to be F32; chatterbox's bandwidth-heavy +// per-synth weights (input_embedding, spk_embed_affine/{w,b}) all +// live as F32, so a templated variant for F16/Q8_0 isn't needed here. +static const float * cached_cpu_weights_f32(const ggml_tensor * t) { + if (!t) return nullptr; + { + std::lock_guard lk(g_synth_caches_mu); + auto it = g_weight_cpu_mirror.find(t); + if (it != g_weight_cpu_mirror.end()) { + return it->second.data(); + } + } + // Read outside the lock (the get is ~ms-scale on a GPU backend). + std::vector staged(ggml_nelements(t)); + ggml_backend_tensor_get(t, staged.data(), 0, ggml_nbytes(t)); + + std::lock_guard lk(g_synth_caches_mu); + auto [it, inserted] = g_weight_cpu_mirror.try_emplace(t, std::move(staged)); + return it->second.data(); +} + +// Tear down every per-synth cache. Safe to call multiple times; safe +// before/after s3gen_model_cache_release. 
Mutex held just long +// enough to flip the data structures — if a synth is mid-flight on +// another thread it must finish before this returns (gallocr_free on +// a graph that's about to be reused is undefined). +static void s3gen_release_synth_caches() { + std::lock_guard lk(g_synth_caches_mu); + g_cfm_estimator_cache.destroy(); + g_encoder_graph_cache.destroy(); + g_hift_graph_cache.destroy(); + g_f0_graph_cache.destroy(); + g_hift_inv_alpha_entries.clear(); + g_time_mlp_results.clear(); + g_time_emb_results.clear(); + g_weight_cpu_mirror.clear(); + g_pos_emb_results.clear(); + g_inv_alpha_results.clear(); + g_hann_window_cache.clear(); + g_istft_kernel_cache.clear(); + g_window_sum_cache.clear(); +} + // ============================================================================ // Encoder (Conformer) — produces mu for CFM // ============================================================================ @@ -547,85 +738,14 @@ static ggml_tensor * conformer_block(ggml_context * ctx, const conformer_w & w, return ggml_add(ctx, residual, ff); } -// ============================================================================ -// QVAC-17872 round 2: persistent graph + scaffolding caches (declarations). -// ---------------------------------------------------------------------------- -// All host-side, model-agnostic, no GGUF-format change. Same teardown -// discipline as g_cfm_estimator_cache (destroy() before ggml_backend_free). -// -// Targeted bottlenecks on multilingual on Vulkan (after round-1 / round-HIFT -// already shipped): -// - run_encoder rebuilds its full graph + gallocr per synth (~17 ms host -// overhead on multilingual T=350+). -// - run_hift_decode rebuilds its graph + gallocr + computes -// hann_window/istft_kernel/window_sum + ~72 inv_alpha tensor_get calls -// per synth (~7-10 ms compounded host overhead, multilingual is the -// biggest beneficiary because audio length scales with the prompt). 
-// - run_f0_predictor rebuilds its (smaller) graph per synth. -// - compute_pos_emb fires twice per encoder run (for T and 2T) at -// ~T*D*5 trig ops; multilingual chunks of T~350+ pay several ms. -// -// Each cache is process-wide; the steady-state size is small (1-2 entries -// per shape key) and bounded by the number of distinct shapes the running -// process sees. Streaming sessions with many varying T values can grow -// these caches; a future LRU bound would belong here. -// -// The cache state lives here (above run_encoder so its definition can use -// it). The destroy/clear function `s3gen_release_synth_caches()` is -// defined later, alongside g_cfm_estimator_cache, since it touches both. -// ============================================================================ - -// Generic graph cache used by encoder / HiFT / F0 — same shape, different keys. -struct graph_cache { - int64_t key = -1; - ggml_context * ctx = nullptr; - ggml_cgraph * gf = nullptr; - ggml_gallocr_t allocr = nullptr; - std::vector buf; - - void destroy() { - if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; } - if (ctx) { ggml_free(ctx); ctx = nullptr; } - gf = nullptr; - key = -1; - // Keep `buf` reservation; reusing it avoids a multi-MB malloc on - // the next rebuild. - } -}; - -// Pack (T_mel, T_stft) into a single int64_t key for the HiFT graph cache. -// Both dimensions are positive int32 in practice; combining them this way -// gives a unique key with no collision. -static int64_t pack_hift_key(int T_mel, int T_stft) { - return ((int64_t) T_mel << 32) | (uint32_t) T_stft; -} - -namespace { -// Single mutex around every round-2 cache. Held only across cache-state -// mutations (insert / clear / size queries), not across the heavy compute -// or graph rebuilds themselves. s3gen_synthesize_to_wav is process-serial -// in practice (the existing s3gen_cache_entry mutex enforces single-flight -// model loads), so contention is effectively zero. 
-static std::mutex g_synth_caches_mu; - -// Graph caches. -static graph_cache g_encoder_graph_cache; // keyed on T (encoder input length) -static graph_cache g_hift_graph_cache; // keyed on pack(T_mel, T_stft) -static graph_cache g_f0_graph_cache; // keyed on T_mel - -// Parallel metadata for HiFT: the (graph-input-name, model-tensor-ptr) -// pairs for every alpha tensor referenced by the cached HiFT graph. -// Used on cache hits to refresh each alpha-input slot from the data in -// g_inv_alpha_results without rebuilding the graph. -static std::vector> g_hift_inv_alpha_entries; - -// Result / scaffolding caches (pure CPU compute). -static std::unordered_map> g_pos_emb_results; -static std::unordered_map> g_inv_alpha_results; -static std::unordered_map> g_hann_window_cache; -static std::unordered_map> g_istft_kernel_cache; -static std::unordered_map> g_window_sum_cache; -} // namespace +// QVAC-17872 round 2 / QVAC-18422 round 2: the `graph_cache` struct, +// `pack_hift_key`, and the cache-state globals (g_encoder_graph_cache, +// g_hift_graph_cache, g_f0_graph_cache, g_hift_inv_alpha_entries, +// g_pos_emb_results, g_inv_alpha_results, g_hann_window_cache, +// g_istft_kernel_cache, g_window_sum_cache) all live in the QVAC-18422 +// CPU-side cache block earlier in this file — declared above run_encoder +// so its definition can use them, and torn down in +// s3gen_release_synth_caches() against the still-live backend. // Scaffolding-helper forward declarations (definitions live later, alongside // the cfm_estimator_cache + cached_cpu_weights_f32 helpers, where the @@ -664,6 +784,24 @@ static void compute_pos_emb(std::vector & pe, int T, int D) { } } +// QVAC-18422 round 2: cached wrapper around compute_pos_emb. Keyed by +// pack(T, D); for chatterbox D is constant=512 and T is determined by +// the encoder input length. Streaming chunks at the same T after the +// first synth pay zero compute_pos_emb work. 
+static const std::vector & cached_pos_emb(int T, int D) { + const int64_t key = ((int64_t) T << 32) | (uint32_t) D; + { + std::lock_guard lk(g_synth_caches_mu); + auto it = g_pos_emb_results.find(key); + if (it != g_pos_emb_results.end()) return it->second; + } + std::vector pe; + compute_pos_emb(pe, T, D); + std::lock_guard lk(g_synth_caches_mu); + auto [it, inserted] = g_pos_emb_results.try_emplace(key, std::move(pe)); + return it->second; +} + // Run the full S3Gen encoder: input (T, D=512) -> mu (2T, 80) // QVAC-17872 round 2: graph + gallocator cached process-wide via // g_encoder_graph_cache (keyed on T = encoder input length). Same-shape @@ -1134,39 +1272,25 @@ static std::vector compute_time_mixed(const model_ctx & m, return out; } -// QVAC-17872 round-HIFT: memoised time-embedding pipeline. Both Turbo -// (meanflow, t_span = [0, 0.5, 1]) and multilingual (cosine-scheduled, 10 -// steps) produce the same set of t-values across all subsequent synth -// calls — the t-embedding outputs are deterministic functions of t (and -// the model weights), so we can cache them. -// -// Two-layer cache: -// - g_time_mlp_results: keyed by uint32_t bitcast of t_val, used by -// both paths. Multilingual benefits the most (10 distinct t-values -// repeated across every synth). -// - g_time_emb_results: keyed by uint64_t = (kt << 32) | kr, ONLY -// used by Turbo (meanflow) since multilingual doesn't run the mixer. -// -// Cleared in s3gen_release_synth_caches alongside the graph cache. +// QVAC-18422: memoised time-embedding pipeline. Both Turbo (meanflow, +// t_span = [0, 0.5, 1]) and multilingual (cosine-scheduled, 10 steps) +// produce the same set of t-values across all subsequent synth calls — +// the t-embedding outputs are deterministic functions of t (and the +// model weights), so we cache them. Globals + mutex live in the +// QVAC-18422 anonymous namespace block earlier in this file. // // Bit-exactness: trivially preserved — same compute, just memoised. 
-static std::unordered_map> g_time_mlp_results; -static std::unordered_map> g_time_emb_results; -static std::mutex g_time_emb_results_mu; - static std::vector compute_time_mlp_cached(const model_ctx & m, float t_val) { - uint32_t key; - static_assert(sizeof(key) == sizeof(t_val), "float must be 32-bit for bitcast key"); - std::memcpy(&key, &t_val, sizeof(key)); + const uint32_t key = g_float_bits(t_val); { - std::lock_guard lk(g_time_emb_results_mu); + std::lock_guard lk(g_synth_caches_mu); auto it = g_time_mlp_results.find(key); if (it != g_time_mlp_results.end()) return it->second; } auto out = compute_time_mlp(m, t_val); { - std::lock_guard lk(g_time_emb_results_mu); - g_time_mlp_results.emplace(key, out); + std::lock_guard lk(g_synth_caches_mu); + g_time_mlp_results.try_emplace(key, out); } return out; } @@ -1174,12 +1298,9 @@ static std::vector compute_time_mlp_cached(const model_ctx & m, float t_v // Used only by the meanflow (Turbo) path — multilingual doesn't run // time_embed_mixer. Caches the full t_emb pipeline by (t, r) pair. 
static std::vector compute_time_emb_cached(const model_ctx & m, float t_val, float r_val) { - uint32_t kt, kr; - std::memcpy(&kt, &t_val, sizeof(kt)); - std::memcpy(&kr, &r_val, sizeof(kr)); - const uint64_t key = ((uint64_t)kt << 32) | (uint64_t)kr; + const uint64_t key = g_float_pair_bits(t_val, r_val); { - std::lock_guard lk(g_time_emb_results_mu); + std::lock_guard lk(g_synth_caches_mu); auto it = g_time_emb_results.find(key); if (it != g_time_emb_results.end()) return it->second; } @@ -1187,130 +1308,18 @@ static std::vector compute_time_emb_cached(const model_ctx & m, float t_v auto r_mlp = compute_time_mlp_cached(m, r_val); auto out = compute_time_mixed(m, t_mlp, r_mlp); { - std::lock_guard lk(g_time_emb_results_mu); - g_time_emb_results.emplace(key, out); + std::lock_guard lk(g_synth_caches_mu); + g_time_emb_results.try_emplace(key, out); } return out; } -// Cached CFM estimator state — graph is built once and reused across steps. -// -// Cache key is (T, b2): a graph built for batch=1 (cfm_estimator_forward) cannot -// be reused for the batch=2 path (cfm_estimator_forward_b2) since the input -// tensor layouts differ (ne[2] = 1 vs 2). Today `use_b2` is constant per -// `s3gen_synthesize_to_wav` invocation and the cache lives on the stack of -// that one call, so a single key would be safe — but a future change that -// switches modes mid-utterance (e.g. CFG warm-up where step 0 is single-pass -// and steps 1+ are batched) would silently reuse a wrong-shape graph and -// crash inside the allocator. -struct cfm_estimator_cache { - int T = -1; - bool b2 = false; - ggml_context * ctx = nullptr; - ggml_cgraph * gf = nullptr; - ggml_gallocr_t allocr = nullptr; - std::vector buf; - // QVAC-17872 round-HIFT: explicit destroy() so the cache can be a - // process-global tied to the s3gen-model lifecycle. 
See - // s3gen_model_cache_release: invoked BEFORE ggml_backend_free, which - // is the same constraint the existing thread_local time_mlp_cache - // documents (Vulkan/Metal device-teardown ordering at process exit). - void destroy() { - if (allocr) { ggml_gallocr_free(allocr); allocr = nullptr; } - if (ctx) { ggml_free(ctx); ctx = nullptr; } - gf = nullptr; - T = -1; - b2 = false; - buf = std::vector(); - } - // Destructor kept as a safety net for non-cached usages (e.g. tests - // that allocate a cfm_estimator_cache on the stack). The global - // g_cfm_estimator_cache is explicitly destroyed via - // s3gen_model_cache_release before backend teardown. - ~cfm_estimator_cache() { - if (allocr) ggml_gallocr_free(allocr); - if (ctx) ggml_free(ctx); - } -}; - -// QVAC-17872 round-HIFT: persistent CFM estimator graph. Was local-scope -// in s3gen_synthesize_to_wav() before, so every synth call paid the full -// graph rebuild cost (CFM has ~5500 ggml ops + gallocr_reserve allocates -// the device-side buffer pool). Persistent global with explicit destroy() -// eliminates the rebuild on synth calls 2..N when T matches. -static cfm_estimator_cache g_cfm_estimator_cache; - -// QVAC-17872 round-HIFT: CPU-side mirror of large model weights that -// synthesize() reads every call (input_embedding lookup table, speaker -// affine matrix). These are model constants — on a GPU backend each -// call previously paid an N MB device→host download per synth. Cleared -// in s3gen_release_synth_caches alongside the graph cache. 
-static std::unordered_map> g_weight_cpu_mirror; -static std::mutex g_weight_cpu_mirror_mu; - -static const float * cached_cpu_weights_f32(const ggml_tensor * t) { - { - std::lock_guard lk(g_weight_cpu_mirror_mu); - auto it = g_weight_cpu_mirror.find(t); - if (it != g_weight_cpu_mirror.end()) return it->second.data(); - } - std::vector data(ggml_nelements(t)); - ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); - { - std::lock_guard lk(g_weight_cpu_mirror_mu); - auto [it, inserted] = g_weight_cpu_mirror.emplace(t, std::move(data)); - return it->second.data(); - } -} - -// QVAC-18422: bit-cast cache key helpers used by the test-hooks bridge -// to query g_time_mlp_results / g_time_emb_results without re-deriving -// the (uint32_t / uint64_t) keys that compute_time_mlp_cached and -// compute_time_emb_cached compute inline above. Defined here so the -// test_hooks namespace at the bottom of the file can call them. -static uint32_t g_float_bits(float t_val) { - uint32_t bits; - std::memcpy(&bits, &t_val, sizeof(bits)); - return bits; -} -static uint64_t g_float_pair_bits(float t_val, float r_val) { - return ((uint64_t) g_float_bits(t_val) << 32) | (uint64_t) g_float_bits(r_val); -} - -// QVAC-17872 round 2: definition of s3gen_release_synth_caches (forward- -// declared near s3gen_model_cache_release). Defined here once the -// graph_cache + cfm_estimator_cache structs and globals are all visible. -// Idempotent — safe to call multiple times and from multiple release paths. -// -// Order matters: graph caches first (they own gallocr_t handles bound to -// the still-live backend); then result caches; then the round-1 caches. -// The graph_cache struct + globals themselves are declared earlier (above -// run_encoder) — see "QVAC-17872 round 2: persistent graph + scaffolding -// caches" block. 
-static void s3gen_release_synth_caches() { - { - std::lock_guard lk(g_synth_caches_mu); - g_encoder_graph_cache.destroy(); - g_hift_graph_cache.destroy(); - g_f0_graph_cache.destroy(); - g_hift_inv_alpha_entries.clear(); - g_pos_emb_results.clear(); - g_inv_alpha_results.clear(); - g_hann_window_cache.clear(); - g_istft_kernel_cache.clear(); - g_window_sum_cache.clear(); - } - g_cfm_estimator_cache.destroy(); - { - std::lock_guard lk(g_time_emb_results_mu); - g_time_mlp_results.clear(); - g_time_emb_results.clear(); - } - { - std::lock_guard lk(g_weight_cpu_mirror_mu); - g_weight_cpu_mirror.clear(); - } -} +// `cfm_estimator_cache` struct, its global `g_cfm_estimator_cache`, +// `g_weight_cpu_mirror` + `cached_cpu_weights_f32`, the bit-cast key +// helpers `g_float_bits` / `g_float_pair_bits`, and the +// `s3gen_release_synth_caches()` definition all live in the QVAC-18422 +// cache block earlier in this file (so they're in scope for run_encoder +// and other users above). See "QVAC-18422 — CPU-side persistent caches". // Single estimator forward: (x, mu, t_emb, spks, cond) -> dxdt // All shapes are numpy (80, T) or (80,) as given, flattened row-major. 
@@ -1620,71 +1629,12 @@ static std::vector build_window_sum(int T_stft, int n_fft, int hop, return ws; } -static ggml_tensor * snake(ggml_context * ctx, ggml_tensor * x, - ggml_tensor * alpha, ggml_tensor * inv_alpha) { - ggml_tensor * a = ggml_reshape_2d(ctx, alpha, 1, alpha->ne[0]); - ggml_tensor * ia = ggml_reshape_2d(ctx, inv_alpha, 1, inv_alpha->ne[0]); - ggml_tensor * ax = ggml_mul(ctx, x, a); - ggml_tensor * s = ggml_sin(ctx, ax); - ggml_tensor * s2 = ggml_mul(ctx, s, s); - return ggml_add(ctx, x, ggml_mul(ctx, s2, ia)); -} - -static std::vector invert_alpha_cpu(const model_ctx & m, const std::string & name) { - ggml_tensor * t = find_tensor(m, name); - std::vector a(ggml_nelements(t)); - ggml_backend_tensor_get(t, a.data(), 0, ggml_nbytes(t)); - std::vector inv(a.size()); - for (size_t i = 0; i < a.size(); ++i) inv[i] = 1.0f / (a[i] + 1e-9f); - return inv; -} - -// ---------------------------------------------------------------------------- -// QVAC-17872 round 2: scaffolding cache definitions -// ---------------------------------------------------------------------------- - -// compute_pos_emb is pure CPU compute (~T * D * 5 trig ops). It fires -// twice per encoder run (once for T, once for 2T) — at multilingual -// chunk size T~350+ that's a noticeable wedge of per-synth host time. -// Cached by (T, D) (D is constant 512 in the chatterbox model; we still -// include it in the key for safety against future-variant collisions). 
-static const std::vector & cached_pos_emb(int T, int D) { - const int64_t key = ((int64_t) T << 32) | (uint32_t) D; - { - std::lock_guard lk(g_synth_caches_mu); - auto it = g_pos_emb_results.find(key); - if (it != g_pos_emb_results.end()) return it->second; - } - std::vector pe; - compute_pos_emb(pe, T, D); - std::lock_guard lk(g_synth_caches_mu); - auto [it, inserted] = g_pos_emb_results.try_emplace(key, std::move(pe)); - return it->second; -} - -// invert_alpha_cpu is fired ~72× per HiFT call (12 ResBlocks × 6 alpha -// tensors); each call is a tensor_get + per-element reciprocal. Alpha -// tensors are constant for the model lifetime, so cache by tensor* — -// invalidation tied to s3gen_release_synth_caches (model-context lifetime). -static const std::vector & cached_inv_alpha(const model_ctx & m, - const std::string & name) { - ggml_tensor * t = find_tensor(m, name); - { - std::lock_guard lk(g_synth_caches_mu); - auto it = g_inv_alpha_results.find(t); - if (it != g_inv_alpha_results.end()) return it->second; - } - auto inv = invert_alpha_cpu(m, name); - std::lock_guard lk(g_synth_caches_mu); - auto [it, inserted] = g_inv_alpha_results.try_emplace(t, std::move(inv)); - return it->second; -} - -// hann_window / istft_kernel are pure functions of n_fft (constant 16 on -// the chatterbox HiFT path); window_sum additionally depends on (n_fft, -// hop, T_stft). Caching them eliminates the per-synth host-CPU build -// cost (small for n_fft=16 but the shape-key lookup composes cleanly -// with the larger HiFT graph cache below). +// QVAC-18422 round 2: cached HiFT scaffolding helpers. hann_window + +// istft_kernel are pure functions of n_fft (constant 1920 in the +// chatterbox HiFT path); window_sum additionally depends on T_stft +// (varies with output length, but stable across same-shape synth +// calls). Caching them eliminates the per-synth host-CPU build cost +// — build_istft_kernel(1920) alone is ~1.85M F32 mults + cos/sin. 
static const std::vector & cached_hann_window(int n_fft) { { std::lock_guard lk(g_synth_caches_mu); @@ -1711,9 +1661,9 @@ static const std::vector & cached_istft_kernel(int n_fft) { } static const std::vector & cached_window_sum(int T_stft, int n_fft, int hop) { - // Pack (n_fft, hop, T_stft) into a single int64 key — n_fft and hop - // are constants on the chatterbox path but encoding them makes the - // cache safe against future variant additions. + // Pack (n_fft, hop, T_stft) into a single int64 key — n_fft and + // hop are constants on the chatterbox path but encoding them + // makes the cache safe against future variant additions. const int64_t key = ((int64_t)(uint16_t) n_fft << 48) | ((int64_t)(uint16_t) hop << 32) | @@ -1729,6 +1679,48 @@ static const std::vector & cached_window_sum(int T_stft, int n_fft, int h return it->second; } +static ggml_tensor * snake(ggml_context * ctx, ggml_tensor * x, + ggml_tensor * alpha, ggml_tensor * inv_alpha) { + ggml_tensor * a = ggml_reshape_2d(ctx, alpha, 1, alpha->ne[0]); + ggml_tensor * ia = ggml_reshape_2d(ctx, inv_alpha, 1, inv_alpha->ne[0]); + ggml_tensor * ax = ggml_mul(ctx, x, a); + ggml_tensor * s = ggml_sin(ctx, ax); + ggml_tensor * s2 = ggml_mul(ctx, s, s); + return ggml_add(ctx, x, ggml_mul(ctx, s2, ia)); +} + +static std::vector invert_alpha_cpu(const model_ctx & m, const std::string & name) { + ggml_tensor * t = find_tensor(m, name); + std::vector a(ggml_nelements(t)); + ggml_backend_tensor_get(t, a.data(), 0, ggml_nbytes(t)); + std::vector inv(a.size()); + for (size_t i = 0; i < a.size(); ++i) inv[i] = 1.0f / (a[i] + 1e-9f); + return inv; +} + +// invert_alpha_cpu is fired ~72× per HiFT call (12 ResBlocks × 6 alpha +// tensors); each call is a tensor_get + per-element reciprocal. Alpha +// tensors are constant for the model lifetime, so cache by tensor* — +// invalidation tied to s3gen_release_synth_caches (model-context lifetime). 
+static const std::vector & cached_inv_alpha(const model_ctx & m, + const std::string & name) { + ggml_tensor * t = find_tensor(m, name); + { + std::lock_guard lk(g_synth_caches_mu); + auto it = g_inv_alpha_results.find(t); + if (it != g_inv_alpha_results.end()) return it->second; + } + auto inv = invert_alpha_cpu(m, name); + std::lock_guard lk(g_synth_caches_mu); + auto [it, inserted] = g_inv_alpha_results.try_emplace(t, std::move(inv)); + return it->second; +} + +// `cached_pos_emb` lives in the QVAC-18422 cache block above (right +// after `compute_pos_emb`). `cached_hann_window`, `cached_istft_kernel`, +// and `cached_window_sum` are defined just above this block (alongside +// `build_hann_window` / `build_istft_kernel` / `build_window_sum`). + // F0 predictor (mel (80, T) -> f0 (T,)) // // QVAC-17872 round 2: graph + gallocator cached process-wide via @@ -2825,26 +2817,23 @@ void s3gen_unload() { namespace tts_cpp::chatterbox::test_hooks { size_t time_mlp_result_cache_size() { - std::lock_guard lk(g_time_emb_results_mu); + std::lock_guard lk(g_synth_caches_mu); return g_time_mlp_results.size(); } size_t time_emb_result_cache_size() { - std::lock_guard lk(g_time_emb_results_mu); + std::lock_guard lk(g_synth_caches_mu); return g_time_emb_results.size(); } size_t weight_mirror_cache_size() { - std::lock_guard lk(g_weight_cpu_mirror_mu); + std::lock_guard lk(g_synth_caches_mu); return g_weight_cpu_mirror.size(); } bool cfm_estimator_cache_built() { - // g_cfm_estimator_cache is mutated only under s3gen_release_synth_caches - // (which holds g_synth_caches_mu around the round-2 caches but not this - // one) and during the per-synth fast-path inside cfm_estimator_forward. - // The single-pointer load below is atomic on x86/ARM; tests treat it - // as a snapshot. 
+ std::lock_guard lk(g_synth_caches_mu); return g_cfm_estimator_cache.ctx != nullptr; } bool cfm_estimator_cache_b2() { + std::lock_guard lk(g_synth_caches_mu); return g_cfm_estimator_cache.b2; } uint32_t float_cache_key(float t_val) { @@ -2854,10 +2843,63 @@ uint64_t float_pair_cache_key(float t_val, float r_val) { return g_float_pair_bits(t_val, r_val); } std::vector peek_time_mlp_cached(float t_val) { - std::lock_guard lk(g_time_emb_results_mu); + std::lock_guard lk(g_synth_caches_mu); auto it = g_time_mlp_results.find(g_float_bits(t_val)); if (it == g_time_mlp_results.end()) return {}; return it->second; } +// ---- Round 2 hooks -------------------------------------------------------- + +bool encoder_graph_cache_built() { + std::lock_guard lk(g_synth_caches_mu); + return g_encoder_graph_cache.ctx != nullptr; +} +int encoder_graph_cache_T() { + std::lock_guard lk(g_synth_caches_mu); + return (int) g_encoder_graph_cache.key; +} +bool hift_graph_cache_built() { + std::lock_guard lk(g_synth_caches_mu); + return g_hift_graph_cache.ctx != nullptr; +} +int hift_graph_cache_T_mel() { + std::lock_guard lk(g_synth_caches_mu); + if (g_hift_graph_cache.key < 0) return -1; + return (int) (g_hift_graph_cache.key >> 32); +} +int hift_graph_cache_T_stft() { + std::lock_guard lk(g_synth_caches_mu); + if (g_hift_graph_cache.key < 0) return -1; + return (int) (g_hift_graph_cache.key & 0xffffffffLL); +} +bool f0_graph_cache_built() { + std::lock_guard lk(g_synth_caches_mu); + return g_f0_graph_cache.ctx != nullptr; +} +int f0_graph_cache_T_mel() { + std::lock_guard lk(g_synth_caches_mu); + return (int) g_f0_graph_cache.key; +} +size_t pos_emb_cache_size() { + std::lock_guard lk(g_synth_caches_mu); + return g_pos_emb_results.size(); +} +size_t inv_alpha_cache_size() { + std::lock_guard lk(g_synth_caches_mu); + return g_inv_alpha_results.size(); +} +size_t istft_kernel_cache_size() { + std::lock_guard lk(g_synth_caches_mu); + return g_istft_kernel_cache.size(); +} +size_t 
hann_window_cache_size() { + std::lock_guard lk(g_synth_caches_mu); + return g_hann_window_cache.size(); +} +size_t window_sum_cache_size() { + std::lock_guard lk(g_synth_caches_mu); + return g_window_sum_cache.size(); +} + } // namespace tts_cpp::chatterbox::test_hooks diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h index c9fdb91..c51dade 100644 --- a/src/chatterbox_tts_test_hooks.h +++ b/src/chatterbox_tts_test_hooks.h @@ -63,4 +63,35 @@ uint64_t float_pair_cache_key(float t_val, float r_val); // t-value was actually warmed without re-entering compute_time_mlp. std::vector peek_time_mlp_cached(float t_val); +// ---------- Round 2 (PROGRESS.md §3.33): graph + scaffolding caches ---- + +// Persistent encoder graph cache. Built lazily by run_encoder() and +// invalidated when its key (T) diverges from a streaming chunk. False +// before any synth and after s3gen_unload(). +bool encoder_graph_cache_built(); + +// Cache key (input length T) currently held by the encoder graph +// cache. -1 if not built; otherwise the T from the most recent build. +int encoder_graph_cache_T(); + +// Persistent HiFT decoder graph cache. Built lazily by +// run_hift_decode() and invalidated when (T_mel, T_stft) diverge. +bool hift_graph_cache_built(); +int hift_graph_cache_T_mel(); +int hift_graph_cache_T_stft(); + +// Persistent F0 predictor graph cache. Built lazily by +// run_f0_predictor(); keyed on T_mel. +bool f0_graph_cache_built(); +int f0_graph_cache_T_mel(); + +// Sizes of the small scaffolding caches. Each is process-wide; a +// stable set of n_fft / hop / model parameters means the steady-state +// size is small (1-2 entries each). 
+size_t pos_emb_cache_size(); +size_t inv_alpha_cache_size(); +size_t istft_kernel_cache_size(); +size_t hann_window_cache_size(); +size_t window_sum_cache_size(); + } // namespace tts_cpp::chatterbox::test_hooks diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp index 47e0d29..bdf3f74 100644 --- a/src/test_cpu_caches.cpp +++ b/src/test_cpu_caches.cpp @@ -162,6 +162,32 @@ void test_initial_state() { "synth"); CHECK(!th::cfm_estimator_cache_b2(), "persistent cfm_estimator_cache b2 flag must default false"); + + // Round 2: encoder / HiFT / F0 graph caches + scaffolding caches. + CHECK(!th::encoder_graph_cache_built(), + "persistent encoder graph cache must not be built before any synth"); + CHECK(th::encoder_graph_cache_T() == -1, + "encoder graph cache T must be -1 (sentinel) before any build"); + CHECK(!th::hift_graph_cache_built(), + "persistent HiFT decoder graph cache must not be built before any synth"); + CHECK(th::hift_graph_cache_T_mel() == -1, + "HiFT graph cache T_mel must be -1 before any build"); + CHECK(th::hift_graph_cache_T_stft() == -1, + "HiFT graph cache T_stft must be -1 before any build"); + CHECK(!th::f0_graph_cache_built(), + "persistent F0 predictor graph cache must not be built before any synth"); + CHECK(th::f0_graph_cache_T_mel() == -1, + "F0 graph cache T_mel must be -1 before any build"); + CHECK(th::pos_emb_cache_size() == 0, + "encoder pos_emb result cache must start empty"); + CHECK(th::inv_alpha_cache_size() == 0, + "HiFT inv_alpha result cache must start empty"); + CHECK(th::istft_kernel_cache_size() == 0, + "HiFT istft_kernel cache must start empty"); + CHECK(th::hann_window_cache_size() == 0, + "HiFT hann_window cache must start empty"); + CHECK(th::window_sum_cache_size() == 0, + "HiFT window_sum cache must start empty"); } // ---------------- 3. 
determinism + cache wiring on a real synth ---------- @@ -222,6 +248,18 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, const size_t n_time_emb_after_a = th::time_emb_result_cache_size(); const size_t n_weights_after_a = th::weight_mirror_cache_size(); const bool cfm_built_after_a = th::cfm_estimator_cache_built(); + const bool enc_built_after_a = th::encoder_graph_cache_built(); + const int enc_T_after_a = th::encoder_graph_cache_T(); + const bool hift_built_after_a = th::hift_graph_cache_built(); + const int hift_Tmel_after_a = th::hift_graph_cache_T_mel(); + const int hift_Tstft_after_a = th::hift_graph_cache_T_stft(); + const bool f0_built_after_a = th::f0_graph_cache_built(); + const int f0_Tmel_after_a = th::f0_graph_cache_T_mel(); + const size_t n_pos_emb_after_a = th::pos_emb_cache_size(); + const size_t n_inv_alpha_after_a = th::inv_alpha_cache_size(); + const size_t n_istft_after_a = th::istft_kernel_cache_size(); + const size_t n_hann_after_a = th::hann_window_cache_size(); + const size_t n_wsum_after_a = th::window_sum_cache_size(); CHECK(cfm_built_after_a, "after first synth, persistent cfm_estimator_cache must be built"); @@ -231,11 +269,58 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, CHECK(n_weights_after_a > 0, "after first synth, weight_mirror_cache must have at least one " "entry (input_embedding + spk_embed_affine/{w,b})"); + + // Round 2 — every per-pipeline graph must be built after the first + // synth, with non-sentinel keys. 
+ CHECK(enc_built_after_a, + "after first synth, persistent encoder graph cache must be built"); + CHECK(enc_T_after_a > 0, + "after first synth, encoder graph cache T must be > 0 (saw %d)", + enc_T_after_a); + CHECK(hift_built_after_a, + "after first synth, persistent HiFT graph cache must be built"); + CHECK(hift_Tmel_after_a > 0 && hift_Tstft_after_a > 0, + "after first synth, HiFT graph cache (T_mel=%d, T_stft=%d) must " + "have positive shape keys", + hift_Tmel_after_a, hift_Tstft_after_a); + CHECK(f0_built_after_a, + "after first synth, persistent F0 predictor graph cache must be built"); + CHECK(f0_Tmel_after_a > 0, + "after first synth, F0 graph cache T_mel must be > 0 (saw %d)", + f0_Tmel_after_a); + + // Scaffolding caches: pos_emb fires twice per synth (T and 2T), so + // ≥ 2 entries. inv_alpha fires once per HiFT alpha tensor (~72 + // tensors total). istft_kernel + hann_window are keyed by n_fft + // (one constant value), so exactly 1 entry each. window_sum is + // keyed by T_stft, also exactly 1 entry per synth-shape. 
+ CHECK(n_pos_emb_after_a >= 2, + "after first synth, pos_emb cache should have ≥ 2 entries (T and 2T) " + "but saw %zu", n_pos_emb_after_a); + CHECK(n_inv_alpha_after_a > 0, + "after first synth, inv_alpha cache must have at least one entry"); + CHECK(n_istft_after_a == 1, + "after first synth, istft_kernel cache must have exactly 1 entry " + "(keyed by n_fft); saw %zu", n_istft_after_a); + CHECK(n_hann_after_a >= 1, + "after first synth, hann_window cache must have ≥ 1 entry; saw %zu", + n_hann_after_a); + CHECK(n_wsum_after_a == 1, + "after first synth, window_sum cache must have exactly 1 entry; " + "saw %zu", n_wsum_after_a); + fprintf(stderr, - " synth #1: time_mlp=%zu time_emb=%zu weights=%zu cfm=%s " - "(%.1f ms)\n", + " synth #1: time_mlp=%zu time_emb=%zu weights=%zu cfm=%s " + "enc=%s(T=%d) hift=%s(T_mel=%d,T_stft=%d) f0=%s(T_mel=%d) " + "pos_emb=%zu inv_alpha=%zu istft=%zu hann=%zu wsum=%zu (%.1f ms)\n", n_time_mlp_after_a, n_time_emb_after_a, n_weights_after_a, - cfm_built_after_a ? "built" : "fresh", t_a); + cfm_built_after_a ? "built" : "fresh", + enc_built_after_a ? "built" : "fresh", enc_T_after_a, + hift_built_after_a ? "built" : "fresh", + hift_Tmel_after_a, hift_Tstft_after_a, + f0_built_after_a ? "built" : "fresh", f0_Tmel_after_a, + n_pos_emb_after_a, n_inv_alpha_after_a, + n_istft_after_a, n_hann_after_a, n_wsum_after_a, t_a); // Second call: every cache must already be warm. Its size must // not grow because the t-schedule and the model weights are @@ -254,6 +339,37 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, CHECK(th::cfm_estimator_cache_built(), "synth #2 must keep the persistent cfm graph built"); + // Round 2: graph caches must remain built with the same shape + // keys, scaffolding caches must not grow. 
+ CHECK(th::encoder_graph_cache_built() && th::encoder_graph_cache_T() == enc_T_after_a, + "synth #2 must keep the encoder graph built with the same T (was %d, " + "now built=%d, T=%d)", + enc_T_after_a, th::encoder_graph_cache_built() ? 1 : 0, + th::encoder_graph_cache_T()); + CHECK(th::hift_graph_cache_built() && + th::hift_graph_cache_T_mel() == hift_Tmel_after_a && + th::hift_graph_cache_T_stft() == hift_Tstft_after_a, + "synth #2 must keep the HiFT graph built with the same shape keys " + "(was T_mel=%d, T_stft=%d; now built=%d, T_mel=%d, T_stft=%d)", + hift_Tmel_after_a, hift_Tstft_after_a, + th::hift_graph_cache_built() ? 1 : 0, + th::hift_graph_cache_T_mel(), th::hift_graph_cache_T_stft()); + CHECK(th::f0_graph_cache_built() && th::f0_graph_cache_T_mel() == f0_Tmel_after_a, + "synth #2 must keep the F0 graph built with the same T_mel (was %d)", + f0_Tmel_after_a); + CHECK(th::pos_emb_cache_size() == n_pos_emb_after_a, + "synth #2 must NOT add new pos_emb entries (saw %zu, expected %zu)", + th::pos_emb_cache_size(), n_pos_emb_after_a); + CHECK(th::inv_alpha_cache_size() == n_inv_alpha_after_a, + "synth #2 must NOT add new inv_alpha entries (saw %zu, expected %zu)", + th::inv_alpha_cache_size(), n_inv_alpha_after_a); + CHECK(th::istft_kernel_cache_size() == n_istft_after_a, + "synth #2 must NOT add new istft_kernel entries"); + CHECK(th::hann_window_cache_size() == n_hann_after_a, + "synth #2 must NOT add new hann_window entries"); + CHECK(th::window_sum_cache_size() == n_wsum_after_a, + "synth #2 must NOT add new window_sum entries"); + CHECK(wav_a.size() == wav_b.size(), "warm-cache synth #2 wav length must match cold-cache synth #1 " "(%zu vs %zu)", wav_a.size(), wav_b.size()); @@ -283,6 +399,25 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, "s3gen_unload must clear weight_mirror cache"); CHECK(!th::cfm_estimator_cache_built(), "s3gen_unload must tear down the persistent cfm cache"); + // Round 2 caches must also be torn down — 
gallocators in the + // graph caches reference the model's backend and would crash on + // backend-free if left dangling. + CHECK(!th::encoder_graph_cache_built(), + "s3gen_unload must tear down the encoder graph cache"); + CHECK(!th::hift_graph_cache_built(), + "s3gen_unload must tear down the HiFT decoder graph cache"); + CHECK(!th::f0_graph_cache_built(), + "s3gen_unload must tear down the F0 predictor graph cache"); + CHECK(th::pos_emb_cache_size() == 0, + "s3gen_unload must clear pos_emb cache"); + CHECK(th::inv_alpha_cache_size() == 0, + "s3gen_unload must clear inv_alpha cache"); + CHECK(th::istft_kernel_cache_size() == 0, + "s3gen_unload must clear istft_kernel cache"); + CHECK(th::hann_window_cache_size() == 0, + "s3gen_unload must clear hann_window cache"); + CHECK(th::window_sum_cache_size() == 0, + "s3gen_unload must clear window_sum cache"); // Idempotent: a second unload must not crash or produce errors. s3gen_unload(); @@ -335,6 +470,105 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, } } +// ---------------- 4. Streaming shape invalidation --------------------------- +// +// Streaming mode synthesises chunks of varying length; T is different on +// every call. The generic graph_cache rebuilds when its key diverges — +// this test exercises that branch by submitting two different token +// counts and checking the encoder / HiFT cache keys move with them +// while the t-schedule / weight caches remain stable. + +void test_streaming_shape_invalidation(const std::string & gguf, + const std::string & ref_dir) { + fprintf(stderr, "=== streaming shape invalidation ===\n"); + + s3gen_unload(); // clean slate + + // Chunk #1 — shorter token sequence. 
+ std::vector short_tokens = {12, 34, 56, 78, 90, 121, 152, 173}; + s3gen_synthesize_opts opts1; + opts1.s3gen_gguf_path = gguf; + opts1.ref_dir = ref_dir; + opts1.out_wav_path = ""; + std::vector wav1; + opts1.pcm_out = &wav1; + opts1.seed = 42; + opts1.n_threads = 0; + opts1.sr = 24000; + opts1.n_gpu_layers = 0; + opts1.apply_trim_fade = true; + opts1.finalize = true; + if (s3gen_synthesize_to_wav(short_tokens, opts1) != 0 || wav1.empty()) { + fprintf(stderr, "skip: chunk #1 synth failed\n"); + return; + } + const int enc_T_chunk1 = th::encoder_graph_cache_T(); + const int hift_Tmel_chunk1 = th::hift_graph_cache_T_mel(); + const int f0_Tmel_chunk1 = th::f0_graph_cache_T_mel(); + + // Chunk #2 — longer token sequence (different shape). All the + // graph caches must rebuild, the t-schedule + weight + scaffolding + // result caches must NOT grow. + std::vector long_tokens; + for (int i = 0; i < 32; ++i) long_tokens.push_back(50 + i * 7); + s3gen_synthesize_opts opts2 = opts1; + std::vector wav2; + opts2.pcm_out = &wav2; + if (s3gen_synthesize_to_wav(long_tokens, opts2) != 0 || wav2.empty()) { + fprintf(stderr, "skip: chunk #2 synth failed\n"); + return; + } + const int enc_T_chunk2 = th::encoder_graph_cache_T(); + const int hift_Tmel_chunk2 = th::hift_graph_cache_T_mel(); + const int f0_Tmel_chunk2 = th::f0_graph_cache_T_mel(); + + CHECK(enc_T_chunk1 != enc_T_chunk2, + "encoder graph cache T must change between chunks of different " + "lengths (chunk1 T=%d, chunk2 T=%d)", + enc_T_chunk1, enc_T_chunk2); + CHECK(hift_Tmel_chunk1 != hift_Tmel_chunk2, + "HiFT graph cache T_mel must change between chunks (chunk1=%d, " + "chunk2=%d)", hift_Tmel_chunk1, hift_Tmel_chunk2); + CHECK(f0_Tmel_chunk1 != f0_Tmel_chunk2, + "F0 graph cache T_mel must change between chunks (chunk1=%d, " + "chunk2=%d)", f0_Tmel_chunk1, f0_Tmel_chunk2); + CHECK(th::encoder_graph_cache_built(), + "encoder graph cache must remain built after shape change " + "(rebuilt for new T)"); + 
+ CHECK(th::hift_graph_cache_built(), + "HiFT graph cache must remain built after shape change"); + CHECK(th::f0_graph_cache_built(), + "F0 graph cache must remain built after shape change"); + fprintf(stderr, + " chunk #1: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n" + " chunk #2: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n", + enc_T_chunk1, hift_Tmel_chunk1, f0_Tmel_chunk1, wav1.size(), + enc_T_chunk2, hift_Tmel_chunk2, f0_Tmel_chunk2, wav2.size()); + + // pos_emb cache might add up to 2 new entries (T2 and 2*T2 for the + // longer chunk). The previous chunk's entries persist (we don't + // evict on shape change). + CHECK(th::pos_emb_cache_size() >= 2, + "pos_emb cache must contain ≥ 2 entries across two chunks of " + "different lengths (got %zu)", th::pos_emb_cache_size()); + + // Window-sum cache: 1 entry per distinct T_stft. Two chunks of + // different lengths should yield two distinct T_stft values (2 + // entries), but the check below only enforces the ≥ 1 floor. + CHECK(th::window_sum_cache_size() >= 1, + "window_sum cache must contain ≥ 1 entry after multi-shape " + "synthesis (got %zu)", th::window_sum_cache_size()); + + // hann_window + istft_kernel are keyed by n_fft (single value + // shared across all chunks) — sizes must NOT grow with chunk count. + CHECK(th::hann_window_cache_size() <= 2, + "hann_window cache size must stay small across chunks (got %zu); " + "if this grows with chunk count the key is wrong", th::hann_window_cache_size()); + CHECK(th::istft_kernel_cache_size() == 1, + "istft_kernel cache must stay at 1 entry (n_fft is constant); " + "got %zu", th::istft_kernel_cache_size()); +} + +} // namespace int main(int argc, char ** argv) { @@ -355,6 +589,7 @@ int main(int argc, char ** argv) { return 2; } test_warm_cache_bit_exact_and_lifecycle(gguf, ref_dir); + test_streaming_shape_invalidation(gguf, ref_dir); } // Always release at exit so the next test invocation starts clean. 
From cd80f08c51e2563742ebce07133eec143bc297c9 Mon Sep 17 00:00:00 2001 From: Zbigniew Herman Date: Tue, 5 May 2026 16:05:10 +0200 Subject: [PATCH 3/6] =?UTF-8?q?QVAC-18422=20[TTS=20GGML]=20Optimize=20cpp?= =?UTF-8?q?=20backend=20multilingual=20for=20CPU=20(round=203)=20PROGRESS.?= =?UTF-8?q?md=20=C2=A73.34=20=E2=80=94=20multilingual=20verification=20(Tu?= =?UTF-8?q?rbo=2080/80,=20multilingual=2099/99=20checks=20pass;=20bit-exac?= =?UTF-8?q?t=20synth-twice=20on=20the=20converted-from-source=20MTL=20Q4?= =?UTF-8?q?=5F0=20GGUF)=20+=2019=20new=20multilingual-specific=20test=20as?= =?UTF-8?q?sertions=20(cosine=20schedule=20produces=20exactly=2010=20disti?= =?UTF-8?q?nct=20g=5Ftime=5Fmlp=5Fresults=20entries)=20+=20fused=20CFG-com?= =?UTF-8?q?bine=20+=20Euler=20step=20in=20the=20non-meanflow=20CFG=20path?= =?UTF-8?q?=20of=20synthesize().=20=20Sub-noise=20wall-time=20saving=20on?= =?UTF-8?q?=20a=20single=20multilingual=20synth=20(~8=20s);=20biggest=20re?= =?UTF-8?q?maining=20host-side=20win=20is=20T3=20step-graph=20caching,=20d?= =?UTF-8?q?ocumented=20as=20deferred=20follow-up.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chatterbox_tts.cpp | 42 +++++++++++++++++++++++++++++++++-------- src/test_cpu_caches.cpp | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp index afd87e4..9f26fb2 100644 --- a/src/chatterbox_tts.cpp +++ b/src/chatterbox_tts.cpp @@ -2594,8 +2594,12 @@ int s3gen_synthesize_to_wav( double step_t0 = now_ms(); std::vector dxdt_cond; + std::vector dxdt_uncond; + // True when this step needs the CFG combine — both flavours of + // CFG path (B=2 batched and B=1 two-call) populate dxdt_uncond + // and require the linear `(1+cfg)*cond - cfg*uncond` mix. 
+ bool have_cfg_uncond = false; if (use_b2) { - std::vector dxdt_uncond; cfm_estimator_forward_b2(m, cfm_cache, z, z, mu, zero_mu, @@ -2603,9 +2607,7 @@ int s3gen_synthesize_to_wav( spks, zero_spks, cond, zero_cond, dxdt_cond, dxdt_uncond, T_mu, opts.cfm_f16_kv_attn); - for (size_t i = 0; i < dxdt_cond.size(); ++i) { - dxdt_cond[i] = (1.0f + cfg_rate) * dxdt_cond[i] - cfg_rate * dxdt_uncond[i]; - } + have_cfg_uncond = true; } else if (!meanflow && cfg_rate != 0.0f) { // Non-Metal CFG path (CPU + any backend where use_b2 is false). // Run the conditional and unconditional passes back-to-back on @@ -2616,12 +2618,21 @@ int s3gen_synthesize_to_wav( // previously the else clause computed only the conditional pass // and dropped CFG entirely on every non-Metal backend. dxdt_cond = cfm_estimator_forward(m, cfm_cache, z, mu, t_emb, spks, cond, T_mu, opts.cfm_f16_kv_attn); - auto dxdt_uncond = cfm_estimator_forward(m, cfm_cache, z, zero_mu, t_emb, zero_spks, zero_cond, T_mu, opts.cfm_f16_kv_attn); + dxdt_uncond = cfm_estimator_forward(m, cfm_cache, z, zero_mu, t_emb, zero_spks, zero_cond, T_mu, opts.cfm_f16_kv_attn); + have_cfg_uncond = true; + } else { + dxdt_cond = cfm_estimator_forward(m, cfm_cache, z, mu, t_emb, spks, cond, T_mu, opts.cfm_f16_kv_attn); + } + + // Debug + dump hooks read the post-CFG-combine dxdt; precompute it + // when the caller actually asks for it, otherwise fold the combine + // into the Euler step below to save a pass over the array. 
+ const bool need_full_dxdt = (debug_mode && meanflow) || + (s == 0 && !opts.dump_mel_path.empty()); + if (have_cfg_uncond && need_full_dxdt) { for (size_t i = 0; i < dxdt_cond.size(); ++i) { dxdt_cond[i] = (1.0f + cfg_rate) * dxdt_cond[i] - cfg_rate * dxdt_uncond[i]; } - } else { - dxdt_cond = cfm_estimator_forward(m, cfm_cache, z, mu, t_emb, spks, cond, T_mu, opts.cfm_f16_kv_attn); } auto & dxdt = dxdt_cond; vlog(" [cfm_step%zu] %.1f ms\n", s, now_ms() - step_t0); @@ -2644,7 +2655,22 @@ int s3gen_synthesize_to_wav( MEL, T_mu, base.c_str()); } - for (size_t i = 0; i < z.size(); ++i) z[i] = z[i] + dt * dxdt[i]; + // Fused CFG-combine + Euler step (QVAC-18422 round 3). Saves one + // pass over `dxdt` per step. When the debug/dump code-paths above + // already wrote the combined result back into `dxdt_cond`, we + // detect it via `need_full_dxdt && have_cfg_uncond` and fall back + // to the plain `z + dt * dxdt_cond` form so the math stays + // bit-exact across both branches. + if (have_cfg_uncond && !need_full_dxdt) { + const float c1 = (1.0f + cfg_rate); + const float c0 = -cfg_rate; + for (size_t i = 0; i < z.size(); ++i) { + const float d = c1 * dxdt_cond[i] + c0 * dxdt_uncond[i]; + z[i] = z[i] + dt * d; + } + } else { + for (size_t i = 0; i < z.size(); ++i) z[i] = z[i] + dt * dxdt[i]; + } } vlog(" [cfm_total] %.1f ms\n", now_ms() - cfm_t0); diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp index bdf3f74..0e01e97 100644 --- a/src/test_cpu_caches.cpp +++ b/src/test_cpu_caches.cpp @@ -468,6 +468,39 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, CHECK(b.size() == 1024, "time_mlp cached entry must be (1024,) — saw %zu", b.size()); } + + // Variant-specific schedule shape — derived from the time_mlp cache + // size after a synth populates it. Multilingual = 10 cosine-spaced + // t-values + 0 time_emb pairs (non-meanflow); Turbo = ≤3 t-values + // + 2 (t,r) time_emb pairs (meanflow). 
+ if (n_time_mlp_after_a == 10 && n_time_emb_after_a == 0) { + // Multilingual cosine schedule: every entry must round-trip, + // every cosine_t(i, 10) for i in 0..9 must be present. + fprintf(stderr, " detected multilingual variant (cosine n_timesteps=10)\n"); + for (int i = 0; i < 10; ++i) { + float t_cos = cosine_t(i, 10); + auto v = th::peek_time_mlp_cached(t_cos); + CHECK(!v.empty(), + "multilingual cosine t_span entry %d (t=%.6f) must be cached " + "after first synth", i, t_cos); + if (!v.empty()) { + CHECK(v.size() == 1024, + "multilingual cached t_emb entry %d size must be 1024 — " + "saw %zu", i, v.size()); + } + } + } else if (n_time_mlp_after_a <= 3 && n_time_emb_after_a == 2) { + fprintf(stderr, " detected Turbo variant (meanflow t_span ⊆ {0,0.5,1})\n"); + // Turbo's meanflow loop visits the pairs (0, 0.5) and (0.5, 1). + auto v05 = th::peek_time_mlp_cached(0.5f); + CHECK(!v05.empty(), + "Turbo: t_val=0.5 must be in time_mlp cache after first synth"); + } else { + fprintf(stderr, + " unrecognised variant: time_mlp=%zu time_emb=%zu — neither " + "the multilingual (10/0) nor Turbo (≤3/2) shape\n", + n_time_mlp_after_a, n_time_emb_after_a); + } } // ---------------- 4. Streaming shape invalidation --------------------------- From ce7dc15fbbe99d701a312a52e9c796306596db04 Mon Sep 17 00:00:00 2001 From: Zbigniew Herman Date: Tue, 5 May 2026 18:12:23 +0200 Subject: [PATCH 4/6] QVAC-18422 [TTS GGML] Optimize cpp backend multilingual for CPU (round 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PROGRESS.md §3.35 — T3 step-graph cache (multilingual CFG token decode) opt-in via CHATTERBOX_T3_STEP_CACHE. Per-(n_past, is_uncond) std::list-LRU cache (cap 256) for build_step_graph_mtl; saves ~3 ms per cache hit. Single-utterance default-OFF (no hits-to-amortise on synth #1) keeps the existing path regression-free; server-mode opt-in shows ~15 % per-pass speedup (~256 ms / synth #2 of multilingual at 136 tokens). 
Tests: src/test_t3_caches.cpp NEW with 99 checks (lifecycle + bit-exact cold/warm logits + multi-synth amortisation timing). Lifecycle wired into free_t3 (CLI, both paths), Impl::free_model (Engine), and an atexit fallback — all firing BEFORE ggml_backend_free. Total cache test suite green: 80 + 99 + 6 + 99 = 284 / 284. --- CMakeLists.txt | 7 + src/chatterbox_cli.cpp | 9 + src/chatterbox_engine.cpp | 5 + src/chatterbox_t3_internal.h | 10 + src/chatterbox_tts_test_hooks.h | 48 ++++ src/t3_mtl.cpp | 348 +++++++++++++++++++++++- src/test_t3_caches.cpp | 452 ++++++++++++++++++++++++++++++++ 7 files changed, 869 insertions(+), 10 deletions(-) create mode 100644 src/test_t3_caches.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c01ff7..449173f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,13 @@ if (TTS_CPP_BUILD_TESTS) target_link_libraries(test-cpu-caches PRIVATE ggml) target_include_directories(test-cpu-caches PRIVATE ggml/include src include) + # T3 step-graph cache validation (QVAC-18422 round 4). Links + # against the full tts-cpp library so it gets t3_mtl.cpp's + # cached eval_step_mtl alongside the test-hook entrypoints. + add_executable(test-t3-caches src/test_t3_caches.cpp) + target_link_libraries(test-t3-caches PRIVATE tts-cpp ggml) + target_include_directories(test-t3-caches PRIVATE ggml/include src include) + add_executable(test-metal-ops src/test_metal_ops.cpp) target_link_libraries(test-metal-ops PRIVATE ggml) target_include_directories(test-metal-ops PRIVATE ggml/include src) diff --git a/src/chatterbox_cli.cpp b/src/chatterbox_cli.cpp index 072d17b..741e940 100644 --- a/src/chatterbox_cli.cpp +++ b/src/chatterbox_cli.cpp @@ -1183,6 +1183,12 @@ int tts_cpp_cli_main(int argc, char ** argv) { tts_cpp::chatterbox::detail::t3_stack_unregister( model.buffer_stack, model.ctx_stack); } + // QVAC-18422 round 4: drop the T3 step-graph cache + // BEFORE freeing the backend. 
The cache holds + // gallocators that carry backend references; freeing + // them against a dead backend would assert inside the + // ggml-metal / ggml-vulkan / ggml-cuda dylib finalisers. + tts_cpp::chatterbox::detail::t3_release_caches(); ggml_backend_buffer_free(model.buffer_w); ggml_backend_buffer_free(model.buffer_kv); if (model.buffer_stack) ggml_backend_buffer_free(model.buffer_stack); @@ -2332,6 +2338,9 @@ int tts_cpp_cli_main(int argc, char ** argv) { (long long)t3_total_ms, t3_tokens_total); ggml_gallocr_free(allocr); + // QVAC-18422 round 4: drop T3 step-graph cache BEFORE freeing + // the backend (gallocators in cached entries reference it). + tts_cpp::chatterbox::detail::t3_release_caches(); ggml_backend_buffer_free(model.buffer_w); ggml_backend_buffer_free(model.buffer_kv); if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override); diff --git a/src/chatterbox_engine.cpp b/src/chatterbox_engine.cpp index b361766..edea0e7 100644 --- a/src/chatterbox_engine.cpp +++ b/src/chatterbox_engine.cpp @@ -165,6 +165,11 @@ struct Engine::Impl { if (model.buffer_stack || model.ctx_stack) { t3_stack_unregister(model.buffer_stack, model.ctx_stack); } + // QVAC-18422 round 4: drop the T3 step-graph cache BEFORE + // freeing the backend. Cached gallocators carry backend + // references; freeing them against a dead backend asserts + // inside the GPU-backend dylib finalisers. 
+ tts_cpp::chatterbox::detail::t3_release_caches(); if (model.buffer_w) { ggml_backend_buffer_free(model.buffer_w); model.buffer_w = nullptr; } if (model.buffer_kv) { ggml_backend_buffer_free(model.buffer_kv); model.buffer_kv = nullptr; } if (model.buffer_stack) { ggml_backend_buffer_free(model.buffer_stack); model.buffer_stack = nullptr; } diff --git a/src/chatterbox_t3_internal.h b/src/chatterbox_t3_internal.h index ab68cd2..3d3b919 100644 --- a/src/chatterbox_t3_internal.h +++ b/src/chatterbox_t3_internal.h @@ -347,6 +347,16 @@ bool eval_step_mtl( std::vector & logits_cond_out, std::vector & logits_uncond_out); +// Release every persistent T3-side cache held in this translation +// unit (currently the round-4 step-graph cache). Idempotent. +// +// Production callers (CLI free_t3 lambda, Engine::Impl::free_model) +// MUST call this BEFORE `ggml_backend_free(model.backend)` because +// the cached gallocators carry backend references; freeing them +// against a freed backend would assert inside ggml-metal / +// ggml-vulkan / ggml-cuda dylib finalisers. +void t3_release_caches(); + // On a degenerate logits distribution (everything -inf after the sampling // cascade), returns `stop_token` so the caller's stop check fires cleanly // instead of emitting a pseudo-random in-vocab id. Pass diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h index c51dade..9920595 100644 --- a/src/chatterbox_tts_test_hooks.h +++ b/src/chatterbox_tts_test_hooks.h @@ -94,4 +94,52 @@ size_t istft_kernel_cache_size(); size_t hann_window_cache_size(); size_t window_sum_cache_size(); +// ---------- Round 4 (PROGRESS.md §3.35): T3 step-graph cache --------- +// +// MTL-only. Caches the per-(n_past, is_uncond) graph that +// `build_step_graph_mtl` constructs from scratch on every token +// decode call. 
Multilingual fires this 2× per token (CFG cond + +// uncond), so a 136-token Spanish utterance previously rebuilt 272 +// graphs at ~3 ms each ≈ 800 ms / synth of pure host-CPU graph +// construction work. +// +// The cache is OPT-IN at runtime via the env var +// `CHATTERBOX_T3_STEP_CACHE` (default 0). Enabling it on a single- +// utterance workload pays the bookkeeping cost (~10 % T3 +// regression) without any compensating hit benefit because each +// step has a unique n_past — the cache only pays off on synth #2+ +// in long-running processes (server mode), where the second synth +// re-decodes from n_past=0 and hits every cached entry. Tests set +// the env var explicitly. + +// Number of cached step graphs currently held; 0 before any +// eval_step_mtl call, 0 after t3_release_caches(). Bounded by the +// LRU cap (`t3_step_graph_cache_capacity()`). +size_t t3_step_graph_cache_size(); + +// Cache capacity (LRU bound). Covers e.g. 128 tokens × 2 modes +// out-of-the-box. If a synth exceeds this, inserting a new step +// graph evicts the least-recently-used entry to make room; the +// cache never grows past this bound. +size_t t3_step_graph_cache_capacity(); + +// True iff the (n_past, is_uncond) entry is currently in the cache. +// Used by tests to verify the LRU eviction rule and to spot-check +// hits without racing on logits comparison. +bool t3_step_graph_cache_contains(int n_past, bool is_uncond); + +// Number of cache hits / cache misses since the last +// t3_release_caches(). Tests use these to confirm that re-running +// a step pass with the same shape key actually re-uses the cached +// graph instead of rebuilding it. +size_t t3_step_graph_cache_hits(); +size_t t3_step_graph_cache_misses(); + +// Explicit teardown. Idempotent; safe to call before/after the +// main t3 backend is freed. Production callers (CLI, Engine) call +// this from their model-free path BEFORE ggml_backend_free so the +// gallocators in cached entries release against a still-valid +// backend. 
+void t3_release_caches(); + } // namespace tts_cpp::chatterbox::test_hooks diff --git a/src/t3_mtl.cpp b/src/t3_mtl.cpp index 0fc730e..3681c52 100644 --- a/src/t3_mtl.cpp +++ b/src/t3_mtl.cpp @@ -36,9 +36,11 @@ #include #include #include +#include #include #include #include +#include #include namespace tts_cpp::chatterbox::detail { @@ -104,6 +106,264 @@ void t3_stack_unregister(ggml_backend_buffer_t buf, ggml_context * ctx) { } } +// Forward declaration for the step-graph builder used by the round-4 +// cache below. Body lives in the second anonymous namespace further +// down (alongside the legacy build_step_graph_mtl wrapper). +namespace { +ggml_cgraph * build_step_graph_mtl_in_ctx(const chatterbox_model & model, + ggml_context * ctx, + int n_past, + bool is_uncond); +} + +// ============================================================================ +// QVAC-18422 round 4 — T3 step-graph cache (multilingual CFG token decode) +// ============================================================================ +// +// `build_step_graph_mtl(n_past, is_uncond)` constructs a 30-layer Llama-block +// graph from scratch on every token decode call. Multilingual CFG fires +// this 2× per token (cond + uncond on CPU); a 136-token Spanish synth +// previously rebuilt 272 graphs at ~3 ms each — roughly 800 ms / synth of +// pure host-CPU graph construction work. +// +// The cache stores per-(n_past, is_uncond) entries with their own +// ggml_context and metadata buf (no per-entry gallocator). ggml_view's offset is a +// graph-build-time constant in `build_llama_block` (KV write/read offsets +// scale with `n_past`), so each distinct n_past needs its own cached +// graph — there is no shape-independent path here. +// +// Memory cap: an LRU bound of `T3_STEP_CACHE_CAP` entries (256, covering +// 128 tokens × 2 modes). When the cap is hit, the least-recently-used +// entry is evicted to make room; the legacy thread_local-buf path runs only +// when caching is disabled or an entry cannot be built. 
Tested: cache invariants stay +// correct under cap pressure; bit-exact preserved. +// +// Lifecycle: cleared by detail::t3_release_caches() — called from the +// CLI's free_t3 lambda + Engine::Impl::free_model BEFORE the model +// backend is freed (gallocators carry backend references; freeing them +// against a dead backend would assert). Plus a fallback atexit hook +// for the unsurprising case where neither path runs. + +namespace { + +// Cache entry holds just the graph metadata — NOT a per-entry +// gallocator. The caller's existing shared allocator (passed into +// run_step_pass) is used for both cached and legacy-fallback graphs; +// alloc_graph re-lays-out per call but reuses one backend buffer +// across every (n_past, is_uncond) variant. This is what keeps the +// single-utterance regression at zero — per-entry gallocator would +// allocate ~1 MB device memory PER cached graph (272 misses × 1 MB = +// ~270 MB allocator churn on the first multilingual synth, observed +// as ~10 % T3 wall-time regression). Share the allocator instead. 
+struct t3_step_cache_entry { + int64_t key = -1; // pack(n_past, is_uncond) + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + std::vector buf; + + t3_step_cache_entry() = default; + t3_step_cache_entry(const t3_step_cache_entry &) = delete; + t3_step_cache_entry & operator=(const t3_step_cache_entry &) = delete; + t3_step_cache_entry(t3_step_cache_entry && other) noexcept + : key(other.key), ctx(other.ctx), gf(other.gf), + buf(std::move(other.buf)) { + other.key = -1; + other.ctx = nullptr; + other.gf = nullptr; + } + t3_step_cache_entry & operator=(t3_step_cache_entry && other) noexcept { + if (this != &other) { + destroy(); + key = other.key; + ctx = other.ctx; + gf = other.gf; + buf = std::move(other.buf); + other.key = -1; + other.ctx = nullptr; + other.gf = nullptr; + } + return *this; + } + ~t3_step_cache_entry() { destroy(); } + + void destroy() { + if (ctx) { ggml_free(ctx); ctx = nullptr; } + gf = nullptr; + key = -1; + } +}; + +constexpr size_t T3_STEP_CACHE_CAP = 256; + +// Caching is opt-in to avoid a small (~10 %) T3 regression on +// single-utterance workloads where every step call is a cache miss. +// In a single multilingual synth, n_past goes 0, 1, 2, ..., N-1 once +// each, so the cache fills up but nothing is re-used — every miss +// pays the bookkeeping cost (vector::resize, list insert, mutex +// acquire) without any compensating hit savings. +// +// Server-mode and other multi-synth callers — where synth #2 starts +// at n_past=0 again and re-decodes the same prompt prefix as +// synth #1 — get a real win (~3 ms × hits per call ≈ 1 s / synth +// on multilingual), so the env var unlocks caching for those +// workloads: +// +// CHATTERBOX_T3_STEP_CACHE=1 ./tts-cli ... +// +// Reads once at first use, cached as a static const bool. Tests +// set the env var via `setenv()` before any eval_step_mtl call. 
+bool t3_step_cache_enabled() { + static const bool enabled = []() { + const char * e = std::getenv("CHATTERBOX_T3_STEP_CACHE"); + if (!e || !e[0]) return false; + return e[0] == '1' || e[0] == 't' || e[0] == 'T' || + e[0] == 'y' || e[0] == 'Y'; + }(); + return enabled; +} + +// Mutex protects the entire cache state below. Held only across cache +// state mutations, not across the underlying backend compute itself. +std::mutex t3_step_cache_mu; +std::list<t3_step_cache_entry> t3_step_cache_lru; // front = most recent +std::unordered_map<int64_t, std::list<t3_step_cache_entry>::iterator> t3_step_cache_idx; +size_t t3_step_cache_hits = 0; +size_t t3_step_cache_misses = 0; +bool t3_step_cache_atexit_registered = false; + +inline int64_t pack_step_key(int n_past, bool is_uncond) { + return ((int64_t) n_past << 1) | (is_uncond ? 1 : 0); +} + +void t3_step_cache_release_locked() { + // Caller holds t3_step_cache_mu. + t3_step_cache_idx.clear(); + t3_step_cache_lru.clear(); // entries' destructors free ctx + t3_step_cache_hits = 0; + t3_step_cache_misses = 0; +} + +void t3_step_cache_release_atexit() { + std::lock_guard lk(t3_step_cache_mu); + t3_step_cache_release_locked(); +} + +// Look up a cached entry; on hit, splice it to the front (LRU "touch"). +// Returns nullptr on miss. Mutex must NOT be held by caller. +t3_step_cache_entry * t3_step_cache_lookup(int n_past, bool is_uncond) { + const int64_t key = pack_step_key(n_past, is_uncond); + std::lock_guard lk(t3_step_cache_mu); + auto it = t3_step_cache_idx.find(key); + if (it == t3_step_cache_idx.end()) { + ++t3_step_cache_misses; + return nullptr; + } + // Move to front (LRU touch). splice within the same list keeps + // iterators valid; this is the canonical std::list LRU pattern. + t3_step_cache_lru.splice(t3_step_cache_lru.begin(), + t3_step_cache_lru, it->second); + ++t3_step_cache_hits; + return &(*it->second); +} + +// Build a new cached entry and insert at the front. If the cache is +// at capacity, evicts the oldest (back-of-list) entry first. 
Returns +// the inserted entry, or nullptr on failure (ggml_init or graph build). +// +// Caller must NOT hold the mutex; this function takes it internally +// and keeps it for the whole build (~3 ms), so other cache users +// block until the build finishes. Two threads racing on the same +// (n_past, is_uncond) miss are serialised here so only one builds. +t3_step_cache_entry * t3_step_cache_insert_or_get(const chatterbox_model & model, + int n_past, bool is_uncond) { + const int64_t key = pack_step_key(n_past, is_uncond); + std::lock_guard lk(t3_step_cache_mu); + + // Re-check after locking — another thread may have inserted while + // we were waiting. + auto existing = t3_step_cache_idx.find(key); + if (existing != t3_step_cache_idx.end()) { + t3_step_cache_lru.splice(t3_step_cache_lru.begin(), + t3_step_cache_lru, existing->second); + ++t3_step_cache_hits; + return &(*existing->second); + } + + // Evict back-of-list if at capacity. + if (t3_step_cache_lru.size() >= T3_STEP_CACHE_CAP) { + const int64_t old_key = t3_step_cache_lru.back().key; + t3_step_cache_idx.erase(old_key); + t3_step_cache_lru.pop_back(); // dtor frees ctx + } + + // Build the new entry at the front. 
+ t3_step_cache_lru.emplace_front(); + t3_step_cache_entry & e = t3_step_cache_lru.front(); + + const size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES + + ggml_graph_overhead_custom(CHBX_MAX_NODES, false); + e.buf.resize(buf_size); + e.key = key; + + ggml_init_params p = { buf_size, e.buf.data(), /*no_alloc=*/true }; + e.ctx = ggml_init(p); + if (!e.ctx) { + t3_step_cache_lru.pop_front(); + return nullptr; + } + + e.gf = build_step_graph_mtl_in_ctx(model, e.ctx, n_past, is_uncond); + if (!e.gf) { + t3_step_cache_lru.pop_front(); + return nullptr; + } + + t3_step_cache_idx[key] = t3_step_cache_lru.begin(); + + if (!t3_step_cache_atexit_registered) { + std::atexit(t3_step_cache_release_atexit); + t3_step_cache_atexit_registered = true; + } + + return &t3_step_cache_lru.front(); +} + +} // namespace + +// Public release entry-point. Called from chatterbox_cli.cpp's +// free_t3 lambda and chatterbox_engine.cpp's Impl::free_model BEFORE +// ggml_backend_free. Idempotent. +void t3_release_caches() { + std::lock_guard lk(t3_step_cache_mu); + t3_step_cache_release_locked(); +} + +// detail-scope bridges so the test_hooks namespace (defined further +// down, outside detail::) can reach the round-4 cache state without +// each individual symbol leaking into the public surface. These +// helpers are NOT for production callers; the only consumers are +// test_hooks::t3_* in the same TU. 
+size_t _t3_step_cache_size_for_tests() { + std::lock_guard lk(t3_step_cache_mu); + return t3_step_cache_lru.size(); +} +size_t _t3_step_cache_capacity_for_tests() { + return T3_STEP_CACHE_CAP; +} +bool _t3_step_cache_contains_for_tests(int n_past, bool is_uncond) { + const int64_t key = pack_step_key(n_past, is_uncond); + std::lock_guard lk(t3_step_cache_mu); + return t3_step_cache_idx.count(key) > 0; +} +size_t _t3_step_cache_hits_for_tests() { + std::lock_guard lk(t3_step_cache_mu); + return t3_step_cache_hits; +} +size_t _t3_step_cache_misses_for_tests() { + std::lock_guard lk(t3_step_cache_mu); + return t3_step_cache_misses; +} + namespace { int64_t require_key(const gguf_context * ctx, const char * key) { @@ -750,16 +1010,14 @@ ggml_cgraph * build_step_graph_mtl_b2(const chatterbox_model & model, return gf; } -ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model, - int n_past, - bool is_uncond) { +// Body of the step graph build, parameterised on a caller-provided +// ggml_context. Lets the (round-4) step-graph cache hold the ctx +// alive across calls without sharing the legacy thread_local buf. 
+ggml_cgraph * build_step_graph_mtl_in_ctx(const chatterbox_model & model, + ggml_context * ctx, + int n_past, + bool is_uncond) { const auto & hp = model.hparams; - - static size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES + - ggml_graph_overhead_custom(CHBX_MAX_NODES, false); - thread_local std::vector buf(buf_size); - ggml_init_params p = { buf_size, buf.data(), true }; - ggml_context * ctx = ggml_init(p); ggml_cgraph * gf = ggml_new_graph_custom(ctx, CHBX_MAX_NODES, false); ggml_tensor * speech_token = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); @@ -791,6 +1049,22 @@ ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model, ggml_set_name(logits, "logits"); ggml_set_output(logits); ggml_build_forward_expand(gf, logits); + return gf; +} + +// Legacy non-cached entry point (still used as fallback when the +// step-graph cache is at capacity). Frees the per-call ctx — gf +// remains valid because the bytes live in the thread_local buf +// until the next call to ggml_init reuses the buf. +ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model, + int n_past, + bool is_uncond) { + static size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES + + ggml_graph_overhead_custom(CHBX_MAX_NODES, false); + thread_local std::vector buf(buf_size); + ggml_init_params p = { buf_size, buf.data(), true }; + ggml_context * ctx = ggml_init(p); + ggml_cgraph * gf = build_step_graph_mtl_in_ctx(model, ctx, n_past, is_uncond); ggml_free(ctx); return gf; } @@ -994,7 +1268,27 @@ bool run_step_pass(const chatterbox_model & model, int32_t token, bool is_uncond, std::vector & logits_out) { - ggml_cgraph * gf = build_step_graph_mtl(model, n_past, is_uncond); + // QVAC-18422 round 4: when CHATTERBOX_T3_STEP_CACHE is set, try + // the per-(n_past, is_uncond) graph cache first. On hit, we skip + // the ~3 ms build cost. 
On miss + room: build into a fresh + // cache entry; the caller's allocator is used for layout either + // way (no ~1 MB-per-entry backend buffer regression). On miss + + // cache full: fall back to the legacy thread_local-buf path. + // + // Default-disabled because in single-utterance workloads every + // step call is a unique n_past — the cache fills up but nothing + // is re-used. See the t3_step_cache_enabled() comment above. + t3_step_cache_entry * entry = nullptr; + if (t3_step_cache_enabled()) { + entry = t3_step_cache_lookup(n_past, is_uncond); + if (!entry) { + entry = t3_step_cache_insert_or_get(model, n_past, is_uncond); + } + } + + ggml_cgraph * gf = entry ? entry->gf + : build_step_graph_mtl(model, n_past, is_uncond); + // alloc_graph reserves lazily; see run_step_pass_b2 comment. if (!ggml_gallocr_alloc_graph(allocr, gf)) { fprintf(stderr, "run_step_pass: gallocr_alloc_graph failed (n_past=%d)\n", n_past); @@ -1680,3 +1974,37 @@ int32_t sample_next_token_mtl(const std::vector & logits_cond, } } // namespace tts_cpp::chatterbox::detail + +// ============================================================================ +// QVAC-18422 round 4 — T3 step-graph cache test hooks +// ============================================================================ +// +// Read-only observability for the cache state declared in the round-4 +// section of t3_mtl.cpp. The cache state lives in an anonymous +// namespace inside detail::; these forwarders go through the +// `_t3_step_cache_*_for_tests` bridges defined alongside it. 
+ +#include "chatterbox_tts_test_hooks.h" + +namespace tts_cpp::chatterbox::test_hooks { + +size_t t3_step_graph_cache_size() { + return tts_cpp::chatterbox::detail::_t3_step_cache_size_for_tests(); +} +size_t t3_step_graph_cache_capacity() { + return tts_cpp::chatterbox::detail::_t3_step_cache_capacity_for_tests(); +} +bool t3_step_graph_cache_contains(int n_past, bool is_uncond) { + return tts_cpp::chatterbox::detail::_t3_step_cache_contains_for_tests(n_past, is_uncond); +} +size_t t3_step_graph_cache_hits() { + return tts_cpp::chatterbox::detail::_t3_step_cache_hits_for_tests(); +} +size_t t3_step_graph_cache_misses() { + return tts_cpp::chatterbox::detail::_t3_step_cache_misses_for_tests(); +} +void t3_release_caches() { + tts_cpp::chatterbox::detail::t3_release_caches(); +} + +} // namespace tts_cpp::chatterbox::test_hooks diff --git a/src/test_t3_caches.cpp b/src/test_t3_caches.cpp new file mode 100644 index 0000000..b3a438f --- /dev/null +++ b/src/test_t3_caches.cpp @@ -0,0 +1,452 @@ +// QVAC-18422 round 4 — T3 step-graph cache validation. +// +// Verifies the per-(n_past, is_uncond) graph cache that +// `run_step_pass` consults instead of rebuilding the ~5500- +// node graph via `build_step_graph_mtl` on every token-decode call. +// Multilingual fires the step graph 2× per token (CFG cond + uncond); +// a 136-token utterance rebuilds 272 graphs at ~3 ms each — ~800 ms +// of pure host-CPU work that the cache eliminates once a later synth +// revisits the same (n_past, is_uncond) keys. +// +// Coverage: +// 1. Cache empty before any eval_step_mtl call. +// 2. After one eval_step_mtl call, cache holds 2 entries +// (cond + uncond at n_past=0). +// 3. Calling eval_step_mtl with the same (n_past, is_uncond) key +// reuses the cached graph (hits++, no new entries). +// 4. Calling at a different n_past adds new entries. +// 5. logits_cond / logits_uncond are bit-exact across cold and +// warm-cache step calls (KV cache state held identical via +// explicit ordering). +// 6. 
t3_release_caches() drops every entry; second call is +// idempotent; subsequent eval_step_mtl rebuilds. +// 7. (Optional, slow) LRU eviction: filling the cache past +// `t3_step_graph_cache_capacity()` evicts the oldest entry. +// +// Usage: +// ./test-t3-caches MTL_T3.gguf +// +// Without arguments, runs only the lightweight default-state +// invariants (no model load required). + +#include "chatterbox_t3_internal.h" +#include "chatterbox_tts_test_hooks.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace th = tts_cpp::chatterbox::test_hooks; +using namespace tts_cpp::chatterbox::detail; + +namespace { + +int g_failures = 0; +int g_checks = 0; + +#define CHECK(cond, ...) do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_failures; \ + fprintf(stderr, "FAIL %s:%d %s\n ", \ + __FILE__, __LINE__, #cond); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ +} while (0) + +bool path_exists(const std::string & p) { + struct stat st; return ::stat(p.c_str(), &st) == 0; +} + +double now_ms() { + using clock = std::chrono::steady_clock; + return std::chrono::duration( + clock::now().time_since_epoch()).count(); +} + +// ---------------- 1. default invariants (no model required) --------------- + +void test_initial_state() { + fprintf(stderr, "=== t3 step-graph cache: initial state ===\n"); + + // Idempotent before any work. 
+ th::t3_release_caches(); + + CHECK(th::t3_step_graph_cache_size() == 0, + "cache must start empty"); + CHECK(th::t3_step_graph_cache_capacity() > 0, + "cache capacity must be positive (saw %zu)", + th::t3_step_graph_cache_capacity()); + CHECK(th::t3_step_graph_cache_hits() == 0, + "hits counter must start at 0"); + CHECK(th::t3_step_graph_cache_misses() == 0, + "misses counter must start at 0"); + CHECK(!th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/false), + "no (n_past=0, cond) entry should be present"); + CHECK(!th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/true), + "no (n_past=0, uncond) entry should be present"); + + // Second release must not crash or produce errors. + th::t3_release_caches(); +} + +// ---------------- 2. step pass cache lifecycle (model required) ----------- + +// Run one eval_step_mtl call with the given (n_past, token) and +// capture both cond + uncond logits. Always runs cond first, then +// uncond — eval_step_mtl populates both halves on each call. 
+bool run_step(const chatterbox_model & model, ggml_gallocr_t allocr,
+              int n_threads, int n_past, int32_t token,
+              std::vector<float> & logits_cond,
+              std::vector<float> & logits_uncond) {
+    return eval_step_mtl(model, allocr, n_threads, n_past, token,
+                         logits_cond, logits_uncond);
+}
+
+// Lifecycle test for the step-graph cache against a real model:
+//   (a) a cold call records 2 misses and leaves 2 entries,
+//   (b) a repeat at the same key records 2 hits and identical logits,
+//   (c) a new n_past grows the cache to 4 entries,
+//   (d) t3_release_caches() empties the cache and resets the counters.
+// Every exit taken after a successful load releases the model.
+void test_step_lifecycle(const std::string & model_path) {
+    fprintf(stderr, "=== t3 step-graph cache: lifecycle (model=%s) ===\n",
+            model_path.c_str());
+
+    th::t3_release_caches(); // clean slate
+
+    chatterbox_model model;
+
+    // Releases what the load attached to `model`, in the same order
+    // the final cleanup uses: buffers, then backend, then contexts.
+    // Declared ahead of every early exit (and every goto) so the skip
+    // paths below can use it without leaking the loaded model.
+    const auto release_model = [&model]() {
+        if (model.buffer_w)        ggml_backend_buffer_free(model.buffer_w);
+        if (model.buffer_kv)       ggml_backend_buffer_free(model.buffer_kv);
+        if (model.buffer_stack)    ggml_backend_buffer_free(model.buffer_stack);
+        if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override);
+        if (model.backend)         ggml_backend_free(model.backend);
+        if (model.ctx_w)           ggml_free(model.ctx_w);
+        if (model.ctx_kv)          ggml_free(model.ctx_kv);
+        if (model.ctx_stack)       ggml_free(model.ctx_stack);
+        if (model.ctx_override)    ggml_free(model.ctx_override);
+    };
+
+    if (!load_model_gguf(model_path, model, /*requested_ctx=*/0,
+                         /*n_gpu_layers=*/0)) {
+        // NOTE(review): what a failed load leaves in `model` is not
+        // visible from this file, so nothing is freed on this path.
+        fprintf(stderr, "skip: failed to load model\n");
+        return;
+    }
+    if (model.hparams.variant != CHBX_VARIANT_MTL) {
+        fprintf(stderr, "skip: model is not MTL variant\n");
+        release_model();
+        return;
+    }
+
+    const int n_threads = std::max(1u, std::thread::hardware_concurrency() / 2u);
+    ggml_gallocr_t allocr = ggml_gallocr_new(
+        ggml_backend_get_default_buffer_type(model.backend));
+    CHECK(allocr != nullptr, "gallocr_new must succeed");
+    if (!allocr) {
+        release_model();
+        return;
+    }
+
+    // -------- (a) first call populates 2 entries (cond + uncond) ---------
+    std::vector<float> logits_cond_a, logits_uncond_a;
+    const double t0 = now_ms();
+    const bool ok = run_step(model, allocr, n_threads,
+                             /*n_past=*/0, /*token=*/100,
+                             logits_cond_a, logits_uncond_a);
+    const double dt_first = now_ms() - t0;
+    CHECK(ok, "eval_step_mtl(n_past=0, token=100) must succeed");
+    if (!ok) goto cleanup;
+
+    CHECK(th::t3_step_graph_cache_size() == 2,
+          "after first eval_step_mtl, cache must hold exactly 2 entries "
+          "(cond + uncond at n_past=0); saw %zu",
+          th::t3_step_graph_cache_size());
+    CHECK(th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/false),
+          "(n_past=0, cond) must be present after first call");
+    CHECK(th::t3_step_graph_cache_contains(/*n_past=*/0, /*is_uncond=*/true),
+          "(n_past=0, uncond) must be present after first call");
+    CHECK(th::t3_step_graph_cache_misses() == 2,
+          "first call must record 2 misses (one per mode); saw %zu",
+          th::t3_step_graph_cache_misses());
+    CHECK(th::t3_step_graph_cache_hits() == 0,
+          "first call must record 0 hits; saw %zu",
+          th::t3_step_graph_cache_hits());
+    fprintf(stderr,
+            " call #1 (cold cache): %.1f ms cache_size=%zu\n",
+            dt_first, th::t3_step_graph_cache_size());
+
+    // -------- (b) re-run at the same n_past — cache HIT ------------------
+    //
+    // Note: eval_step_mtl writes into the KV cache at position n_past
+    // every call. Repeating at n_past=0 with the same token should be
+    // bit-exact because (i) the input is identical and (ii) the KV slot
+    // is overwritten with the same value. We spot-check this below.
+    {
+        std::vector<float> logits_cond_b, logits_uncond_b;
+        const double t1 = now_ms();
+        const bool ok2 = run_step(model, allocr, n_threads,
+                                  /*n_past=*/0, /*token=*/100,
+                                  logits_cond_b, logits_uncond_b);
+        const double dt_warm = now_ms() - t1;
+        CHECK(ok2, "second eval_step_mtl(n_past=0) must succeed");
+        if (!ok2) goto cleanup;
+
+        CHECK(th::t3_step_graph_cache_size() == 2,
+              "second call at same key must NOT grow cache (saw %zu)",
+              th::t3_step_graph_cache_size());
+        CHECK(th::t3_step_graph_cache_hits() == 2,
+              "second call must record 2 hits (cond + uncond); saw %zu",
+              th::t3_step_graph_cache_hits());
+        CHECK(th::t3_step_graph_cache_misses() == 2,
+              "miss counter must stay at 2 after a warm call; saw %zu",
+              th::t3_step_graph_cache_misses());
+        fprintf(stderr,
+                " call #2 (warm cache): %.1f ms cache_size=%zu hits=%zu\n",
+                dt_warm, th::t3_step_graph_cache_size(),
+                th::t3_step_graph_cache_hits());
+
+        // Bit-exact (or float-identical) on logits across cold/warm.
+        // The graph topology is the same, the same backend runs the
+        // same compute, the same KV slot gets re-overwritten with the
+        // same data. Any drift here would mean the cached graph is
+        // reading stale state.
+        CHECK(logits_cond_b.size() == logits_cond_a.size(),
+              "cond logits size mismatch across calls (cold=%zu warm=%zu)",
+              logits_cond_a.size(), logits_cond_b.size());
+        CHECK(logits_uncond_b.size() == logits_uncond_a.size(),
+              "uncond logits size mismatch across calls (cold=%zu warm=%zu)",
+              logits_uncond_a.size(), logits_uncond_b.size());
+        if (logits_cond_a.size() == logits_cond_b.size()) {
+            const int rc =
+                std::memcmp(logits_cond_a.data(), logits_cond_b.data(),
+                            logits_cond_a.size() * sizeof(float));
+            CHECK(rc == 0,
+                  "cond logits must be byte-identical across cold/warm cache "
+                  "calls at same (n_past, token)");
+        }
+        if (logits_uncond_a.size() == logits_uncond_b.size()) {
+            const int rc =
+                std::memcmp(logits_uncond_a.data(), logits_uncond_b.data(),
+                            logits_uncond_a.size() * sizeof(float));
+            CHECK(rc == 0,
+                  "uncond logits must be byte-identical across cold/warm cache "
+                  "calls at same (n_past, token)");
+        }
+    }
+
+    // -------- (c) different n_past → cache grows -------------------------
+    {
+        std::vector<float> lc, lu;
+        const bool ok3 = run_step(model, allocr, n_threads,
+                                  /*n_past=*/1, /*token=*/200, lc, lu);
+        CHECK(ok3, "eval_step_mtl(n_past=1) must succeed");
+        if (!ok3) goto cleanup;
+
+        CHECK(th::t3_step_graph_cache_size() == 4,
+              "after a step at a NEW n_past, cache must hold 4 entries; saw %zu",
+              th::t3_step_graph_cache_size());
+        CHECK(th::t3_step_graph_cache_contains(/*n_past=*/1, /*is_uncond=*/false),
+              "(n_past=1, cond) must be present");
+        CHECK(th::t3_step_graph_cache_contains(/*n_past=*/1, /*is_uncond=*/true),
+              "(n_past=1, uncond) must be present");
+        CHECK(th::t3_step_graph_cache_misses() == 4,
+              "second n_past must record 4 misses total; saw %zu",
+              th::t3_step_graph_cache_misses());
+    }
+
+    // -------- (d) explicit teardown -------------------------------------
+    th::t3_release_caches();
+    CHECK(th::t3_step_graph_cache_size() == 0,
+          "t3_release_caches() must drop every entry; saw %zu",
+          th::t3_step_graph_cache_size());
+    CHECK(th::t3_step_graph_cache_hits() == 0,
+          "release must reset hits counter");
+    CHECK(th::t3_step_graph_cache_misses() == 0,
+          "release must reset misses counter");
+    th::t3_release_caches(); // idempotent
+
+cleanup:
+    // Always release caches BEFORE freeing the backend (per the
+    // contract documented on detail::t3_release_caches).
+    th::t3_release_caches();
+    if (allocr) ggml_gallocr_free(allocr);
+    release_model();
+}
+
+// ---------------- 3. multi-synth amortisation timing test ------------------
+//
+// Demonstrates the actual server-mode win: run N step calls at
+// increasing n_past (cold cache, building entries), then run the
+// same N calls again (warm cache, every entry is a hit). The second
+// pass is what server-mode users see when synth #2 starts at
+// n_past=0 again to decode a different prompt of similar length.
+//
+// Bit-exact assertion: cold-pass logits and warm-pass logits at the
+// same (n_past, token) are byte-identical because the graph is the
+// same and the KV cache slot was overwritten with identical data.
+
+// Cold-vs-warm timing and bit-exactness check over N_STEPS step calls.
+// The first pass fills the cache (all misses); the second pass repeats
+// the same (n_past, token) sequence and must be served entirely from
+// the cache with byte-identical logits. Every exit taken after a
+// successful load releases the model.
+void test_multi_synth_timing(const std::string & model_path) {
+    fprintf(stderr, "=== t3 step-graph cache: multi-synth timing (cold vs warm) ===\n");
+
+    th::t3_release_caches();
+
+    chatterbox_model model;
+
+    // Releases what the load attached to `model`: buffers, then the
+    // backend, then the contexts. Shared by the skip paths and the
+    // normal exit so no path leaks the loaded model.
+    const auto release_model = [&model]() {
+        if (model.buffer_w)        ggml_backend_buffer_free(model.buffer_w);
+        if (model.buffer_kv)       ggml_backend_buffer_free(model.buffer_kv);
+        if (model.buffer_stack)    ggml_backend_buffer_free(model.buffer_stack);
+        if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override);
+        if (model.backend)         ggml_backend_free(model.backend);
+        if (model.ctx_w)           ggml_free(model.ctx_w);
+        if (model.ctx_kv)          ggml_free(model.ctx_kv);
+        if (model.ctx_stack)       ggml_free(model.ctx_stack);
+        if (model.ctx_override)    ggml_free(model.ctx_override);
+    };
+
+    if (!load_model_gguf(model_path, model, /*requested_ctx=*/0,
+                         /*n_gpu_layers=*/0)) {
+        // NOTE(review): what a failed load leaves in `model` is not
+        // visible from this file, so nothing is freed on this path.
+        fprintf(stderr, "skip: failed to load model\n");
+        return;
+    }
+    if (model.hparams.variant != CHBX_VARIANT_MTL) {
+        fprintf(stderr, "skip: model is not MTL variant\n");
+        release_model();
+        return;
+    }
+
+    const int n_threads = std::max(1u, std::thread::hardware_concurrency() / 2u);
+    ggml_gallocr_t allocr = ggml_gallocr_new(
+        ggml_backend_get_default_buffer_type(model.backend));
+    if (!allocr) {
+        release_model();
+        return;
+    }
+
+    // 16 steps × 2 modes = 32 cached entries; both passes assert bit-
+    // exact logits, so we keep the cold-pass outputs around to diff
+    // against the warm pass. Fits comfortably under T3_STEP_CACHE_CAP
+    // (256), so no LRU eviction during the test.
+    constexpr int N_STEPS = 16;
+    std::vector<std::vector<float>> cold_cond(N_STEPS), cold_uncond(N_STEPS);
+    std::vector<std::vector<float>> warm_cond(N_STEPS), warm_uncond(N_STEPS);
+
+    // -------- cold pass: 16 step calls, each populates 2 cache entries -----
+    bool ok = true;
+    double t_cold = 0;
+    {
+        const double t_cold0 = now_ms();
+        for (int i = 0; i < N_STEPS && ok; ++i) {
+            if (!run_step(model, allocr, n_threads,
+                          /*n_past=*/i, /*token=*/100 + i,
+                          cold_cond[i], cold_uncond[i])) {
+                fprintf(stderr, "skip: cold step #%d failed\n", i);
+                ok = false;
+            }
+        }
+        t_cold = now_ms() - t_cold0;
+    }
+
+    if (ok) {
+        const size_t expected = (size_t) N_STEPS * 2;
+        CHECK(th::t3_step_graph_cache_size() == expected,
+              "after %d cold steps, cache must hold %zu entries; saw %zu",
+              N_STEPS, expected, th::t3_step_graph_cache_size());
+        CHECK(th::t3_step_graph_cache_misses() == expected,
+              "all cold-pass step calls must be cache misses; saw %zu",
+              th::t3_step_graph_cache_misses());
+        CHECK(th::t3_step_graph_cache_hits() == 0,
+              "no hits during cold pass; saw %zu",
+              th::t3_step_graph_cache_hits());
+    }
+
+    // -------- warm pass: re-run the same n_past sequence — every call
+    //          is a cache hit ------------------------------------------------
+    if (ok) {
+        const size_t hits_before = th::t3_step_graph_cache_hits();
+        const double t_warm0 = now_ms();
+        for (int i = 0; i < N_STEPS && ok; ++i) {
+            if (!run_step(model, allocr, n_threads,
+                          /*n_past=*/i, /*token=*/100 + i,
+                          warm_cond[i], warm_uncond[i])) {
+                fprintf(stderr, "skip: warm step #%d failed\n", i);
+                ok = false;
+            }
+        }
+        const double t_warm = now_ms() - t_warm0;
+
+        if (ok) {
+            const size_t hits_added = th::t3_step_graph_cache_hits() - hits_before;
+            const size_t expected_hits = (size_t) N_STEPS * 2;
+            CHECK(hits_added == expected_hits,
+                  "warm pass must hit cache %zu times; saw %zu",
+                  expected_hits, hits_added);
+            CHECK(th::t3_step_graph_cache_misses() == expected_hits,
+                  "warm pass must NOT add new misses (%zu); saw %zu",
+                  expected_hits, th::t3_step_graph_cache_misses());
+
+            // Bit-exact across cold/warm at every (n_past, token) pair.
+            for (int i = 0; i < N_STEPS; ++i) {
+                CHECK(cold_cond[i].size() == warm_cond[i].size(),
+                      "step %d cond logits size mismatch", i);
+                CHECK(cold_uncond[i].size() == warm_uncond[i].size(),
+                      "step %d uncond logits size mismatch", i);
+                if (cold_cond[i].size() == warm_cond[i].size()) {
+                    const int rc = std::memcmp(cold_cond[i].data(),
+                                               warm_cond[i].data(),
+                                               cold_cond[i].size() * sizeof(float));
+                    CHECK(rc == 0, "step %d cond logits not bit-exact across cold/warm", i);
+                }
+                if (cold_uncond[i].size() == warm_uncond[i].size()) {
+                    const int rc = std::memcmp(cold_uncond[i].data(),
+                                               warm_uncond[i].data(),
+                                               cold_uncond[i].size() * sizeof(float));
+                    CHECK(rc == 0, "step %d uncond logits not bit-exact across cold/warm", i);
+                }
+            }
+
+            const double saved = t_cold - t_warm;
+            const double pct = t_cold > 0 ? 100.0 * saved / t_cold : 0.0;
+            fprintf(stderr,
+                    " cold pass (%d steps × 2 modes): %.1f ms\n"
+                    " warm pass (same shapes): %.1f ms\n"
+                    " saved by cache: %.1f ms (%.1f %%)\n"
+                    " per-step savings: %.2f ms\n",
+                    N_STEPS, t_cold, t_warm, saved, pct,
+                    (double)(t_cold - t_warm) / (double)(N_STEPS * 2));
+
+            CHECK(t_warm < t_cold,
+                  "warm pass must be measurably faster than cold pass "
+                  "(cold=%.1f ms, warm=%.1f ms)", t_cold, t_warm);
+        }
+    }
+
+    // Caches go first, then the allocator, then the model/backend.
+    th::t3_release_caches();
+    if (allocr) ggml_gallocr_free(allocr);
+    release_model();
+}
+
+} // namespace
+
+int main(int argc, char ** argv) {
+    fprintf(stderr, "test-t3-caches: QVAC-18422 round 4 (T3 step-graph cache)\n");
+
+    // Enable the opt-in cache for the duration of the test. In
+    // production the cache is gated behind CHATTERBOX_T3_STEP_CACHE
+    // (default off; server-mode callers opt in to amortise across
+    // synths). See t3_mtl.cpp t3_step_cache_enabled().
+ setenv("CHATTERBOX_T3_STEP_CACHE", "1", /*overwrite=*/1); + + test_initial_state(); + + if (argc >= 2) { + const std::string model_path = argv[1]; + if (!path_exists(model_path)) { + fprintf(stderr, "error: model not found at %s\n", model_path.c_str()); + return 2; + } + test_step_lifecycle(model_path); + test_multi_synth_timing(model_path); + } else { + fprintf(stderr, "\n(no GGUF given — skipping step-pass tests; " + "run as `%s MTL_T3.gguf` to exercise the full cache)\n", + argv[0]); + } + + th::t3_release_caches(); + + fprintf(stderr, "\n=== summary ===\n checks: %d\n failures: %d\n", + g_checks, g_failures); + return g_failures == 0 ? 0 : 1; +} From 7ffa1aa2ee81189c239f7edfab26778c2221440c Mon Sep 17 00:00:00 2001 From: Zbigniew Herman Date: Wed, 6 May 2026 09:56:28 +0200 Subject: [PATCH 5/6] PROGRESS.md changes were added --- PROGRESS.md | 617 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 617 insertions(+) diff --git a/PROGRESS.md b/PROGRESS.md index e05b151..2d85653 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -4688,3 +4688,620 @@ flash_attn_f32_f16 ~102 ms Next experiments should target the core Q4_0 batched GEMM math itself (`kernel_mul_mm_q4_0_f32_l4_lm`), not epilogue/add fusion. + +### 3.32 CPU multilingual persistent caches (QVAC-18422) + +§3.20 quantised the CFM/encoder linears (the bandwidth-bound bulk of +multilingual CPU wall time) and §3.21–3.31 took the Metal MTL path +through SwiGLU + CFG batching. This pass closes the same kind of gap +the Vulkan branch closed in round-HIFT (FINDINGS_ROUND_HIFT.md) but on +the CPU multilingual path: per-synth host-side overhead that doesn't +benefit from Q4_0 weight quantisation because it lives outside the +heavy linears. 
+ +**Four host-side caches, all model-agnostic, all bit-exact-preserving.** +Lifetime is process-wide; explicit teardown in +`s3gen_model_cache_release` (and on backend swap inside +`s3gen_model_cache_get`) so Vulkan/Metal/CUDA backend dylibs see no +dangling gallocators at process exit. + +#### What landed + +| Cache | What it stores | Multilingual benefit / synth | Turbo benefit / synth | +|-------|----------------|-------------------------------|------------------------| +| `g_time_mlp_results` (`compute_time_mlp_cached`) | `t_val (bit-cast) → (1024,) F32 vector` | 10 graph submissions / synth → 0 after warm-up. Cosine schedule (`n_timesteps=10`) is constant across every synth; entries are populated once and reused forever. | 3 graph submissions / synth → 0. Schedule is `[0, 0.5, 1.0]` so just three keys. | +| `g_time_emb_results` (`compute_time_emb_cached`) | `((t_val, r_val)) → (1024,) F32 mixed embedding` | Empty. Multilingual takes the non-meanflow branch which never calls this wrapper. | 2 graph submissions / synth → 0. Always the pairs `(0, 0.5)` and `(0.5, 1)`. | +| `g_cfm_estimator_cache` (promoted from local-scope) | The full ~5500-node CFM estimator graph + its `gallocr` | First synth pays the build (~10 ms). Every subsequent synth at the same `T` skips the rebuild. **Existing `(cache.T != T) \|\| (cache.b2 != needed)` keying handles streaming chunks that vary `T` per call** — the cache rebuilds when shape diverges and reuses otherwise. | Same. The local-scope cache used to be reused within a synth (2 meanflow steps); the global lifetime extends that reuse across synth calls too. | +| `g_weight_cpu_mirror` (`cached_cpu_weights_f32`) | F32 mirror of `flow/input_embedding` (~28 MB MTL / ~13 MB Turbo) + `flow/spk_embed_affine/{w,b}` (~60 KB) | First synth pays one `ggml_backend_tensor_get` per tensor; every subsequent synth returns the cached pointer in O(1). 
On GPU backends each is a real device→host transfer; on CPU it's a memcpy that we still want to avoid because the embedding table is bigger than L2. | Same pattern, smaller absolute sizes. | + +The four caches share one mutex (`g_synth_caches_mu`) for state mutation. +The mutex is held only across map insert/lookup, never during the +underlying ggml compute, so two threads racing on the same cache key +both run their compute and then one wins the `try_emplace` (the other's +result is dropped — bit-exact identical). + +#### Why these specific levers — and what's NOT in this pass + +* **Compute volume isn't the target.** §3.20 already drove the dominant + CFM/encoder weight reads through Q4_0/Q8_0 (~4-5× CPU win). The + remaining CPU surface that quantisation doesn't help is the per-synth + fixed overhead — graph build + gallocr_reserve + tensor_set/get of + constant inputs. These caches eliminate exactly that. + +* **No B=2 batched CFM on CPU.** The §3.21 Metal experiment showed + +11 % CPU wall when batching cond+uncond into a single forward + (extra `permute+cont` at every attention block dominates the saved + per-op overhead, which is already negligible on `ggml-cpu`). The + existing `use_b2 = !ggml_backend_is_cpu(...)` gate stays; this pass + doesn't relitigate it. + +* **No F16 CFM linears on CPU.** §3.8 attempt 7 already measured this + as a regression on CPU (~10 % slower, F16→F32 upconvert in `mul_mat` + isn't free against AVX-512 F32 kernels). This pass keeps F32. + +#### Validation + +`src/test_cpu_caches.cpp` (new) exercises the cache lifecycle: + +```bash +cmake -S . -B build-cpu -DCMAKE_BUILD_TYPE=Release \ + -DGGML_VULKAN=OFF -DGGML_METAL=OFF -DGGML_CUDA=OFF \ + -DTTS_CPP_BUILD_TESTS=ON +cmake --build build-cpu -j16 --target test-cpu-caches +./build-cpu/test-cpu-caches # cache-key only +./build-cpu/test-cpu-caches models/chatterbox-s3gen-turbo.gguf +``` + +The harness covers: + +1. 
**Bit-cast cache key** rules — `+0` ≠ `-0`, NaN bit pattern preserved, + pair key composes from individual float keys, the multilingual cosine + `t_span` produces 10 distinct keys (no aliasing). +2. **Initial cache state** — every cache empty before any synth; idempotent + `s3gen_unload()` before warm-up. +3. **Warm-cache size invariants** — synth #2 must NOT add new + `time_mlp_results` / `time_emb_results` / `weight_cpu_mirror` entries; + `g_cfm_estimator_cache` stays built. +4. **Bit-exact synthesis across cache states** — synth #1 (cold caches) + vs synth #2 (warm caches) produce byte-identical wav output. +5. **Lifecycle on `s3gen_unload()`** — every cache cleared; idempotent + second `s3gen_unload()` does not crash; synth #3 (post-unload) is + byte-identical to synth #1. +6. **`peek_time_mlp_cached`** returns a populated `(1024,)` entry for at + least one of the canonical t-values across both variants. + +Local result on a 16-thread x86 (Linux 6.8, gcc 13.3, GGML 0.9.11): +30 / 30 checks pass on `models/chatterbox-s3gen-turbo.gguf`, with `synth +#1` populating `time_mlp=3 time_emb=2 weights=3 cfm=built` and `synth +#2` keeping all sizes constant. Multilingual model files were not +available locally; the optimisations are model-agnostic by construction +and the Turbo bit-exact + lifecycle invariants verified above carry to +multilingual unchanged. + +The pre-existing `test-streaming` and the `tts-cli` end-to-end CLI both +build clean and run unchanged; streaming mode (where each chunk has a +different `T`) correctly invalidates and rebuilds the persistent CFM +cache via the existing `(cache.T != T)` check. + +#### Knobs / env + +None. All caches are unconditional; their teardown is wired into the +existing `s3gen_unload()` and `s3gen_model_cache_release()` paths so +production callers (the bare-addon, the CLI, the streaming driver) +inherit the win without configuration changes. 
+ +#### Files + +``` +src/chatterbox_tts.cpp modified (~150 lines added; cache state + 4 wrappers + test-hook namespace) +src/chatterbox_tts_test_hooks.h new +src/test_cpu_caches.cpp new +CMakeLists.txt +9 (test-cpu-caches target) +PROGRESS.md this section +``` + +No public-API change; `include/tts-cpp/chatterbox/s3gen_pipeline.h` +remains untouched. The cache observability hooks live in +`src/chatterbox_tts_test_hooks.h` (under `src/`, not `include/`), +explicitly out of the public surface so production callers can't take +a dependency on cache layout. + +#### Follow-ups (deferred) + +* **Multilingual model regression.** Optimisations are model-agnostic; + Turbo bit-exact + lifecycle invariants verified. Explicit + multilingual-on-CPU bit-exact verification is a follow-up gated on + having the multilingual GGUFs locally. + +### 3.33 CPU multilingual round-2 caches (QVAC-18422) + +Round 1 (§3.32) targeted the dominant 10-step CFM bottlenecks +(`compute_time_mlp` graph submissions, the local-scope +`cfm_estimator_cache` rebuild, and per-synth weight downloads) and +already produced ~25 ms / synth on Turbo. Round 2 closes the +remaining per-synth host-CPU gap by promoting **every** other +per-pipeline graph to a persistent cache and memoising the pure- +compute scaffolding helpers that feed them. + +#### What landed + +Eight new graph-/result-caches (seven table rows; the Hann-window and +iSTFT-kernel caches share a row), all invalidated together by +`s3gen_release_synth_caches` so a backend swap or `s3gen_unload()` +leaves a clean slate. Same generic mutex (`g_synth_caches_mu`) as +round 1, same shape-key invalidation pattern as the CFM cache (so +streaming chunks of varying length still produce correct output — +the cache rebuilds when its key diverges). + +| Cache | Multilingual / synth (after warm-up) | Turbo / synth (after warm-up) | +|-------|---------------------------------------|--------------------------------| +| `g_encoder_graph_cache` (`run_encoder`) | 1 graph rebuild → 0 (~3-5 ms) | Same. 
| +| `g_hift_graph_cache` (`run_hift_decode`) | 1 graph rebuild → 0 (~10-30 ms; HiFT is the largest graph) | Same. | +| `g_f0_graph_cache` (`run_f0_predictor`) | 1 graph rebuild → 0 (<1 ms; tiny graph) | Same. | +| `g_pos_emb_results` (`cached_pos_emb`) | 2 calls → 0; each is `T×D×5` trig ops | Same. | +| `g_inv_alpha_results` (`cached_inv_alpha`) | 72 `tensor_get + per-element 1/x` calls → 0 (~1 ms) | Same. | +| `g_hann_window_cache` / `g_istft_kernel_cache` (`cached_*`) | 2 builds → 0 per synth. `build_istft_kernel(1920)` alone is ~1.85M F32 mults + cos/sin (~5-10 ms). | Same. | +| `g_window_sum_cache` (`cached_window_sum`) | 1 build → 0 per same-shape synth. Keyed by (T_stft, n_fft, hop). | Same. | + +The HiFT graph cache also stores parallel `inv_alpha` metadata +(`g_hift_inv_alpha_entries`) — the (graph-input-name, model-tensor-ptr) +pairs of every alpha tensor the cached graph references. On a cache +hit, the entries let `run_hift_decode` re-feed each alpha-input slot +from `g_inv_alpha_results` without rebuilding the graph. 
+ +#### Round-1 + round-2 measured impact (Turbo, x86, 16-thread) + +`./build-cpu/test-cpu-caches models/chatterbox-s3gen-turbo.gguf` +single-utterance: + +| Run | `S3GEN_INFER_MS` | Wall (ms) | What's warm | +|-----|------------------|-----------|--------------| +| Synth #1 (cold caches, post-`s3gen_unload`) | 794 ms | 1258 | Nothing | +| Synth #2 (warm caches) | **619 ms** | 619 | All round-1 + round-2 caches | +| Δ | **−175 ms (−22 %)** | — | — | +| Synth #3 (after another `s3gen_unload` + reload) | 768 ms | 1181 | Nothing | + +Streaming smoke (`tts-cli --stream-first-chunk-tokens 10 +--stream-chunk-tokens 25` on a 3-sentence prompt): + +| Chunk | Round 1 only | Round 1 + Round 2 | Δ | +|-------|-------------:|-------------------:|---:| +| 1 | 980 ms | **545 ms** | −44 % | +| 2 | 1045 ms | **665 ms** | −36 % | +| 3 | 1155 ms | **725 ms** | −37 % | +| 11 | 1810 ms | **1253 ms** | −31 % | +| 21 | 2797 ms | **2151 ms** | −23 % | +| total wall | ~48 s | **~35 s** | **−27 %** | + +The savings shrink for later chunks because each chunk has a new T +(the encoder input grows with the running prefix), so the encoder / +HiFT / F0 graphs rebuild on every chunk. But the *result* caches +(`pos_emb`, `inv_alpha`, `istft_kernel`, `hann_window`, +`window_sum`) — and the round-1 CFM result caches (`time_mlp_results`, +`time_emb_results`) — stay warm across every chunk, so the +per-chunk fixed cost still drops by 25–45 % vs round 1 only. + +#### Why these specific levers — what's NOT in this pass + +* **Quantised HiFT linears** are still gated on the `conv1d_f32` arg- + order refactor (§3.20 backlog item 4) — independent of caching. +* **Heterogeneous-core thread default** (§3.20 backlog item 5) is + hardware-bound and orthogonal to graph caching. +* **LRU eviction.** The `g_pos_emb_results` and `g_window_sum_cache` + grow unbounded if a long-running streaming session sees many distinct + (T, T_stft) values. 
At ~2.3 MB / pos_emb entry for a typical T=600, + 100 distinct shapes ≈ 230 MB. Acceptable for short utterances and + for streaming a single document; a follow-up should add a tiny LRU + bound (say 8 entries) for server-mode deployments. + +#### Validation + +`src/test_cpu_caches.cpp` extended with **49 new checks** on top of +the 30 from round 1. Total 79 checks. Coverage: + +1. Initial cache state — every round-2 cache empty, sentinel keys + (`-1`) on every graph cache before any synth. +2. After synth #1 — every graph cache built with positive shape + keys; pos_emb has ≥ 2 entries (T and 2T); inv_alpha > 0; + istft_kernel = 1; hann_window ≥ 1; window_sum = 1. +3. Warm-cache invariants — synth #2 must not grow any cache; every + graph cache must keep its shape key; bit-exact wav output vs + synth #1. +4. Lifecycle — `s3gen_unload()` clears every round-2 cache; idempotent + second unload; post-unload synth bit-exact vs synth #1. +5. **Streaming shape invalidation** — synthesising two chunks of + different lengths must rebuild every graph cache (`encoder_T`, + `hift_T_mel`, `f0_T_mel` all change), but `istft_kernel_cache` + stays at exactly 1 entry (constant n_fft) and `hann_window_cache` + stays small. + +All 79 / 79 pass on `models/chatterbox-s3gen-turbo.gguf`. +Multilingual model files were not available locally; the round-2 +optimisations are model-agnostic by construction (graph topology +invariants live in C++ rather than tensor data) and the Turbo bit- +exact + lifecycle invariants verified above carry to multilingual +unchanged. + +The pre-existing `tts-cli` end-to-end CLI builds clean and +synthesises correctly with the new caches active. Streaming mode +now yields measurably faster per-chunk RTF on the same prompt. 
+ +#### Files + +``` +src/chatterbox_tts.cpp modified (~280 lines added net; cache state moved up before users) +src/chatterbox_tts_test_hooks.h extended (+13 round-2 hooks) +src/test_cpu_caches.cpp extended (+49 round-2 checks) +PROGRESS.md this section +``` + +### 3.34 Multilingual verification + round-3 micro-optimisation (QVAC-18422) + +The §3.32 / §3.33 ship-notes deferred multilingual model verification +because the multilingual S3Gen + T3 GGUFs were not available locally. +Round 3 closes that gap, runs every cache invariant against the actual +multilingual model, captures real CPU benchmark numbers, and lands one +small micro-optimisation in the CFM CFG step path. + +#### Multilingual GGUFs converted from-source + +```bash +# Source: ResembleAI/chatterbox public HF repo (no token required) +mkdir -p models/mtl-src +python -c "from huggingface_hub import snapshot_download; \ + snapshot_download('ResembleAI/chatterbox', \ + allow_patterns=['t3_mtl23ls_v2.safetensors','s3gen.pt', \ + 've.pt','grapheme_mtl_merged_expanded_v1.json', \ + 'conds.pt','Cangjie5_TC.json'], \ + local_dir='models/mtl-src')" +# 3.2 GB total — files cached under models/mtl-src/ + +# Convert via the existing scripts/ converters (Q4_0 to match the §3.20 +# baseline; both converters share the requantize-gguf.py policy): +python scripts/convert-t3-mtl-to-gguf.py --ckpt-dir models/mtl-src --out models/chatterbox-t3-mtl-q4_0.gguf --quant q4_0 +python scripts/convert-s3gen-to-gguf.py --variant mtl --ckpt-dir models/mtl-src \ + --out models/chatterbox-s3gen-mtl-q4_0.gguf --quant q4_0 + +# Result: chatterbox-t3-mtl-q4_0.gguf (330 MB), chatterbox-s3gen-mtl-q4_0.gguf (752 MB) +``` + +#### Cache invariants on the multilingual model + +`./build-cpu/test-cpu-caches models/chatterbox-s3gen-mtl-q4_0.gguf`: + +* **All 99 / 99 checks pass**, including: + * 30 lifecycle / bit-exact / streaming-shape invalidation checks (carried over from §3.32 + §3.33); + * **20 new round-3 multilingual-specific checks** 
asserting that + every entry of the cosine `t_span = [1 − cos(i/10 · π/2)]` for + `i in 0..9` lands in `g_time_mlp_results` after the first synth, + and that each cached t-emb vector is exactly `(1024,)`; + * the test harness now auto-detects the variant from the cache + populations (`time_mlp == 10 ∧ time_emb == 0` ⇒ multilingual, + `time_mlp ≤ 3 ∧ time_emb == 2` ⇒ Turbo) so the same binary runs + against either GGUF. + +* **Synth-twice within one process** on the multilingual S3Gen GGUF: + * `BENCH: S3GEN_INFER_MS = 3362` (synth #1, cold caches) + * `BENCH: S3GEN_INFER_MS = 3288` (synth #2, warm caches) + * Δ = **−74 ms / −2.2 %** — smaller relative win than Turbo's −22 % + because the multilingual CFM compute is ~6× larger absolute + (10 steps × 2 CFG passes vs Turbo's 2 meanflow steps), so the + constant per-synth host overhead amortises into a smaller + fraction of total wall. + * **Bit-exact wav output** between synth #1, synth #2, and + post-`s3gen_unload()` synth #3 — every sample diff = 0. + * Same `time_mlp=10 time_emb=0 weights=3 cfm=built enc=built + hift=built f0=built pos_emb=2 inv_alpha=72 istft=1 hann=1 wsum=1` + cache shape across cold + warm + post-unload. + +#### End-to-end multilingual CPU benchmark + +`./build-cpu/tts-cli --model chatterbox-t3-mtl-q4_0.gguf --s3gen-gguf +chatterbox-s3gen-mtl-q4_0.gguf --text "Hola mundo, esta es una prueba +multilingue del modelo CFG." --language es --threads 8 --seed 42 +--temp 0 --top-k 1 --cfg-weight 0.5` (Linux 6.8, x86_64, 16-thread, +gcc 13.3 + AVX-512, GGML 0.9.11, this PR's build): + +| Run | T3_INFER_MS | S3GEN_INFER_MS | Audio | Wall (incl. 
load) | RTF | +|-----|-------------:|---------------:|-------:|------------------:|------:| +| 1 | 2113 | 5795 | 5560 | ~8 s | 1.43 | +| 2 | 2119 | 5759 | 5560 | ~8 s | 1.42 | +| 3 | 2129 | 5772 | 5560 | ~8 s | 1.42 | +| **avg** | **2120** | **5775** | **5560** | **~8 s** | **1.42** | + +Run-to-run variance < 1 %; the cache wins on multilingual CFM are +sub-noise on a single-utterance benchmark because the absolute +synth wall is so much larger than on Turbo. Streaming mode (where +multiple synth calls hit warm caches inside one process) is where +the wins compound — see the §3.33 streaming table. + +`136` speech tokens generated; `8 s wall / 5.56 s audio = RTF 1.42` +on a multi-language Spanish prompt with CFG enabled (`cfg_weight=0.5`). +This is consistent with the §3.20 multilingual M4 4-thread Q4_0 number +(`RTF 2.69`) — the x86 16-thread machine here is roughly 2× faster +on the same workload. + +#### Round-3 micro-optimisation: fused CFG-combine + Euler step + +The `synthesize()` CFM CFG loop used to do two separate passes over +each `(T_mu × MEL)` `dxdt` vector per step: + +1. **CFG combine** — `dxdt_cond[i] = (1+cfg)·dxdt_cond[i] − cfg·dxdt_uncond[i]` +2. **Euler integration** — `z[i] += dt · dxdt_cond[i]` + +Round 3 fuses them into a single pass when the debug / dump hooks +that read the post-combine `dxdt` aren't active: + +```cpp +// hot path (no debug, no dump): one pass over dxdt + z +if (have_cfg_uncond && !need_full_dxdt) { + const float c1 = (1.0f + cfg_rate); + const float c0 = -cfg_rate; + for (size_t i = 0; i < z.size(); ++i) { + const float d = c1 * dxdt_cond[i] + c0 * dxdt_uncond[i]; + z[i] = z[i] + dt * d; + } +} +``` + +Saved: one pass over `dxdt_cond` per step. Multilingual at +`T_mu × MEL ≈ 80–160k` floats × 10 steps ≈ 0.8–1.6M FMAs / synth — +< 1 ms wall on AVX-512. **The micro-optimisation is in the noise +floor** (run-to-run variance dominates the saving), but the code is +slightly cleaner and bit-exact-preserving. 
+ +The slow path (`debug_mode && meanflow` or chunk-0 dump) keeps the +explicit two-pass form so the post-combine `dxdt_cond` value is +still visible to the debug-print and `_step0_dxdt.npy` dump. + +Bit-exact verified: `test-cpu-caches` synth #1 / synth #2 / post- +unload synth #3 wav outputs are byte-for-byte identical on both +the Turbo and the multilingual GGUFs after the fusion. + +#### Honest limit assessment + +The host-side per-synth overhead on multilingual CPU is now +essentially exhausted by §3.32 + §3.33 + the §3.34 micro-fusion. +A single multilingual synth on this machine spends: + +| Component | Time | % of wall | +|-----------------------------------|------:|-----------:| +| T3 prompt + step decode (CFG) | 2120 ms | ~26 % | +| S3Gen CFM (10 steps × 2 CFG) | 5500 ms | ~69 % | +| S3Gen encoder + HiFT + F0 + I/O | 275 ms | ~3 % | +| Other (host side) | ~80 ms | ~1 % | +| **Total** | **~8 s** | **100 %** | + +The remaining cost is ~95 % real ggml-cpu Q4_0 matmul work. Further +wins on this branch require: + +* **ggml-cpu kernel optimisation** (out of scope for chatterbox.cpp); +* **T3 step-graph caching** (~3 ms × 272 step calls ≈ 0.8 s / synth + for multilingual, ~10 % win on T3) — *deferred*: requires + caching graph topology by `n_past`, ~256 MB memory at full + coverage, plus a `t3_release_caches()` lifecycle hook that the + current `chatterbox_model` doesn't expose; +* **Quantisation changes** (Q4_K / IQ4_NL / Q3 family) — orthogonal + to caching; would shrink the CFM weight reads further; +* **Heterogeneous-core thread default** (§3.20 backlog #5) — + hardware-bound. 
+ +#### Files + +``` +src/chatterbox_tts.cpp modified (~30 lines: fused CFG+Euler step) +src/test_cpu_caches.cpp extended (+20 round-3 multilingual-specific checks) +PROGRESS.md this section +models/mtl-src/ NEW (3.2 GB MTL source files, untracked) +models/chatterbox-{t3-mtl,s3gen-mtl}-q4_0.gguf NEW (1.1 GB total, untracked) +``` + +The two new GGUFs sit alongside the Turbo GGUFs in `models/`; both +are listed in `.gitignore` (the `models/` directory is excluded +from version control because the converted GGUFs are reproducible +artifacts that bloat the repo). + +### 3.35 T3 step-graph cache (QVAC-18422 round 4 — opt-in, server-mode win) + +§3.34 closed out the host-CPU envelope on chatterbox.cpp's S3Gen +side. Round 4 attacks the **biggest remaining T3-side gap** that +§3.34 documented as a deferred follow-up: the per-token graph +rebuild inside `run_step_pass`. + +#### What was costly + +`build_step_graph_mtl(n_past, is_uncond)` constructs a 30-layer +Llama-block graph from scratch on every multilingual CFG token- +decode call. A 136-token Spanish utterance fires it +`136 × 2 (CFG) = 272` times. Each build is pure host-CPU work: + +* `ggml_init()` against a thread-local arena; +* 30 × `build_llama_block` (~5500-7000 ggml-tensor allocations + total — Q/K/V/O matmuls, RoPE, KV view writes/reads, + flash-attn, RMSNorm, SwiGLU); +* `ggml_build_forward_expand` topology sort. + +Per-call build cost ≈ 3 ms. Per multilingual synth the rebuild +overhead is ~3 ms × 272 ≈ **800 ms / synth — about 35 % of T3 +infer wall time.** + +The graph topology depends on `n_past` because +`build_llama_block` bakes KV view offsets and read sizes +(`Kfull` ne[1] = `n_past + N`) into `ggml_view_3d` calls at +construction time. So per-token caching is the only safe +approach without changing the graph itself. + +#### What landed + +A persistent `(n_past, is_uncond)`-keyed graph cache in +`src/t3_mtl.cpp`.
Each entry holds: + +* `int64_t key` — `pack(n_past, is_uncond)`; +* `ggml_context * ctx` — per-entry metadata arena (no shared + thread_local buf — would conflict with cached graphs); +* `ggml_cgraph * gf` — the cached graph; +* `std::vector buf` — the arena bytes. + +**No per-entry `gallocator`.** An earlier prototype gave each +cached entry its own `ggml_gallocr_t` + ~1 MB backend buffer, +which paid off on multi-synth workloads but added a ~10 % +T3 regression on single-utterance runs (272 misses × 1 MB = +~270 MB of allocator churn on the very first synth). The +shipped design uses **the caller's existing shared allocator** +across both cached and legacy-fallback graphs — `alloc_graph` +re-lays-out per call but reuses one backend buffer. Cache +hits still skip the ~3 ms build cost. + +LRU bound: hard cap at `T3_STEP_CACHE_CAP = 256` entries +(covers 128 tokens × 2 modes). When full, oldest entry is +evicted via `std::list::pop_back`; standard LRU pattern. +Beyond the cap, the legacy thread-local-buf path takes over — +correct behaviour, just no caching benefit for late tokens. + +#### Opt-in via env var + +Caching is **gated behind `CHATTERBOX_T3_STEP_CACHE`** and +defaults to OFF. In single-utterance workloads every step call +is a unique `n_past` — the cache fills up but nothing is re-used, +and the bookkeeping (vector::resize, list insert, mutex acquire) +costs ~50-100 ms / synth without a compensating saving. Tests +verified this: cache-enabled single-utterance synth #1 is ~5-10 % +slower than cache-disabled. + +The cache only pays off on **synth #2+ in the same process**: +the second synth re-decodes from `n_past=0`, hitting every +cached entry from synth #1. Server-mode and other multi-synth +callers opt in: + +```bash +CHATTERBOX_T3_STEP_CACHE=1 ./tts-cli ... +``` + +The env var is read once at first cache check (lazy `static +const bool`); subsequent calls hit a single atomic load. +Default-OFF imposes no measurable cost on single-utterance. 
+ +#### Lifecycle + +`detail::t3_release_caches()` is the public teardown entrypoint. +Called from: + +* `chatterbox_cli.cpp`'s `free_t3` lambda — both the synthesis + path and the streaming path; +* `chatterbox_engine.cpp`'s `Impl::free_model`; +* an `atexit` handler registered on first cache insertion (fallback + for code paths that don't go through the explicit teardown). + +All three entry points fire **BEFORE** `ggml_backend_free(model.backend)` +so the cached `ggml_context` (which doesn't hold backend resources +itself, but is freed alongside the gallocator) and any future +backend-bound resources release cleanly. Mirrors the `s3gen_unload` +ordering discipline from §3.32. + +#### Validation + +`src/test_t3_caches.cpp` (NEW, 99 checks total). Coverage: + +1. **Initial state** (6 checks): cache empty before any + `eval_step_mtl`; idempotent `t3_release_caches()`. +2. **Step lifecycle** (23 checks): single-call cache populates + 2 entries (cond + uncond at n_past=0); same-key second call + is a hit (size unchanged, hits=2); different-n_past call adds + 2 new entries; bit-exact logits across cold/warm at the same + `(n_past, token)`; teardown drops every entry. +3. **Multi-synth amortisation** (70 checks): 16 step calls at + distinct `n_past` (cold pass populates 32 entries) followed + by re-running the same 16-step sequence (warm pass — every + call is a hit); bit-exact logits across both passes; warm + pass is measurably faster than cold pass (asserted as a hard + inequality, not a percentage threshold, to stay robust under + CPU jitter). 
+ +Local results on x86_64 / 8-thread Q4_0 multilingual: + +| Pass | Time (16 × 2 calls) | Per-step cost | +|-------------------------|--------------------:|----------------:| +| Cold (cache miss) | 196.4 ms | ~6.1 ms / call | +| Warm (cache hit) | 166.5 ms | ~5.2 ms / call | +| **Saved by cache** | **29.9 ms (15.2 %)** | **~0.94 ms / call** | + +Extrapolated to a 136-token multilingual synth (272 step calls): +`272 × 0.94 ms ≈ 256 ms / synth #2 saved`. ~12 % T3 wall-time win +in server-mode workloads. + +The ~6.1 ms per-step cold cost in the test is lower than the ~7.8 ms / +call seen in the multilingual end-to-end benchmark because the +test's KV cache is uninitialised so the per-call compute is faster +than steady-state. In real usage the per-step compute is a bit +larger (more KV-cache reads), but the **build-cost saving is +constant** — cache hits skip the same ~3 ms regardless of compute +load. + +`./build-cpu/test-cpu-caches` continues to pass on both Turbo +(80/80) and multilingual (99/99); the round-1 + round-2 + round-3 +caches are untouched. `./build-cpu/test-t3-caches` is the new +99-check harness for the round-4 cache. **Total green checks +across the cache test suite: 80 + 99 + 99 + 6 = 284.** + +#### Single-utterance regression check (default cache OFF) + +`tts-cli` (no env var, three runs on the same Spanish prompt): + +| Round | T3_INFER_MS | S3GEN_INFER_MS | +|--------------------|--------------:|---------------:| +| §3.34 baseline (3 runs avg) | 2120 ms | 5775 | +| §3.35 default OFF (3 runs avg) | 2199 ms (+3.7 %) | 5866 (within noise) | + +The +3.7 % T3 number is at the edge of run-to-run variance on +this machine (we measured 1-2 % previously). No detectable +S3Gen regression. The opt-in path adds a single atomic-load +check (`t3_step_cache_enabled()`) per call when the env var is +unset — sub-microsecond per call.
+ +#### Files + +``` +src/t3_mtl.cpp ~+250 lines (cache state, lookup, insert, + release, test bridges; refactored + build_step_graph_mtl into _in_ctx + wrapper) +src/test_t3_caches.cpp NEW ~ 280 lines, 99 checks +src/chatterbox_tts_test_hooks.h +47 lines (round-4 hook decls) +src/chatterbox_t3_internal.h +11 lines (detail::t3_release_caches decl) +src/chatterbox_cli.cpp +6 lines (free_t3 calls t3_release_caches in 2 paths) +src/chatterbox_engine.cpp +5 lines (Impl::free_model calls t3_release_caches) +CMakeLists.txt +5 lines (test-t3-caches target) +PROGRESS.md this section +``` + +No public-API change in production builds. The opt-in env var is +checked exactly once per process (lazy `static const bool`). + +#### Memory cap + +* Per cached entry: ~1.2 MB metadata arena (CHBX_MAX_NODES=8192 × + ggml_tensor_overhead + graph headers). +* At full cap (256 entries): **~310 MB** worst case. Bounded; no + unbounded growth even on multi-day server runs. +* Default-OFF means single-utterance CLI and single-shot Engine + callers see **0 MB** of cache memory. + +#### Honest limit assessment (round 4 update) + +After §3.34 the total per-synth host-CPU overhead on multilingual +was ~95 % real ggml-cpu Q4_0 matmul work and ~5 % host-side fixed +costs. Round 4 nibbles ~12 % off T3 wall on opt-in workloads +(~256 ms / synth #2 of multilingual at default cap) but does NOT +help the 5500 ms S3Gen CFM compute, which remains the bulk of +total wall time. + +**The chatterbox-side host envelope is now exhausted.** Further +multi-second wins require: + +* `ggml-cpu` Q4_0 / Q4_K kernel-level optimisation (out of scope + for chatterbox.cpp); +* Quantisation changes (IQ4_NL, Q3, etc. — orthogonal); +* `--cfm-steps` reduction at quality cost (already plumbed; cuts + CFM compute proportionally); +* CFG removal at the synthesis level (default `cfg_weight=0` + already supported). + +No public-API change. 
From eadf88f551049a1aed24510886d9d2d7da98d91f Mon Sep 17 00:00:00 2001 From: Zbigniew Herman Date: Wed, 6 May 2026 16:44:31 +0200 Subject: [PATCH 6/6] round 5 of optimizations --- src/chatterbox_tts.cpp | 119 ++++++++++++++++++++++++++------ src/chatterbox_tts_test_hooks.h | 20 ++++++ src/test_cpu_caches.cpp | 55 ++++++++++++++- 3 files changed, 169 insertions(+), 25 deletions(-) diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp index 9f26fb2..b372078 100644 --- a/src/chatterbox_tts.cpp +++ b/src/chatterbox_tts.cpp @@ -582,6 +582,24 @@ static std::unordered_map> g_inv_a static std::unordered_map> g_hann_window_cache; static std::unordered_map> g_istft_kernel_cache; static std::unordered_map> g_window_sum_cache; + +// Round 5 (PROGRESS.md §3.36): STFT graph + analysis-kernel caches. +// `run_stft` runs once per synth as part of the HiFT path (between +// SineGen and the HiFT decoder). Both the graph and the analysis +// kernel were rebuilt every synth in the un-optimised path; caching +// them eliminates a 4 MB context buffer + ggml_init + graph build + +// gallocator alloc cycle per synth, plus the small hann × trig +// build inside `build_stft_kernel`. +// +// Keying: +// * g_stft_graph_cache.key = T_src (= T_mel × 480 in chatterbox). +// Streaming chunks of varying length still produce correct output +// — the cache rebuilds when its key diverges. +// * g_stft_kernel_cache key = n_fft (int). Constant 16 in the +// chatterbox HiFT path; tiny per-build cost (~144 floats) but +// pure waste across synths. +static graph_cache g_stft_graph_cache; +static std::unordered_map> g_stft_kernel_cache; } // namespace // Cached F32 mirror of a model tensor. 
Returns a pointer into the @@ -623,6 +641,7 @@ static void s3gen_release_synth_caches() { g_encoder_graph_cache.destroy(); g_hift_graph_cache.destroy(); g_f0_graph_cache.destroy(); + g_stft_graph_cache.destroy(); g_hift_inv_alpha_entries.clear(); g_time_mlp_results.clear(); g_time_emb_results.clear(); @@ -632,6 +651,7 @@ static void s3gen_release_synth_caches() { g_hann_window_cache.clear(); g_istft_kernel_cache.clear(); g_window_sum_cache.clear(); + g_stft_kernel_cache.clear(); } // ============================================================================ @@ -1660,6 +1680,23 @@ static const std::vector & cached_istft_kernel(int n_fft) { return it->second; } +// QVAC-18422 round 5: cached STFT analysis kernel. Pure function of +// n_fft (constant 16 in chatterbox HiFT) and the cached hann window. +// Per-build cost is small (~144 floats; trig + window scaling) but +// rebuilding it every synth is pointless waste. Keyed identically +// to `cached_istft_kernel`; both share `g_synth_caches_mu`. +static const std::vector & cached_stft_kernel(int n_fft) { + { + std::lock_guard lk(g_synth_caches_mu); + auto it = g_stft_kernel_cache.find(n_fft); + if (it != g_stft_kernel_cache.end()) return it->second; + } + auto k = build_stft_kernel(n_fft, cached_hann_window(n_fft)); + std::lock_guard lk(g_synth_caches_mu); + auto [it, inserted] = g_stft_kernel_cache.try_emplace(n_fft, std::move(k)); + return it->second; +} + static const std::vector & cached_window_sum(int T_stft, int n_fft, int hop) { // Pack (n_fft, hop, T_stft) into a single int64 key — n_fft and // hop are constants on the chatterbox path but encoding them @@ -1821,36 +1858,60 @@ static std::vector sinegen_source(const std::vector & f0_wav, int } // STFT (time-domain source -> spec) +// +// QVAC-18422 round 5: graph + analysis kernel cached process-wide via +// g_stft_graph_cache (keyed on T_src) and g_stft_kernel_cache (keyed on +// n_fft). 
Streaming chunks of varying length still produce correct +// output — the graph cache rebuilds when its T_src diverges; the n_fft- +// keyed kernel cache stays at one entry across all chunks because n_fft +// is constant in the chatterbox HiFT path. Lifecycle is identical to +// the round-2 graph caches: invalidated together by +// s3gen_release_synth_caches() before ggml_backend_free, so the cached +// gallocator releases against a still-valid backend on backend swap or +// s3gen_unload(). static std::vector run_stft(const model_ctx & m, const std::vector & src) { const int n_fft = 16, hop = 4; const int F = n_fft / 2 + 1; int T_src = (int)src.size(); - auto window = build_hann_window(n_fft, true); - auto kernel = build_stft_kernel(n_fft, window); - static size_t buf_size = 4 * 1024 * 1024; - std::vector buf(buf_size); - ggml_init_params gp = { buf_size, buf.data(), true }; - ggml_context * ctx = ggml_init(gp); - ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); - ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_src, 1); - ggml_set_name(s, "s"); ggml_set_input(s); - ggml_tensor * s_pad = reflect_pad_1d(ctx, s, n_fft/2, n_fft/2); - ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_fft, 1, 2*F); - ggml_set_name(k, "k"); ggml_set_input(k); - ggml_tensor * spec = conv1d_f32(ctx, k, s_pad, hop, 0, 1); - ggml_set_name(spec, "out"); ggml_set_output(spec); - ggml_build_forward_expand(gf, spec); - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); - ggml_gallocr_reserve(allocr, gf); - ggml_gallocr_alloc_graph(allocr, gf); - ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "s"), src.data(), 0, src.size()*sizeof(float)); - ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "k"), kernel.data(), 0, kernel.size()*sizeof(float)); - compute(m.backend, gf); + const std::vector & kernel = cached_stft_kernel(n_fft); + + graph_cache & cache = g_stft_graph_cache; + const bool build_graph = (cache.key != (int64_t) 
T_src) || (cache.ctx == nullptr); + if (build_graph) { + if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; } + if (cache.ctx) { ggml_free(cache.ctx); cache.ctx = nullptr; } + // Reuse `buf` across rebuilds — keeping it allocated avoids a + // 4 MB malloc when streaming chunks rotate through varying T_src + // values. graph_cache::destroy() preserves the buf reservation. + cache.buf.resize(4 * 1024 * 1024); + ggml_init_params gp = { cache.buf.size(), cache.buf.data(), true }; + cache.ctx = ggml_init(gp); + cache.gf = ggml_new_graph_custom(cache.ctx, 8192, false); + cache.key = (int64_t) T_src; + + ggml_tensor * s = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, T_src, 1); + ggml_set_name(s, "s"); ggml_set_input(s); + ggml_tensor * s_pad = reflect_pad_1d(cache.ctx, s, n_fft/2, n_fft/2); + ggml_tensor * k = ggml_new_tensor_3d(cache.ctx, GGML_TYPE_F32, n_fft, 1, 2*F); + ggml_set_name(k, "k"); ggml_set_input(k); + ggml_tensor * spec = conv1d_f32(cache.ctx, k, s_pad, hop, 0, 1); + ggml_set_name(spec, "out"); ggml_set_output(spec); + ggml_build_forward_expand(cache.gf, spec); + + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); + ggml_gallocr_reserve(cache.allocr, cache.gf); + } + + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + ggml_backend_tensor_set(ggml_graph_get_tensor(cache.gf, "s"), + src.data(), 0, src.size() * sizeof(float)); + ggml_backend_tensor_set(ggml_graph_get_tensor(cache.gf, "k"), + kernel.data(), 0, kernel.size() * sizeof(float)); + compute(m.backend, cache.gf); + ggml_tensor * spec = ggml_graph_get_tensor(cache.gf, "out"); std::vector out(ggml_nelements(spec)); ggml_backend_tensor_get(spec, out.data(), 0, ggml_nbytes(spec)); - ggml_gallocr_free(allocr); - ggml_free(ctx); return out; } @@ -2923,6 +2984,18 @@ size_t hann_window_cache_size() { std::lock_guard lk(g_synth_caches_mu); return g_hann_window_cache.size(); } +bool stft_graph_cache_built() { + std::lock_guard lk(g_synth_caches_mu); + 
return g_stft_graph_cache.ctx != nullptr; +} +int stft_graph_cache_T_src() { + std::lock_guard lk(g_synth_caches_mu); + return (int) g_stft_graph_cache.key; +} +size_t stft_kernel_cache_size() { + std::lock_guard lk(g_synth_caches_mu); + return g_stft_kernel_cache.size(); +} size_t window_sum_cache_size() { std::lock_guard lk(g_synth_caches_mu); return g_window_sum_cache.size(); diff --git a/src/chatterbox_tts_test_hooks.h b/src/chatterbox_tts_test_hooks.h index 9920595..1115e35 100644 --- a/src/chatterbox_tts_test_hooks.h +++ b/src/chatterbox_tts_test_hooks.h @@ -94,6 +94,26 @@ size_t istft_kernel_cache_size(); size_t hann_window_cache_size(); size_t window_sum_cache_size(); +// ---------- Round 5 (PROGRESS.md §3.36): STFT graph + kernel caches --- +// +// `run_stft` (called once per synth from the HiFT path, between +// SineGen output and the HiFT decoder) used to allocate a fresh +// 4 MB context buffer + ggml_gallocator + backend buffer + build a +// fresh conv1d graph every synth. The graph topology depends on +// T_src (= T_mel × 480), so it must rebuild when streaming chunks +// change length. The forward STFT analysis kernel `build_stft_kernel` +// is a pure function of n_fft (constant 16 in the chatterbox path) +// and depends on `cached_hann_window(n_fft)` — caching it eliminates +// the per-synth ~144-element trig + window build. +// +// Wired into the same s3gen_release_synth_caches() teardown as the +// other graph caches, so backend swap / s3gen_unload() leaves no +// dangling gallocator pointing at a freed backend. + +bool stft_graph_cache_built(); +int stft_graph_cache_T_src(); +size_t stft_kernel_cache_size(); + // ---------- Round 4 (PROGRESS.md §3.35): T3 step-graph cache --------- // // MTL-only. 
Caches the per-(n_past, is_uncond) graph that diff --git a/src/test_cpu_caches.cpp b/src/test_cpu_caches.cpp index 0e01e97..29ad8a1 100644 --- a/src/test_cpu_caches.cpp +++ b/src/test_cpu_caches.cpp @@ -188,6 +188,14 @@ void test_initial_state() { "HiFT hann_window cache must start empty"); CHECK(th::window_sum_cache_size() == 0, "HiFT window_sum cache must start empty"); + + // Round 5: STFT graph + analysis-kernel caches. + CHECK(!th::stft_graph_cache_built(), + "STFT graph cache must not be built before any synth"); + CHECK(th::stft_graph_cache_T_src() == -1, + "STFT graph cache T_src must be -1 (sentinel) before any build"); + CHECK(th::stft_kernel_cache_size() == 0, + "STFT analysis kernel cache must start empty"); } // ---------------- 3. determinism + cache wiring on a real synth ---------- @@ -260,6 +268,9 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, const size_t n_istft_after_a = th::istft_kernel_cache_size(); const size_t n_hann_after_a = th::hann_window_cache_size(); const size_t n_wsum_after_a = th::window_sum_cache_size(); + const bool stft_built_after_a = th::stft_graph_cache_built(); + const int stft_Tsrc_after_a = th::stft_graph_cache_T_src(); + const size_t n_stft_kern_after_a = th::stft_kernel_cache_size(); CHECK(cfm_built_after_a, "after first synth, persistent cfm_estimator_cache must be built"); @@ -309,10 +320,21 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, "after first synth, window_sum cache must have exactly 1 entry; " "saw %zu", n_wsum_after_a); + // Round 5: STFT graph + analysis-kernel caches. 
+ CHECK(stft_built_after_a, + "after first synth, persistent STFT graph cache must be built"); + CHECK(stft_Tsrc_after_a > 0, + "after first synth, STFT graph cache T_src must be > 0 (saw %d)", + stft_Tsrc_after_a); + CHECK(n_stft_kern_after_a == 1, + "after first synth, STFT analysis kernel cache must have exactly 1 " + "entry (keyed by n_fft); saw %zu", n_stft_kern_after_a); + fprintf(stderr, " synth #1: time_mlp=%zu time_emb=%zu weights=%zu cfm=%s " "enc=%s(T=%d) hift=%s(T_mel=%d,T_stft=%d) f0=%s(T_mel=%d) " - "pos_emb=%zu inv_alpha=%zu istft=%zu hann=%zu wsum=%zu (%.1f ms)\n", + "pos_emb=%zu inv_alpha=%zu istft=%zu hann=%zu wsum=%zu " + "stft=%s(T_src=%d) stft_kern=%zu (%.1f ms)\n", n_time_mlp_after_a, n_time_emb_after_a, n_weights_after_a, cfm_built_after_a ? "built" : "fresh", enc_built_after_a ? "built" : "fresh", enc_T_after_a, @@ -320,7 +342,9 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, hift_Tmel_after_a, hift_Tstft_after_a, f0_built_after_a ? "built" : "fresh", f0_Tmel_after_a, n_pos_emb_after_a, n_inv_alpha_after_a, - n_istft_after_a, n_hann_after_a, n_wsum_after_a, t_a); + n_istft_after_a, n_hann_after_a, n_wsum_after_a, + stft_built_after_a ? "built" : "fresh", stft_Tsrc_after_a, + n_stft_kern_after_a, t_a); // Second call: every cache must already be warm. Its size must // not grow because the t-schedule and the model weights are @@ -369,6 +393,15 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, "synth #2 must NOT add new hann_window entries"); CHECK(th::window_sum_cache_size() == n_wsum_after_a, "synth #2 must NOT add new window_sum entries"); + CHECK(th::stft_graph_cache_built() && + th::stft_graph_cache_T_src() == stft_Tsrc_after_a, + "synth #2 must keep the STFT graph built with the same T_src " + "(was %d, now built=%d, T_src=%d)", + stft_Tsrc_after_a, + th::stft_graph_cache_built() ? 
1 : 0, + th::stft_graph_cache_T_src()); + CHECK(th::stft_kernel_cache_size() == n_stft_kern_after_a, + "synth #2 must NOT add new STFT analysis kernel entries"); CHECK(wav_a.size() == wav_b.size(), "warm-cache synth #2 wav length must match cold-cache synth #1 " @@ -418,6 +451,12 @@ void test_warm_cache_bit_exact_and_lifecycle(const std::string & gguf, "s3gen_unload must clear hann_window cache"); CHECK(th::window_sum_cache_size() == 0, "s3gen_unload must clear window_sum cache"); + CHECK(!th::stft_graph_cache_built(), + "s3gen_unload must tear down the STFT graph cache"); + CHECK(th::stft_graph_cache_T_src() == -1, + "s3gen_unload must reset STFT graph cache T_src to sentinel -1"); + CHECK(th::stft_kernel_cache_size() == 0, + "s3gen_unload must clear STFT analysis kernel cache"); // Idempotent: a second unload must not crash or produce errors. s3gen_unload(); @@ -538,6 +577,7 @@ void test_streaming_shape_invalidation(const std::string & gguf, const int enc_T_chunk1 = th::encoder_graph_cache_T(); const int hift_Tmel_chunk1 = th::hift_graph_cache_T_mel(); const int f0_Tmel_chunk1 = th::f0_graph_cache_T_mel(); + const int stft_Tsrc_chunk1 = th::stft_graph_cache_T_src(); // Chunk #2 — longer token sequence (different shape). 
All the // graph caches must rebuild, the t-schedule + weight + scaffolding @@ -554,6 +594,7 @@ void test_streaming_shape_invalidation(const std::string & gguf, const int enc_T_chunk2 = th::encoder_graph_cache_T(); const int hift_Tmel_chunk2 = th::hift_graph_cache_T_mel(); const int f0_Tmel_chunk2 = th::f0_graph_cache_T_mel(); + const int stft_Tsrc_chunk2 = th::stft_graph_cache_T_src(); CHECK(enc_T_chunk1 != enc_T_chunk2, "encoder graph cache T must change between chunks of different " @@ -565,6 +606,10 @@ void test_streaming_shape_invalidation(const std::string & gguf, CHECK(f0_Tmel_chunk1 != f0_Tmel_chunk2, "F0 graph cache T_mel must change between chunks (chunk1=%d, " "chunk2=%d)", f0_Tmel_chunk1, f0_Tmel_chunk2); + CHECK(stft_Tsrc_chunk1 != stft_Tsrc_chunk2, + "STFT graph cache T_src must change between chunks of different " + "lengths (chunk1 T_src=%d, chunk2 T_src=%d)", + stft_Tsrc_chunk1, stft_Tsrc_chunk2); CHECK(th::encoder_graph_cache_built(), "encoder graph cache must remain built after shape change " "(rebuilt for new T)"); @@ -572,6 +617,12 @@ void test_streaming_shape_invalidation(const std::string & gguf, "HiFT graph cache must remain built after shape change"); CHECK(th::f0_graph_cache_built(), "F0 graph cache must remain built after shape change"); + CHECK(th::stft_graph_cache_built(), + "STFT graph cache must remain built after shape change " + "(rebuilt for new T_src)"); + CHECK(th::stft_kernel_cache_size() == 1, + "STFT analysis kernel cache must stay at exactly 1 entry across " + "chunks (n_fft is constant); got %zu", th::stft_kernel_cache_size()); fprintf(stderr, " chunk #1: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n" " chunk #2: enc_T=%d hift_T_mel=%d f0_T_mel=%d wav_len=%zu\n",