From 7a278aa7983032fffacbd1a76e79f09c851b6bf1 Mon Sep 17 00:00:00 2001 From: ogad-tether Date: Fri, 15 May 2026 12:08:24 +0100 Subject: [PATCH 1/6] tts-cpp: supertonic Engine streaming via multilingual chunker + callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the chatterbox StreamCallback API: a second synthesize() overload takes an on_chunk callback that receives PCM chunk-by-chunk while the returned SynthesisResult still accumulates the full audio (callback is an addition, not a replacement). Supertonic's vector estimator is non-autoregressive (5-step CFM denoise over the full duration-predicted latent), so the chatterbox token-level streaming pattern doesn't transfer. Instead this splits text into sentence-aligned chunks and runs the full pipeline per chunk: - New src/supertonic_chunker.{h,cpp}: Unicode-aware splitter. Sentence-end gets a wide implicit search window (target/2..3*target) because sentence prosody dominates audio quality on this model — chunks cut mid-clause receive an artificial trailing period from preprocess and the model emits muddled / dropped words in response. Clause and whitespace fallbacks use the user-supplied tolerance. - Multilingual punctuation tables: ASCII .?! plus CJK fullwidth, double exclamation/question, Devanagari danda, Urdu full stop for sentences; ASCII / fullwidth / Arabic comma, semicolon, colon and closing brackets for clauses. Whitespace fallback handles CJK / Thai / Lao / Khmer where punctuation may be absent. - Engine streaming path runs the full pipeline per chunk with opts.seed (no per-chunk perturbation; different chunks have different latent_len so noise tensors differ even with the same seed, and an earlier per-chunk seed bump occasionally landed chunks on nearby seeds where the model produces phantom-phoneme tail artifacts). - 10 ms raised-cosine anti-click fade on inter-chunk seams only. 
First chunk start and last chunk end stay untouched so streamed output is acoustically equivalent to batch at the endpoints. - CLI gains --stream-chunk-tokens / --stream-first-chunk-tokens / --stream-chunk-tolerance-pct flags. --out - streams raw s16le PCM on stdout for incremental playback (pipe into ffplay / sox -d). SUPERTONIC_LOG_CHUNKS=1 logs chunker boundaries; SUPERTONIC_DUMP_CHUNK_WAVS_PREFIX=path- dumps per-chunk WAVs for debugging. Validated end-to-end at ~35x realtime on M2 Metal: streamed output is acoustically equivalent to batch on the same seed; first audio drops in ~1 s for an 18 s utterance instead of waiting the full ~4-5 s for batch synth to complete. Co-Authored-By: Claude Opus 4.7 (1M context) --- tts-cpp/CMakeLists.txt | 1 + tts-cpp/include/tts-cpp/supertonic/engine.h | 63 +++++ tts-cpp/src/supertonic_chunker.cpp | 269 ++++++++++++++++++++ tts-cpp/src/supertonic_chunker.h | 43 ++++ tts-cpp/src/supertonic_cli.cpp | 120 ++++++++- tts-cpp/src/supertonic_engine.cpp | 133 +++++++++- 6 files changed, 617 insertions(+), 12 deletions(-) create mode 100644 tts-cpp/src/supertonic_chunker.cpp create mode 100644 tts-cpp/src/supertonic_chunker.h diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt index d404842c064..ea9c6e1f4fb 100644 --- a/tts-cpp/CMakeLists.txt +++ b/tts-cpp/CMakeLists.txt @@ -207,6 +207,7 @@ set(TTS_CPP_LIB_SOURCES src/supertonic_text_encoder.cpp src/supertonic_vector_estimator.cpp src/supertonic_engine.cpp + src/supertonic_chunker.cpp src/mtl_tokenizer.cpp ) diff --git a/tts-cpp/include/tts-cpp/supertonic/engine.h b/tts-cpp/include/tts-cpp/supertonic/engine.h index fad8fffd14d..7700d915ea0 100644 --- a/tts-cpp/include/tts-cpp/supertonic/engine.h +++ b/tts-cpp/include/tts-cpp/supertonic/engine.h @@ -45,6 +45,8 @@ #include "tts-cpp/backend.h" #include "tts-cpp/export.h" +#include +#include #include #include #include @@ -116,8 +118,60 @@ struct EngineOptions { // predicted length) and the seeded RNG is bypassed. 
Useful for // byte-exact reproduction of an ONNX/PyTorch reference run. std::string noise_npy_path; + + // ---------------- Streaming synthesis ---------------------------- + // + // When `stream_chunk_tokens > 0` AND a non-empty callback is passed + // to synthesize(), the engine splits `text` into chunks of roughly + // `stream_chunk_tokens` Unicode code points (Supertonic's text-token + // grain — see supertonic_text_to_ids), runs the full pipeline per + // chunk, and invokes the callback with each chunk's PCM as it's + // produced. The returned SynthesisResult.pcm still contains the + // concatenated audio (the callback is an *addition*, not a + // replacement). Streaming is disabled when stream_chunk_tokens == 0 + // OR the callback is empty — both paths fall through to the batch + // path with no per-chunk overhead. + // + // stream_chunk_tokens Target chunk size in text tokens. + // ~50 ≈ 1-3 s English audio; CJK + // languages are denser so a lower + // target (~25-30) tends to feel + // better. 0 disables streaming. + // + // stream_first_chunk_tokens Override for the *first* chunk so + // first audio lands early while later + // chunks stay at the larger target + // for steady-state throughput. + // 0 = same as stream_chunk_tokens. + // + // stream_chunk_tolerance_pct Boundary-snap window for CLAUSE and + // WHITESPACE fallbacks (±N% of target). + // Sentence-end breaks are searched on a + // much wider implicit window (target/2 + // to 3× target) regardless of this + // setting, because sentence prosody + // dominates audio quality: chunks cut + // mid-clause receive an artificial + // terminal period from preprocess and + // the model emits muddled audio in + // response. Default 20. + int stream_chunk_tokens = 0; + int stream_first_chunk_tokens = 0; + int stream_chunk_tolerance_pct = 20; }; +// Per-chunk PCM callback for streaming synthesis. 
Receives a pointer to +// `samples` consecutive float32 mono samples at SynthesisResult::sample_rate +// (typically 44.1 kHz — read from model metadata, not hard-coded). The +// buffer is owned by the engine and must not be retained past the +// callback; copy out if you need the data. +// `chunk_index` 0-based index of the chunk within the current synth. +// `is_last` true on the final chunk (after which synthesize() returns). +// Throwing from this callback aborts synthesis (the exception propagates +// out of synthesize()). +using StreamCallback = std::function; + struct SynthesisResult { std::vector pcm; int sample_rate = 44100; @@ -150,6 +204,15 @@ class TTS_CPP_API Engine { // Not safe to call concurrently on the same Engine instance. SynthesisResult synthesize(const std::string & text); + // Same as above, but when `options().stream_chunk_tokens > 0` and + // `on_chunk` is non-empty, runs the chunked pipeline and invokes + // `on_chunk` with each chunk's PCM in order. The returned + // SynthesisResult.pcm still contains the concatenated audio (the + // callback is an *addition*, not a replacement). Falls through to + // the batch path when either condition is false. + SynthesisResult synthesize(const std::string & text, + const StreamCallback & on_chunk); + // Best-effort cancel of an in-flight synthesize() call on another // thread. Setting the flag is all this does; actual termination // happens at the next cancellation check inside the vector- diff --git a/tts-cpp/src/supertonic_chunker.cpp b/tts-cpp/src/supertonic_chunker.cpp new file mode 100644 index 00000000000..8cfb351e362 --- /dev/null +++ b/tts-cpp/src/supertonic_chunker.cpp @@ -0,0 +1,269 @@ +#include "supertonic_chunker.h" + +#include +#include + +namespace tts_cpp::supertonic::detail { +namespace { + +// Minimal UTF-8 decoder — same shape as the anon-namespace helpers in +// supertonic_preprocess.cpp. Kept local so the chunker has no cross-file +// dependency beyond its own header. 
Replaces malformed sequences with +// U+FFFD and a 1-byte advance (matches preprocess behaviour for parity). +bool utf8_decode(const char * s, size_t len, size_t & pos, uint32_t & cp) { + if (pos >= len) return false; + uint8_t b0 = (uint8_t) s[pos]; + if (b0 < 0x80) { cp = b0; pos += 1; return true; } + int extra = 0; + if ((b0 & 0xE0) == 0xC0) { cp = b0 & 0x1F; extra = 1; } + else if ((b0 & 0xF0) == 0xE0) { cp = b0 & 0x0F; extra = 2; } + else if ((b0 & 0xF8) == 0xF0) { cp = b0 & 0x07; extra = 3; } + else { cp = 0xFFFD; pos += 1; return true; } + if (pos + 1 + extra > len) { cp = 0xFFFD; pos += 1; return true; } + for (int i = 0; i < extra; ++i) { + uint8_t b = (uint8_t) s[pos + 1 + i]; + if ((b & 0xC0) != 0x80) { cp = 0xFFFD; pos += 1; return true; } + cp = (cp << 6) | (b & 0x3F); + } + pos += 1 + extra; + return true; +} + +struct cp_at { + uint32_t cp; // code point + size_t byte_pos; // byte offset of this code point in the source string +}; + +std::vector decode_with_byte_offsets(const std::string & s) { + std::vector out; + out.reserve(s.size()); + size_t pos = 0; + while (pos < s.size()) { + size_t start = pos; + uint32_t cp = 0; + if (!utf8_decode(s.data(), s.size(), pos, cp)) break; + out.push_back({cp, start}); + } + return out; +} + +bool is_space_cp(uint32_t cp) { + return cp == 0x09 || cp == 0x0A || cp == 0x0B || cp == 0x0C || cp == 0x0D || + cp == 0x20 || cp == 0x85 || cp == 0xA0 || cp == 0x1680 || + (cp >= 0x2000 && cp <= 0x200A) || cp == 0x2028 || cp == 0x2029 || + cp == 0x202F || cp == 0x205F || cp == 0x3000; +} + +// Sentence-end punctuation across ASCII, CJK, Devanagari, and the +// extended Unicode punctuation range. Conservative — symbols that +// can be sentence-terminating but ambiguous (e.g. ellipsis "…") are +// intentionally excluded since they often continue a thought. +bool is_sentence_end_cp(uint32_t cp) { + switch (cp) { + case 0x002E: // . + case 0x003F: // ? + case 0x0021: // ! 
+ case 0x3002: // 。 CJK ideographic full stop + case 0xFF1F: // ? fullwidth question mark + case 0xFF01: // ! fullwidth exclamation mark + case 0x203C: // ‼ double exclamation + case 0x2047: // ⁇ double question + case 0x2048: // ⁈ question exclamation + case 0x2049: // ⁉ exclamation question + case 0x0964: // । Devanagari danda + case 0x0965: // ॥ Devanagari double danda + case 0x06D4: // ۔ Urdu full stop + return true; + default: + return false; + } +} + +// Clause-end punctuation (lower priority than sentence-end). Includes +// CJK and Arabic equivalents. Closing brackets count — a clause that +// just ended a parenthetical is a reasonable break point too. +bool is_clause_end_cp(uint32_t cp) { + switch (cp) { + case 0x002C: // , + case 0x003B: // ; + case 0x003A: // : + case 0xFF0C: // , fullwidth comma + case 0x3001: // 、 ideographic comma + case 0xFF1B: // ; fullwidth semicolon + case 0xFF1A: // : fullwidth colon + case 0x060C: // ، Arabic comma + case 0x061B: // ؛ Arabic semicolon + case 0x0029: // ) + case 0x005D: // ] + case 0x007D: // } + case 0xFF09: // ) + return true; + default: + return false; + } +} + +// Scan for the first index in (lo, hi] where pred(cps[idx-1].cp) is true. +// Right-first sweep from `target`, then leftward — chunks that end ON +// the punctuation/space read more naturally than chunks that end one +// character before it. Returns SIZE_MAX if no match. +size_t scan_for(const std::vector & cps, + size_t target, + size_t lo, + size_t hi, + bool (*pred)(uint32_t)) +{ + if (hi <= lo + 1) return SIZE_MAX; + const size_t t = std::clamp(target, lo + 1, hi); + for (size_t r = t; r <= hi; ++r) { + if (pred(cps[r - 1].cp)) return r; + } + for (size_t r = t; r > lo + 1; --r) { + if (pred(cps[r - 2].cp)) return r - 1; + } + return SIZE_MAX; +} + +// Find the best boundary index for splitting. Two windows: +// +// `sent_lo..sent_hi` — wide window for sentence-end punctuation. 
+// Sentence prosody dominates audio quality on +// this model (chunks that end mid-clause and +// get an artificial trailing period make the +// model emit muddled/dropped words), so we +// search a much larger range for sentences +// than for any other boundary type. +// +// `norm_lo..norm_hi` — tight user-controlled window for clause and +// whitespace fallbacks when no sentence is in +// reach. Hard-cut at `norm_hi` as last resort. +// +// Returns the index AFTER the break (chunk = cps[start..break)). +size_t pick_break(const std::vector & cps, + size_t target, + size_t sent_lo, size_t sent_hi, + size_t norm_lo, size_t norm_hi) +{ + if (size_t b = scan_for(cps, target, sent_lo, sent_hi, is_sentence_end_cp); + b != SIZE_MAX) return b; + if (size_t b = scan_for(cps, target, norm_lo, norm_hi, is_clause_end_cp); + b != SIZE_MAX) return b; + if (size_t b = scan_for(cps, target, norm_lo, norm_hi, is_space_cp); + b != SIZE_MAX) return b; + return norm_hi; // hard cut +} + +std::string slice_to_string(const std::vector & cps, + size_t start_idx, + size_t end_idx, + const std::string & source) { + if (start_idx >= end_idx) return {}; + const size_t byte_start = cps[start_idx].byte_pos; + const size_t byte_end = (end_idx < cps.size()) + ? cps[end_idx].byte_pos + : source.size(); + std::string out = source.substr(byte_start, byte_end - byte_start); + + // Trim leading + trailing whitespace at the code-point level. Done + // by scanning the slice — cheaper than re-decoding given the slice + // is typically tens of bytes. 
+ size_t l = 0; + while (l < out.size() && (out[l] == ' ' || out[l] == '\t' || + out[l] == '\n' || out[l] == '\r')) ++l; + size_t r = out.size(); + while (r > l && (out[r - 1] == ' ' || out[r - 1] == '\t' || + out[r - 1] == '\n' || out[r - 1] == '\r')) --r; + return out.substr(l, r - l); +} + +} // namespace + +std::vector split_for_streaming( + const std::string & text, + int target_tokens, + int first_chunk_tokens, + int tolerance_pct) +{ + std::vector out; + if (target_tokens <= 0 || text.empty()) { + // Caller is responsible for falling back to the batch path when + // target_tokens <= 0; returning a single-element vector here so + // the chunker remains usable as a defensive no-op splitter. + if (!text.empty()) out.push_back(text); + return out; + } + + const std::vector cps = decode_with_byte_offsets(text); + if (cps.empty()) return out; + + const int tol_pct = std::clamp(tolerance_pct, 0, 100); + + const size_t total = cps.size(); + size_t start = 0; + int chunk_idx = 0; + + while (start < total) { + const int target_this = (chunk_idx == 0 && first_chunk_tokens > 0) + ? first_chunk_tokens + : target_tokens; + + // Tight window — for clause/whitespace boundaries and the + // hard-cut fallback. Driven by the user-supplied tolerance. + const int norm_lo_rel = std::max(1, target_this - target_this * tol_pct / 100); + const int norm_hi_rel = target_this + target_this * tol_pct / 100; + + // Wide window — sentence-end search. Reaches back to half the + // target (so we don't pick a sentence break that makes the chunk + // ridiculously short) and forward to 3× the target (so a fairly + // distant period is still preferred over a mid-clause whitespace + // cut). 3× is empirical: covers typical English sentence-length + // variance without letting one runaway sentence destroy + // streaming latency. 
+ const int sent_lo_rel = std::max(1, target_this / 2); + const int sent_hi_rel = target_this * 3; + + const size_t norm_lo = std::min(start + (size_t) norm_lo_rel, total); + const size_t norm_hi = std::min(start + (size_t) norm_hi_rel, total); + const size_t sent_lo = std::min(start + (size_t) sent_lo_rel, total); + const size_t sent_hi = std::min(start + (size_t) sent_hi_rel, total); + + size_t brk; + if (norm_hi <= start + 1 || total - start <= (size_t) norm_hi_rel) { + // Entire remainder fits inside this chunk's upper tolerance — + // take it all. Avoids leaving a tiny sub-tolerance tail. + brk = total; + } else { + const size_t target_abs = std::min(start + (size_t) target_this, total); + brk = pick_break(cps, target_abs, + sent_lo, sent_hi, + norm_lo, norm_hi); + } + + std::string chunk = slice_to_string(cps, start, brk, text); + if (!chunk.empty()) out.push_back(std::move(chunk)); + start = brk; + ++chunk_idx; + } + + // Tail-merge heuristic: if the last chunk has fewer than max(8, + // target/3) tokens AND we have at least two chunks, fold it into + // the previous chunk. Avoids paying full pipeline cost for a + // handful of trailing tokens. Mirrors chatterbox_engine.cpp:608. + if (out.size() >= 2) { + const std::vector tail_cps = decode_with_byte_offsets(out.back()); + const int min_tail = std::max(8, target_tokens / 3); + if ((int) tail_cps.size() < min_tail) { + std::string merged = out[out.size() - 2]; + // Re-insert a single space between fragments if both sides + // are non-empty after trim, so spoken prosody isn't glued. 
+ if (!merged.empty() && !out.back().empty()) merged.push_back(' '); + merged += out.back(); + out.pop_back(); + out.back() = std::move(merged); + } + } + + return out; +} + +} // namespace tts_cpp::supertonic::detail diff --git a/tts-cpp/src/supertonic_chunker.h b/tts-cpp/src/supertonic_chunker.h new file mode 100644 index 00000000000..2dbdeefb263 --- /dev/null +++ b/tts-cpp/src/supertonic_chunker.h @@ -0,0 +1,43 @@ +#pragma once + +// Multilingual streaming chunker for the Supertonic engine. +// +// Splits an input string into a list of substrings sized for per-chunk +// synthesis, preferring natural boundaries when available: +// +// 1. sentence-end punctuation (. ? ! 。 ? ! ‼ ⁇ ⁈ ⁉ । ॥) +// 2. clause-end punctuation (, ; : , 、 ; : ؛ ، and closing brackets) +// 3. whitespace (handles CJK/Thai/Lao/Khmer where 1+2 are absent) +// 4. hard cut (last-resort cap at the upper tolerance bound) +// +// Token grain matches `supertonic_text_to_ids` (one ID per Unicode code +// point after normalization), so the input character count IS the token +// count that the engine will see. No model tokenizer call is required +// for sizing. + +#include +#include + +namespace tts_cpp::supertonic::detail { + +// Split `text` into chunks sized roughly `target_tokens` code points +// each, snapping to the best available boundary within ±`tolerance_pct` +// of the target. When `first_chunk_tokens > 0`, the first chunk uses +// that smaller target instead (latency knob — first audio lands earlier +// while subsequent chunks stay large to keep throughput up). +// +// Leading/trailing whitespace on each chunk is trimmed. Adjacent chunks +// concatenated back together (modulo trimmed whitespace) reproduce the +// input. Empty / whitespace-only chunks are not emitted. 
+// +// Tail-merge: if the last chunk would carry fewer than ~max(8, target/3) +// tokens, it is merged into the previous chunk to avoid paying full +// pipeline cost for a handful of trailing tokens (mirrors Chatterbox's +// chatterbox_engine.cpp:608 heuristic). +std::vector split_for_streaming( + const std::string & text, + int target_tokens, + int first_chunk_tokens = 0, + int tolerance_pct = 20); + +} // namespace tts_cpp::supertonic::detail diff --git a/tts-cpp/src/supertonic_cli.cpp b/tts-cpp/src/supertonic_cli.cpp index 0705fa696b5..6a12e86b5e5 100644 --- a/tts-cpp/src/supertonic_cli.cpp +++ b/tts-cpp/src/supertonic_cli.cpp @@ -4,8 +4,10 @@ #include #include #include +#include #include #include +#include namespace { @@ -21,8 +23,18 @@ void usage(const char * argv0) { " audit-identified hot matmul / pwconv weights;\n" " defaults to auto: on for GPU, off for CPU)\n" " [--precision f32|f16|q8_0] (default: f32)\n" - " [--noise-npy /path/to/noise.npy]\n", - argv0); + " [--noise-npy /path/to/noise.npy]\n" + " [--stream-chunk-tokens N] (0 = batch; >0 enables\n" + " streaming with target ~N text-token chunks)\n" + " [--stream-first-chunk-tokens N] (override 1st-chunk target;\n" + " 0 = same as --stream-chunk-tokens)\n" + " [--stream-chunk-tolerance-pct N] (boundary-snap window; default 20)\n" + "\n" + " When --out is '-', the CLI emits raw s16le PCM to stdout as\n" + " each chunk completes. Pipe into a player, e.g.:\n" + " %s --model ... --text '...' --out - --stream-chunk-tokens 50 \\\n" + " | aplay -f S16_LE -r 44100 -c 1\n", + argv0, argv0); } tts_cpp::supertonic::Precision parse_precision(const std::string & s) { @@ -32,6 +44,19 @@ tts_cpp::supertonic::Precision parse_precision(const std::string & s) { throw std::runtime_error("unknown --precision value: " + s + " (expected f32|f16|q8_0)"); } +// Emit `pcm` as raw signed-16-bit little-endian samples on stdout. 
Used +// by the streaming path so a consumer like `aplay -f S16_LE -r 44100 -c 1` +// can begin playback as soon as the first chunk arrives. Returns once +// the buffer has been written and flushed. +void stream_emit_pcm_stdout(const float * pcm, std::size_t samples) { + for (std::size_t i = 0; i < samples; ++i) { + float c = std::max(-1.0f, std::min(1.0f, pcm[i])); + int16_t v = (int16_t) std::lrintf(c * 32767.0f); + std::fwrite(&v, 2, 1, stdout); + } + std::fflush(stdout); +} + void write_wav(const std::string & path, const std::vector & wav, int sr) { FILE * f = std::fopen(path.c_str(), "wb"); if (!f) throw std::runtime_error("cannot open output wav: " + path); @@ -82,6 +107,15 @@ int main(int argc, char ** argv) { else if (arg == "--f16-weights") opts.f16_weights = std::stoi(next("--f16-weights")); else if (arg == "--precision") opts.precision = parse_precision(next("--precision")); else if (arg == "--noise-npy") opts.noise_npy_path = next("--noise-npy"); + else if (arg == "--stream-chunk-tokens") { + opts.stream_chunk_tokens = std::stoi(next("--stream-chunk-tokens")); + } + else if (arg == "--stream-first-chunk-tokens") { + opts.stream_first_chunk_tokens = std::stoi(next("--stream-first-chunk-tokens")); + } + else if (arg == "--stream-chunk-tolerance-pct") { + opts.stream_chunk_tolerance_pct = std::stoi(next("--stream-chunk-tolerance-pct")); + } else if (arg == "-h" || arg == "--help") { usage(argv[0]); return 0; } else { fprintf(stderr, "unknown arg: %s\n", arg.c_str()); usage(argv[0]); return 2; } } @@ -90,10 +124,84 @@ int main(int argc, char ** argv) { return 2; } try { - auto result = tts_cpp::supertonic::synthesize(opts, text); - write_wav(out, result.pcm, result.sample_rate); - fprintf(stderr, "wrote %s (%.2fs @ %d Hz, %zu samples)\n", - out.c_str(), result.duration_s, result.sample_rate, result.pcm.size()); + const bool streaming = opts.stream_chunk_tokens > 0; + const bool stdout_pcm = (out == "-"); + + if (!streaming) { + if (stdout_pcm) { + 
fprintf(stderr, + "error: --out - requires --stream-chunk-tokens > 0 " + "(stdout streaming is the streaming-mode output)\n"); + return 2; + } + auto result = tts_cpp::supertonic::synthesize(opts, text); + write_wav(out, result.pcm, result.sample_rate); + fprintf(stderr, "wrote %s (%.2fs @ %d Hz, %zu samples)\n", + out.c_str(), result.duration_s, result.sample_rate, result.pcm.size()); + return 0; + } + + // Streaming path. Construct a persistent Engine so per-chunk + // synth doesn't pay GGUF load each iteration. + tts_cpp::supertonic::Engine engine(opts); + if (stdout_pcm) { + fprintf(stderr, + "streaming: emitting raw s16le PCM on stdout " + "(chunk target: %d text tokens; first chunk: %d; backend: %s)\n", + opts.stream_chunk_tokens, + opts.stream_first_chunk_tokens > 0 + ? opts.stream_first_chunk_tokens + : opts.stream_chunk_tokens, + engine.backend_name().c_str()); + } + + // Optional per-chunk WAV dump for debugging. When the env var + // SUPERTONIC_DUMP_CHUNK_WAVS_PREFIX is set, the callback writes + // each chunk's PCM to ".wav" so you can play chunks + // individually and see which one contains a glitch. + const char * dump_prefix = std::getenv("SUPERTONIC_DUMP_CHUNK_WAVS_PREFIX"); + + std::size_t total_samples = 0; + int n_chunks = 0; + auto on_chunk = [&](const float * pcm, std::size_t samples, + int chunk_index, bool is_last) { + if (stdout_pcm) { + stream_emit_pcm_stdout(pcm, samples); + } + if (dump_prefix) { + std::string path = std::string(dump_prefix) + + std::to_string(chunk_index) + ".wav"; + std::vector tmp(pcm, pcm + samples); + // 44.1 kHz is the Supertonic model default; the real SR + // comes back on the final SynthesisResult but isn't + // visible here. Hard-coding here is fine for a debug + // dump — if a future model ships at a different SR this + // will be wrong, but the callback signature doesn't + // surface it. 
+ write_wav(path, tmp, 44100); + } + total_samples += samples; + ++n_chunks; + fprintf(stderr, + "chunk %d%s: %zu samples%s%s\n", + chunk_index, is_last ? " (last)" : "", + samples, + stdout_pcm ? " -> stdout" : "", + dump_prefix ? " (+ dumped)" : ""); + }; + + auto result = engine.synthesize(text, on_chunk); + + if (!stdout_pcm) { + // File mode: write the concatenated PCM as a WAV. + write_wav(out, result.pcm, result.sample_rate); + fprintf(stderr, "wrote %s (%.2fs @ %d Hz, %zu samples across %d chunks)\n", + out.c_str(), result.duration_s, result.sample_rate, + result.pcm.size(), n_chunks); + } else { + fprintf(stderr, "streamed %zu samples across %d chunks (%.2fs)\n", + total_samples, n_chunks, result.duration_s); + } return 0; } catch (const std::exception & e) { fprintf(stderr, "error: %s\n", e.what()); diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index b4be7f27ea0..e0fc373b6ac 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -1,11 +1,14 @@ #define TTS_CPP_BUILD #include "tts-cpp/supertonic/engine.h" +#include "supertonic_chunker.h" #include "supertonic_internal.h" #include "npy.h" #include #include +#include +#include #include #include #include @@ -171,11 +174,12 @@ struct Engine::Impl { Impl(const Impl &) = delete; Impl & operator=(const Impl &) = delete; - SynthesisResult synthesize(const std::string & text) { - if (text.empty()) { - throw std::runtime_error("Supertonic Engine: text is empty"); - } - + // Single-chunk synthesis worker. Runs the full Supertonic pipeline + // (preprocess → duration → noise → text encoder → vector estimator + // CFM loop → vocoder) on `text`, using `seed` for the noise RNG so + // streaming callers can perturb per-chunk seeds without colliding + // on identical starting noise. Throws on cancel or any stage error. + SynthesisResult run_single_chunk(const std::string & text, int seed) { const std::string voice = opts.voice.empty() ? 
model.hparams.default_voice : opts.voice; @@ -229,7 +233,7 @@ struct Engine::Impl { latent.resize(noise.n_elements()); std::memcpy(latent.data(), npy_as_f32(noise), latent.size() * sizeof(float)); } else { - numpy_random_state rng((uint32_t) opts.seed); + numpy_random_state rng((uint32_t) seed); latent.assign((size_t) model.hparams.latent_channels * latent_len, 0.0f); for (float & v : latent) v = rng.standard_normal(); } @@ -282,6 +286,112 @@ struct Engine::Impl { return result; } + SynthesisResult synthesize(const std::string & text) { + if (text.empty()) { + throw std::runtime_error("Supertonic Engine: text is empty"); + } + return run_single_chunk(text, opts.seed); + } + + // Streaming path: chunk text via the multilingual splitter, run the + // full per-chunk pipeline, apply an anti-click raised-cosine fade + // across inter-chunk seams, invoke `on_chunk` synchronously, and + // accumulate the full PCM in the returned result (callback is an + // *addition*, not a replacement — matches Chatterbox semantics). + SynthesisResult synthesize_streaming(const std::string & text, + const StreamCallback & on_chunk) { + if (text.empty()) { + throw std::runtime_error("Supertonic Engine: text is empty"); + } + + std::vector chunks = detail::split_for_streaming( + text, + opts.stream_chunk_tokens, + opts.stream_first_chunk_tokens, + opts.stream_chunk_tolerance_pct); + + if (chunks.empty()) { + throw std::runtime_error("Supertonic Engine: chunker produced no chunks"); + } + + // Optional chunk-boundary trace for debugging the multilingual + // splitter. Off by default; opt-in via env var so production + // synthesis isn't slowed by stderr writes. 
+ if (const char * env = std::getenv("SUPERTONIC_LOG_CHUNKS"); env && env[0] == '1') { + for (size_t i = 0; i < chunks.size(); ++i) { + std::fprintf(stderr, "chunk[%zu] (%zu bytes): %s\n", + i, chunks[i].size(), chunks[i].c_str()); + } + } + + SynthesisResult full; + full.duration_s = 0.0f; + + const int n_chunks = (int) chunks.size(); + for (int k = 0; k < n_chunks; ++k) { + if (cancel_flag.load(std::memory_order_acquire)) { + throw std::runtime_error( + "Supertonic Engine: cancelled during streaming chunk " + + std::to_string(k)); + } + + // Use opts.seed for every chunk. Each chunk has a different + // predicted latent_len (driven by its own text and duration + // model), so the RNG produces different-length noise tensors + // for each chunk even with the same seed — there's no risk + // of identical starting noise across chunks. An earlier + // version perturbed the seed per chunk (opts.seed + k) as + // a defensive measure, but that landed some chunks on + // nearby seeds where the model produces phantom phoneme + // artifacts ("park.K" tail). Keeping the user's chosen + // seed across chunks gives consistent, controllable output. + SynthesisResult chunk_res = run_single_chunk(chunks[k], opts.seed); + + // Anti-click raised-cosine fade across inter-chunk seams. + // Without HiFT cache continuity (Supertonic runs each chunk + // as a fresh independent pipeline), plain concatenation can + // produce a faint click at the boundary. ~10 ms is enough + // to hide the click without audibly attenuating speech. + // Applied to the start of every non-first chunk and the end + // of every non-last chunk. The very-first chunk start and + // very-last chunk end are left untouched so the streamed + // output is acoustically equivalent to the batch output at + // those endpoints. 
+ const int sr = chunk_res.sample_rate; + const size_t fade_n = std::min( + (size_t)(sr * 10 / 1000), + chunk_res.pcm.size() / 2); + const bool is_first = (k == 0); + const bool is_last = (k == n_chunks - 1); + + if (!is_first && fade_n > 0) { + for (size_t i = 0; i < fade_n; ++i) { + const float t = (float) i / (float) fade_n; + const float w = 0.5f * (1.0f - std::cos((float) M_PI * t)); + chunk_res.pcm[i] *= w; + } + } + if (!is_last && fade_n > 0) { + const size_t n = chunk_res.pcm.size(); + for (size_t i = 0; i < fade_n; ++i) { + const float t = (float) i / (float) fade_n; + const float w = 0.5f * (1.0f - std::cos((float) M_PI * t)); + chunk_res.pcm[n - 1 - i] *= w; + } + } + + // Fire callback before accumulating, so the consumer sees + // the same buffer it would receive in pure-streaming mode. + on_chunk(chunk_res.pcm.data(), chunk_res.pcm.size(), k, is_last); + + full.pcm.insert(full.pcm.end(), chunk_res.pcm.begin(), chunk_res.pcm.end()); + full.duration_s += chunk_res.duration_s; + full.sample_rate = chunk_res.sample_rate; + } + + return full; + } + std::string backend_name() const { if (!model.backend) return "(unknown)"; if (const char * name = ggml_backend_name(model.backend)) { @@ -303,6 +413,17 @@ SynthesisResult Engine::synthesize(const std::string & text) { return pimpl_->synthesize(text); } +SynthesisResult Engine::synthesize(const std::string & text, + const StreamCallback & on_chunk) { + // Fall through to the batch path when streaming is disabled or no + // callback is wired up. Both conditions match the Chatterbox + // semantics — callers can pass a no-op callback safely. 
+ if (!on_chunk || pimpl_->opts.stream_chunk_tokens <= 0) { + return pimpl_->synthesize(text); + } + return pimpl_->synthesize_streaming(text, on_chunk); +} + void Engine::cancel() { pimpl_->cancel_flag.store(true, std::memory_order_release); } From ea08f1a818342b7171ac66676af76cded6577fdb Mon Sep 17 00:00:00 2001 From: ogad-tether Date: Fri, 15 May 2026 14:02:15 +0100 Subject: [PATCH 2/6] =?UTF-8?q?tts-cpp:=20supertonic=20streaming=20v2=20?= =?UTF-8?q?=E2=80=94=20min-chunk=20guard=20+=20continuation=20flag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two empirically-driven additions on top of the sentence-aligned chunker: 1. is_continuation flag through supertonic_preprocess_text + supertonic_text_to_ids. When the engine produces a mid-clause / mid-word chunk during streaming, the preprocess skips its auto-appended terminal period. Without the flag the model spoke stub chunks as complete sentences with falling intonation and trailing-phoneme artifacts (the original "park.K" tail bug). The engine detects per-chunk whether the chunk ends on a natural sentence terminator (ASCII .?! plus CJK / Devanagari / Urdu equivalents) and passes through the flag accordingly. 2. stream_min_chunk_tokens (default 30) on EngineOptions. Below ~30 tokens the model emits dropped / muddled phonemes on stub input regardless of the continuation flag (verified on multiple seeds and texts — short text is a model-level failure mode, not a preprocess one). The chunker treats min_chunk_tokens as a hard floor: effective target = max(target, min), the sentence/clause/ whitespace search lower bound is clamped to start + min, and any trailing chunk below the floor is merged into its predecessor. The min floor is the practical ceiling on what Option A streaming can achieve. 
True seam-free streaming inside one utterance would require model retraining (causal attention, per-token duration, mel-frame cache continuity — the bits chatterbox has by design but supertonic was not trained for). Documenting that as the trade-off honestly rather than papering over it. Behavior: - Multi-sentence input → sentence-aligned chunks (the v1 behavior). Acoustically equivalent to batch on the same seed. - Long single-sentence input → multi-chunk output at the min floor, each chunk passed to the model without an artificial terminal period. Inter-chunk pauses and rate shifts are inherent to per-chunk synthesis on a non-streaming-trained model. Co-Authored-By: Claude Opus 4.7 (1M context) --- tts-cpp/include/tts-cpp/supertonic/engine.h | 34 ++++++--- tts-cpp/src/supertonic_chunker.cpp | 78 +++++++++++++-------- tts-cpp/src/supertonic_chunker.h | 14 ++-- tts-cpp/src/supertonic_cli.cpp | 7 ++ tts-cpp/src/supertonic_engine.cpp | 69 ++++++++++++++++-- tts-cpp/src/supertonic_internal.h | 6 +- tts-cpp/src/supertonic_preprocess.cpp | 17 +++-- 7 files changed, 168 insertions(+), 57 deletions(-) diff --git a/tts-cpp/include/tts-cpp/supertonic/engine.h b/tts-cpp/include/tts-cpp/supertonic/engine.h index 7700d915ea0..2cafacb9351 100644 --- a/tts-cpp/include/tts-cpp/supertonic/engine.h +++ b/tts-cpp/include/tts-cpp/supertonic/engine.h @@ -146,18 +146,34 @@ struct EngineOptions { // // stream_chunk_tolerance_pct Boundary-snap window for CLAUSE and // WHITESPACE fallbacks (±N% of target). - // Sentence-end breaks are searched on a - // much wider implicit window (target/2 - // to 3× target) regardless of this - // setting, because sentence prosody - // dominates audio quality: chunks cut - // mid-clause receive an artificial - // terminal period from preprocess and - // the model emits muddled audio in - // response. Default 20. 
+ // Sentence-end is searched on a much + // wider implicit window (target/2 to + // 3× target) because sentence-aligned + // chunks let the per-chunk duration + // predictor and attention phrase + // naturally; mid-clause cuts work + // (continuation flag in preprocess + // avoids the artificial trailing + // period that would otherwise make + // the model speak the stub as a + // complete sentence) but produce + // audible pauses + rate shifts at + // seams since the model is not + // streaming-trained. Default 20. + // + // stream_min_chunk_tokens Hard floor on every chunk's size. + // Effective targets are + // max(target, min) — below the floor + // the model glitches on stub input + // (dropped / muddled phonemes, + // verified empirically). Trailing + // chunks shorter than the floor are + // merged into the previous chunk. + // Default 30. int stream_chunk_tokens = 0; int stream_first_chunk_tokens = 0; int stream_chunk_tolerance_pct = 20; + int stream_min_chunk_tokens = 30; }; // Per-chunk PCM callback for streaming synthesis. Receives a pointer to diff --git a/tts-cpp/src/supertonic_chunker.cpp b/tts-cpp/src/supertonic_chunker.cpp index 8cfb351e362..23cfa10efaa 100644 --- a/tts-cpp/src/supertonic_chunker.cpp +++ b/tts-cpp/src/supertonic_chunker.cpp @@ -128,15 +128,22 @@ size_t scan_for(const std::vector & cps, // // `sent_lo..sent_hi` — wide window for sentence-end punctuation. // Sentence prosody dominates audio quality on -// this model (chunks that end mid-clause and -// get an artificial trailing period make the -// model emit muddled/dropped words), so we -// search a much larger range for sentences -// than for any other boundary type. +// this model (the duration predictor and +// attention run per-chunk, so chunk-aligned +// sentence breaks let the model phrase +// naturally), so sentence search reaches +// much further than clause/whitespace. 
// // `norm_lo..norm_hi` — tight user-controlled window for clause and // whitespace fallbacks when no sentence is in -// reach. Hard-cut at `norm_hi` as last resort. +// reach. Hard-cut at `norm_hi` as last +// resort. Continuation flag in the engine +// makes the resulting mid-clause chunk audio +// tolerable; the bigger seam artifacts (small +// pauses, rate shifts) are inherent to +// per-chunk synthesis on a non-streaming- +// trained model and can't be removed at this +// layer. // // Returns the index AFTER the break (chunk = cps[start..break)). size_t pick_break(const std::vector & cps, @@ -182,7 +189,8 @@ std::vector split_for_streaming( const std::string & text, int target_tokens, int first_chunk_tokens, - int tolerance_pct) + int tolerance_pct, + int min_chunk_tokens) { std::vector out; if (target_tokens <= 0 || text.empty()) { @@ -196,31 +204,45 @@ std::vector split_for_streaming( const std::vector cps = decode_with_byte_offsets(text); if (cps.empty()) return out; - const int tol_pct = std::clamp(tolerance_pct, 0, 100); + const int tol_pct = std::clamp(tolerance_pct, 0, 100); + const int min_chunk = std::max(1, min_chunk_tokens); + // Effective targets clamp up to min_chunk so the chunker never aims + // for a sub-minimum chunk (the model glitches on stub input below + // ~30 tokens — verified empirically on multiple seeds and texts). + const int target_eff = std::max(target_tokens, min_chunk); + const int first_eff = first_chunk_tokens > 0 + ? std::max(first_chunk_tokens, min_chunk) + : 0; const size_t total = cps.size(); size_t start = 0; int chunk_idx = 0; while (start < total) { - const int target_this = (chunk_idx == 0 && first_chunk_tokens > 0) - ? first_chunk_tokens - : target_tokens; + const int target_this = (chunk_idx == 0 && first_eff > 0) + ? first_eff + : target_eff; // Tight window — for clause/whitespace boundaries and the // hard-cut fallback. Driven by the user-supplied tolerance. 
- const int norm_lo_rel = std::max(1, target_this - target_this * tol_pct / 100); - const int norm_hi_rel = target_this + target_this * tol_pct / 100; + // Lower bound is bumped to start + min_chunk so a break can't + // produce a sub-minimum chunk on this iteration. + int norm_lo_rel = std::max(1, target_this - target_this * tol_pct / 100); + int norm_hi_rel = target_this + target_this * tol_pct / 100; + norm_lo_rel = std::max(norm_lo_rel, min_chunk); + norm_hi_rel = std::max(norm_hi_rel, norm_lo_rel); // Wide window — sentence-end search. Reaches back to half the - // target (so we don't pick a sentence break that makes the chunk - // ridiculously short) and forward to 3× the target (so a fairly - // distant period is still preferred over a mid-clause whitespace - // cut). 3× is empirical: covers typical English sentence-length - // variance without letting one runaway sentence destroy - // streaming latency. - const int sent_lo_rel = std::max(1, target_this / 2); - const int sent_hi_rel = target_this * 3; + // effective target (so a sentence break that yields a too-small + // chunk is rejected by the min_chunk floor) and forward to 3× + // the target (so a fairly distant period is still preferred + // over a mid-clause whitespace cut). 3× is empirical: covers + // typical English sentence-length variance without letting one + // runaway sentence destroy streaming latency. + int sent_lo_rel = std::max(1, target_this / 2); + int sent_hi_rel = target_this * 3; + sent_lo_rel = std::max(sent_lo_rel, min_chunk); + sent_hi_rel = std::max(sent_hi_rel, sent_lo_rel); const size_t norm_lo = std::min(start + (size_t) norm_lo_rel, total); const size_t norm_hi = std::min(start + (size_t) norm_hi_rel, total); @@ -245,17 +267,15 @@ std::vector split_for_streaming( ++chunk_idx; } - // Tail-merge heuristic: if the last chunk has fewer than max(8, - // target/3) tokens AND we have at least two chunks, fold it into - // the previous chunk. 
Avoids paying full pipeline cost for a - // handful of trailing tokens. Mirrors chatterbox_engine.cpp:608. + // Tail-merge heuristic: if the last chunk has fewer than min_chunk + // tokens AND we have at least two chunks, fold it into the previous + // chunk. Avoids paying full pipeline cost for a handful of + // trailing tokens AND avoids handing the model a sub-minimum tail + // chunk where it glitches. Mirrors chatterbox_engine.cpp:608. if (out.size() >= 2) { const std::vector tail_cps = decode_with_byte_offsets(out.back()); - const int min_tail = std::max(8, target_tokens / 3); - if ((int) tail_cps.size() < min_tail) { + if ((int) tail_cps.size() < min_chunk) { std::string merged = out[out.size() - 2]; - // Re-insert a single space between fragments if both sides - // are non-empty after trim, so spoken prosody isn't glued. if (!merged.empty() && !out.back().empty()) merged.push_back(' '); merged += out.back(); out.pop_back(); diff --git a/tts-cpp/src/supertonic_chunker.h b/tts-cpp/src/supertonic_chunker.h index 2dbdeefb263..acce112adc2 100644 --- a/tts-cpp/src/supertonic_chunker.h +++ b/tts-cpp/src/supertonic_chunker.h @@ -26,18 +26,20 @@ namespace tts_cpp::supertonic::detail { // that smaller target instead (latency knob — first audio lands earlier // while subsequent chunks stay large to keep throughput up). // +// `min_chunk_tokens` is a hard floor on every chunk's size: the +// effective target is `max(target_tokens, min_chunk_tokens)` (and +// similarly for first-chunk). The trailing chunk is merged into the +// previous one if it ends up below the floor. Default 30 — empirically +// the model emits dropped/muddled phonemes when fed shorter stubs. +// // Leading/trailing whitespace on each chunk is trimmed. Adjacent chunks // concatenated back together (modulo trimmed whitespace) reproduce the // input. Empty / whitespace-only chunks are not emitted. 
-// -// Tail-merge: if the last chunk would carry fewer than ~max(8, target/3) -// tokens, it is merged into the previous chunk to avoid paying full -// pipeline cost for a handful of trailing tokens (mirrors Chatterbox's -// chatterbox_engine.cpp:608 heuristic). std::vector split_for_streaming( const std::string & text, int target_tokens, int first_chunk_tokens = 0, - int tolerance_pct = 20); + int tolerance_pct = 20, + int min_chunk_tokens = 30); } // namespace tts_cpp::supertonic::detail diff --git a/tts-cpp/src/supertonic_cli.cpp b/tts-cpp/src/supertonic_cli.cpp index 6a12e86b5e5..f4fe3da574f 100644 --- a/tts-cpp/src/supertonic_cli.cpp +++ b/tts-cpp/src/supertonic_cli.cpp @@ -29,6 +29,10 @@ void usage(const char * argv0) { " [--stream-first-chunk-tokens N] (override 1st-chunk target;\n" " 0 = same as --stream-chunk-tokens)\n" " [--stream-chunk-tolerance-pct N] (boundary-snap window; default 20)\n" + " [--stream-min-chunk-tokens N] (hard floor on chunk size;\n" + " default 30 — below this the model glitches\n" + " on stub input; chunks below the floor are\n" + " merged with their neighbor)\n" "\n" " When --out is '-', the CLI emits raw s16le PCM to stdout as\n" " each chunk completes. 
Pipe into a player, e.g.:\n" @@ -116,6 +120,9 @@ int main(int argc, char ** argv) { else if (arg == "--stream-chunk-tolerance-pct") { opts.stream_chunk_tolerance_pct = std::stoi(next("--stream-chunk-tolerance-pct")); } + else if (arg == "--stream-min-chunk-tokens") { + opts.stream_min_chunk_tokens = std::stoi(next("--stream-min-chunk-tokens")); + } else if (arg == "-h" || arg == "--help") { usage(argv[0]); return 0; } else { fprintf(stderr, "unknown arg: %s\n", arg.c_str()); usage(argv[0]); return 2; } } diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index e0fc373b6ac..f44b62d6ca4 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -110,6 +110,44 @@ class numpy_random_state { } }; +// Heuristic: does this chunk end at a natural sentence terminator? +// Used by streaming to decide whether to skip the auto-appended period +// (continuation chunks) or keep it (complete-sentence chunks). +// Comma / clause punctuation are NOT sentence terminators here — +// chunks ending in commas still want is_continuation=true so the model +// hears them as a continuation, not a mini-sentence. +bool chunk_ends_with_sentence_term(const std::string & s) { + // Trim trailing ASCII whitespace. + size_t i = s.size(); + while (i > 0 && (s[i - 1] == ' ' || s[i - 1] == '\t' || + s[i - 1] == '\n' || s[i - 1] == '\r')) --i; + if (i == 0) return false; + if (s[i - 1] == '.' || s[i - 1] == '?' || s[i - 1] == '!') return true; + // Decode the final UTF-8 code point: scan back to the leading byte. 
+ size_t pos = i - 1; + while (pos > 0 && ((uint8_t) s[pos] & 0xC0) == 0x80) --pos; + const size_t bytes = i - pos; + uint32_t cp = 0; + if (bytes == 1) cp = (uint8_t) s[pos]; + else if (bytes == 2) cp = ((s[pos] & 0x1F) << 6) | (s[pos + 1] & 0x3F); + else if (bytes == 3) cp = ((s[pos] & 0x0F) << 12) | + ((s[pos + 1] & 0x3F) << 6) | + (s[pos + 2] & 0x3F); + else if (bytes == 4) cp = ((s[pos] & 0x07) << 18) | + ((s[pos + 1] & 0x3F) << 12) | + ((s[pos + 2] & 0x3F) << 6) | + (s[pos + 3] & 0x3F); + switch (cp) { + case 0x3002: case 0xFF1F: case 0xFF01: // 。 ? ! + case 0x203C: case 0x2047: case 0x2048: case 0x2049: // ‼ ⁇ ⁈ ⁉ + case 0x0964: case 0x0965: // । ॥ + case 0x06D4: // ۔ + return true; + default: + return false; + } +} + } // namespace struct Engine::Impl { @@ -176,10 +214,12 @@ struct Engine::Impl { // Single-chunk synthesis worker. Runs the full Supertonic pipeline // (preprocess → duration → noise → text encoder → vector estimator - // CFM loop → vocoder) on `text`, using `seed` for the noise RNG so - // streaming callers can perturb per-chunk seeds without colliding - // on identical starting noise. Throws on cancel or any stage error. - SynthesisResult run_single_chunk(const std::string & text, int seed) { + // CFM loop → vocoder) on `text` with the given seed. When + // `is_continuation` is true the preprocess skips the auto-appended + // terminal period — used by streaming for mid-utterance chunks so + // the model isn't told "this is a complete sentence" when it isn't. + SynthesisResult run_single_chunk(const std::string & text, int seed, + bool is_continuation = false) { const std::string voice = opts.voice.empty() ? 
model.hparams.default_voice : opts.voice; @@ -200,7 +240,8 @@ struct Engine::Impl { std::vector text_ids_i32; std::string normalized; std::string error; - if (!supertonic_text_to_ids(model, text, opts.language, text_ids_i32, &normalized, &error)) { + if (!supertonic_text_to_ids(model, text, opts.language, text_ids_i32, + &normalized, &error, is_continuation)) { throw std::runtime_error("Supertonic Engine: text preprocessing failed: " + error); } std::vector text_ids(text_ids_i32.begin(), text_ids_i32.end()); @@ -308,7 +349,8 @@ struct Engine::Impl { text, opts.stream_chunk_tokens, opts.stream_first_chunk_tokens, - opts.stream_chunk_tolerance_pct); + opts.stream_chunk_tolerance_pct, + opts.stream_min_chunk_tokens); if (chunks.empty()) { throw std::runtime_error("Supertonic Engine: chunker produced no chunks"); @@ -345,7 +387,20 @@ struct Engine::Impl { // nearby seeds where the model produces phantom phoneme // artifacts ("park.K" tail). Keeping the user's chosen // seed across chunks gives consistent, controllable output. - SynthesisResult chunk_res = run_single_chunk(chunks[k], opts.seed); + // + // is_continuation: chunks that DON'T end on a natural + // sentence terminator (.?! and the CJK / Devanagari / Urdu + // equivalents) need preprocess to skip the auto-appended + // period. Otherwise the model hears the stub as a complete + // sentence with falling intonation + trailing artifacts — + // the failure mode that originally restricted us to + // sentence-only chunking. With the flag, mid-clause / + // mid-word chunk endings flow through with their natural + // (un-punctuated) tail so the model treats them as a + // continuation. + const bool is_continuation = !chunk_ends_with_sentence_term(chunks[k]); + SynthesisResult chunk_res = run_single_chunk(chunks[k], opts.seed, + is_continuation); // Anti-click raised-cosine fade across inter-chunk seams. 
// Without HiFT cache continuity (Supertonic runs each chunk diff --git a/tts-cpp/src/supertonic_internal.h b/tts-cpp/src/supertonic_internal.h index 94e3f04e5af..7417a15e23c 100644 --- a/tts-cpp/src/supertonic_internal.h +++ b/tts-cpp/src/supertonic_internal.h @@ -282,13 +282,15 @@ ggml_tensor * try_pretransposed_weight(const supertonic_model & model, const ggm std::string supertonic_preprocess_text(const std::string & text, const std::string & language, - const std::string & language_wrap_mode); + const std::string & language_wrap_mode, + bool is_continuation = false); bool supertonic_text_to_ids(const supertonic_model & model, const std::string & text, const std::string & language, std::vector & ids, std::string * normalized_text = nullptr, - std::string * error = nullptr); + std::string * error = nullptr, + bool is_continuation = false); bool supertonic_vocoder_forward_cpu(const supertonic_model & model, const float * latent, diff --git a/tts-cpp/src/supertonic_preprocess.cpp b/tts-cpp/src/supertonic_preprocess.cpp index 60ffdbacc73..dfd42f0f10c 100644 --- a/tts-cpp/src/supertonic_preprocess.cpp +++ b/tts-cpp/src/supertonic_preprocess.cpp @@ -171,7 +171,8 @@ bool is_supported_language(const std::string & language) { std::string supertonic_preprocess_text(const std::string & text, const std::string & language, - const std::string & language_wrap_mode) { + const std::string & language_wrap_mode, + bool is_continuation) { if (!is_supported_language(language)) { throw std::runtime_error("invalid Supertonic language: " + language); } @@ -211,7 +212,13 @@ std::string supertonic_preprocess_text(const std::string & text, while (s.find("``") != std::string::npos) replace_all(s, "``", "`"); s = collapse_spaces(s); - if (!has_terminal_punct(s)) s += "."; + // Skip the auto-period for continuation chunks (streaming). 
The + // model was trained on sentence-terminated input; on chunked mid- + // utterance text a fake period makes it speak the stub as a + // complete sentence with falling intonation + trailing artifacts. + // Continuation chunks pass through with their natural ending (word, + // comma, etc.) so the model isn't lied to about sentence end. + if (!is_continuation && !has_terminal_punct(s)) s += "."; if (language_wrap_mode == "none") return s; if (language_wrap_mode == "prefix") return "<" + language + ">" + s + " "; if (language_wrap_mode == "open_close") return "<" + language + ">" + s + "</" + language + ">"; @@ -223,9 +230,11 @@ bool supertonic_text_to_ids(const supertonic_model & model, const std::string & language, std::vector & ids, std::string * normalized_text, - std::string * error) { + std::string * error, + bool is_continuation) { try { - std::string normalized = supertonic_preprocess_text(text, language, model.hparams.language_wrap_mode); + std::string normalized = supertonic_preprocess_text( + text, language, model.hparams.language_wrap_mode, is_continuation); std::vector cps = utf8_to_cps(normalized); ids.clear(); ids.reserve(cps.size()); From 6f387ffe3d30c10403ddf138f3a23719fa5f3970 Mon Sep 17 00:00:00 2001 From: ogad-tether Date: Fri, 15 May 2026 20:02:06 +0100 Subject: [PATCH 3/6] =?UTF-8?q?tts-cpp:=20supertonic=20chunker=20=E2=80=94?= =?UTF-8?q?=20relax=20tail-merge=20to=20chatterbox-style=20threshold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tail-merge was using min_chunk_tokens (30) as its threshold, which on languages denser than English (CJK in particular) merged the last chunk into the previous one even when that last chunk was a complete sentence. Concrete: Korean "공원에서 산책하기 좋은 날이다." is 18 code points — below the 30-cp floor — so the merger folded it into the previous chunk, which contained TWO sentences, producing a single 172-byte chunk for the whole utterance and zero streaming benefit. 
Switch to chatterbox_engine.cpp:608's heuristic: tail_thresh = max(6, target_tokens/3) (16 for target=50). Genuinely tiny stubs (<16 cps) still merge; real sentence chunks stay independent. The min_chunk_tokens floor governs what the chunker proactively *aims for* during iteration, not what it does with whatever's left after the last natural boundary. Verified: Korean 3-sentence text now chunks into 2 (first chunk spans 2 sentences due to first-sentence-below-min-floor, last sentence stays separate at 18 cps). English 3-sentence test stays at 3 sentence-aligned chunks. Co-Authored-By: Claude Opus 4.7 (1M context) --- tts-cpp/src/supertonic_chunker.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tts-cpp/src/supertonic_chunker.cpp b/tts-cpp/src/supertonic_chunker.cpp index 23cfa10efaa..c186cdc7806 100644 --- a/tts-cpp/src/supertonic_chunker.cpp +++ b/tts-cpp/src/supertonic_chunker.cpp @@ -267,14 +267,24 @@ std::vector split_for_streaming( ++chunk_idx; } - // Tail-merge heuristic: if the last chunk has fewer than min_chunk - // tokens AND we have at least two chunks, fold it into the previous - // chunk. Avoids paying full pipeline cost for a handful of - // trailing tokens AND avoids handing the model a sub-minimum tail - // chunk where it glitches. Mirrors chatterbox_engine.cpp:608. + // Tail-merge heuristic: if the last chunk is genuinely tiny, fold + // it into the previous chunk to avoid paying full pipeline cost for + // a handful of trailing tokens. Mirrors chatterbox_engine.cpp:608. + // + // Threshold is intentionally `max(6, target_tokens/3)`, NOT + // `min_chunk_tokens` — using min_chunk here would merge any + // last-chunk shorter than the floor, which can swallow a complete + // final sentence (e.g. Korean "공원에서 산책하기 좋은 날이다." 
+ // is 18 code points, below a min_chunk=30 floor, but is itself a + // valid sentence-aligned chunk that the model handles fine because + // CJK information density per code point is much higher than ASCII). + // The min_chunk floor governs what the chunker proactively *aims + // for*, not what it does with whatever's left after the last natural + // boundary. if (out.size() >= 2) { const std::vector tail_cps = decode_with_byte_offsets(out.back()); - if ((int) tail_cps.size() < min_chunk) { + const int tail_thresh = std::max(6, target_tokens / 3); + if ((int) tail_cps.size() < tail_thresh) { std::string merged = out[out.size() - 2]; if (!merged.empty() && !out.back().empty()) merged.push_back(' '); merged += out.back(); From 9ed0939ef1a5feae82f62f4e5adb50c07c3da33b Mon Sep 17 00:00:00 2001 From: ogad-tether Date: Fri, 15 May 2026 20:05:56 +0100 Subject: [PATCH 4/6] =?UTF-8?q?tts-cpp:=20supertonic=20chunker=20=E2=80=94?= =?UTF-8?q?=20tighten=20sentence=20search=20to=202x=20target?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 3x sentence-search window slurped runaway-sentence tails as one huge "sentence-aligned" chunk: a 245-char single sentence with the final period 109 chars past start was found by the wide window, so chunker took the whole remainder as chunk[3] instead of falling through to whitespace and producing multiple sub-sentence chunks. 2x is still wide enough to catch a long-but-reasonable first sentence in multi-sentence input (covers up to ~90 chars at target=50, ample for typical English / French / Portuguese sentences) but narrow enough that genuinely runaway sentences (>2x target with no internal periods) fall through to whitespace and stream. Empirical: same 245-char English run-on now produces 5 evenly-sized chunks (30, 52, 54, 52, 56) instead of 4 with the tail-blob (30, 52, 54, 109). Multi-sentence test unchanged (still 3 sentence- aligned chunks). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tts-cpp/src/supertonic_chunker.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tts-cpp/src/supertonic_chunker.cpp b/tts-cpp/src/supertonic_chunker.cpp index c186cdc7806..200d3b2611c 100644 --- a/tts-cpp/src/supertonic_chunker.cpp +++ b/tts-cpp/src/supertonic_chunker.cpp @@ -234,13 +234,16 @@ std::vector split_for_streaming( // Wide window — sentence-end search. Reaches back to half the // effective target (so a sentence break that yields a too-small - // chunk is rejected by the min_chunk floor) and forward to 3× - // the target (so a fairly distant period is still preferred - // over a mid-clause whitespace cut). 3× is empirical: covers - // typical English sentence-length variance without letting one - // runaway sentence destroy streaming latency. + // chunk is rejected by the min_chunk floor) and forward to 2× + // the target. 2× is empirical: catches a long-but-reasonable + // first sentence in multi-sentence text (~75-90 chars at + // target=50), but narrow enough that for a genuinely runaway + // sentence (>2× target with no internal periods), the chunker + // falls through to whitespace and produces multiple sub- + // sentence chunks instead of slurping the whole tail as one + // huge "sentence-aligned" chunk. int sent_lo_rel = std::max(1, target_this / 2); - int sent_hi_rel = target_this * 3; + int sent_hi_rel = target_this * 2; sent_lo_rel = std::max(sent_lo_rel, min_chunk); sent_hi_rel = std::max(sent_hi_rel, sent_lo_rel); From 16ca08ea7ec806f86a8854960fac5085755f7d4c Mon Sep 17 00:00:00 2001 From: ogad-tether Date: Mon, 18 May 2026 12:38:12 +0100 Subject: [PATCH 5/6] =?UTF-8?q?tts-cpp:=20supertonic=20streaming=20?= =?UTF-8?q?=E2=80=94=20review=20fixes=20(shared=20term=20table,=20buffered?= =?UTF-8?q?=20stdout)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two review-comment fixes from PR #20: 1. 
De-duplicated the sentence-terminator code-point table between supertonic_chunker.cpp's is_sentence_end_cp() and the engine's chunk_ends_with_sentence_term(). is_sentence_end_cp() is now declared in supertonic_chunker.h and called from the engine's per-chunk continuation detector — the engine still owns the UTF-8 trim/decode logic, but the predicate (and its multilingual table) live in one place. Adding Ethiopic ።, Tibetan ། or any other terminator now needs one edit, not two. 2. stream_emit_pcm_stdout was doing a per-sample fwrite(&v, 2, 1, stdout) loop — ~44k-132k syscall-adjacent calls per chunk. Build the chunk's int16 buffer once and write it in a single fwrite; flush after. No semantic change to the bytes on stdout; just throughput. Verified: multi-sentence chunker still produces 3 sentence-aligned chunks (unchanged); stdout streaming byte count still equals samples * 2 exactly. Co-Authored-By: Claude Opus 4.7 (1M context) --- tts-cpp/src/supertonic_chunker.cpp | 55 ++++++++++++++++-------------- tts-cpp/src/supertonic_chunker.h | 8 +++++ tts-cpp/src/supertonic_cli.cpp | 16 +++++---- tts-cpp/src/supertonic_engine.cpp | 27 ++++++--------- 4 files changed, 59 insertions(+), 47 deletions(-) diff --git a/tts-cpp/src/supertonic_chunker.cpp b/tts-cpp/src/supertonic_chunker.cpp index 200d3b2611c..9d2bc2385cc 100644 --- a/tts-cpp/src/supertonic_chunker.cpp +++ b/tts-cpp/src/supertonic_chunker.cpp @@ -54,31 +54,6 @@ bool is_space_cp(uint32_t cp) { cp == 0x202F || cp == 0x205F || cp == 0x3000; } -// Sentence-end punctuation across ASCII, CJK, Devanagari, and the -// extended Unicode punctuation range. Conservative — symbols that -// can be sentence-terminating but ambiguous (e.g. ellipsis "…") are -// intentionally excluded since they often continue a thought. -bool is_sentence_end_cp(uint32_t cp) { - switch (cp) { - case 0x002E: // . - case 0x003F: // ? - case 0x0021: // ! - case 0x3002: // 。 CJK ideographic full stop - case 0xFF1F: // ? 
fullwidth question mark - case 0xFF01: // ! fullwidth exclamation mark - case 0x203C: // ‼ double exclamation - case 0x2047: // ⁇ double question - case 0x2048: // ⁈ question exclamation - case 0x2049: // ⁉ exclamation question - case 0x0964: // । Devanagari danda - case 0x0965: // ॥ Devanagari double danda - case 0x06D4: // ۔ Urdu full stop - return true; - default: - return false; - } -} - // Clause-end punctuation (lower priority than sentence-end). Includes // CJK and Arabic equivalents. Closing brackets count — a clause that // just ended a parenthetical is a reasonable break point too. @@ -185,6 +160,36 @@ std::string slice_to_string(const std::vector & cps, } // namespace +// Sentence-end punctuation across ASCII, CJK, Devanagari, and the +// extended Unicode punctuation range. Conservative — symbols that +// can be sentence-terminating but ambiguous (e.g. ellipsis "…") are +// intentionally excluded since they often continue a thought. +// +// Public (declared in supertonic_chunker.h) so the engine's per-chunk +// "does this end on a natural sentence terminator?" helper shares the +// same table — additions (e.g. Ethiopic ።, Tibetan ། later) land in +// one place instead of needing to be synced across compilation units. +bool is_sentence_end_cp(uint32_t cp) { + switch (cp) { + case 0x002E: // . + case 0x003F: // ? + case 0x0021: // ! + case 0x3002: // 。 CJK ideographic full stop + case 0xFF1F: // ? fullwidth question mark + case 0xFF01: // ! 
fullwidth exclamation mark + case 0x203C: // ‼ double exclamation + case 0x2047: // ⁇ double question + case 0x2048: // ⁈ question exclamation + case 0x2049: // ⁉ exclamation question + case 0x0964: // । Devanagari danda + case 0x0965: // ॥ Devanagari double danda + case 0x06D4: // ۔ Urdu full stop + return true; + default: + return false; + } +} + std::vector split_for_streaming( const std::string & text, int target_tokens, diff --git a/tts-cpp/src/supertonic_chunker.h b/tts-cpp/src/supertonic_chunker.h index acce112adc2..99c0142ce53 100644 --- a/tts-cpp/src/supertonic_chunker.h +++ b/tts-cpp/src/supertonic_chunker.h @@ -15,6 +15,7 @@ // count that the engine will see. No model tokenizer call is required // for sizing. +#include #include #include @@ -42,4 +43,11 @@ std::vector split_for_streaming( int tolerance_pct = 20, int min_chunk_tokens = 30); +// Sentence-end predicate over a Unicode code point. Public so the +// engine's per-chunk "does this end on a natural sentence terminator?" +// helper can share the table with the chunker's boundary search — +// keeps additions (e.g. Ethiopic ።, Tibetan ། in the future) in one +// place. See supertonic_chunker.cpp for the full set. +bool is_sentence_end_cp(uint32_t cp); + } // namespace tts_cpp::supertonic::detail diff --git a/tts-cpp/src/supertonic_cli.cpp b/tts-cpp/src/supertonic_cli.cpp index f4fe3da574f..09845685b78 100644 --- a/tts-cpp/src/supertonic_cli.cpp +++ b/tts-cpp/src/supertonic_cli.cpp @@ -49,15 +49,19 @@ tts_cpp::supertonic::Precision parse_precision(const std::string & s) { } // Emit `pcm` as raw signed-16-bit little-endian samples on stdout. Used -// by the streaming path so a consumer like `aplay -f S16_LE -r 44100 -c 1` -// can begin playback as soon as the first chunk arrives. Returns once -// the buffer has been written and flushed. +// by the streaming path so a consumer like `ffplay -f s16le -ar 44100 ...` +// can begin playback as soon as the first chunk arrives. 
Builds the +// full chunk's worth of int16 into a contiguous buffer and writes it +// with a single fwrite — a per-sample fwrite loop would do ~44k-132k +// syscall-adjacent calls per chunk and noticeably tax streaming +// throughput on slower terminals / pipes. void stream_emit_pcm_stdout(const float * pcm, std::size_t samples) { + std::vector<int16_t> buf(samples); for (std::size_t i = 0; i < samples; ++i) { - float c = std::max(-1.0f, std::min(1.0f, pcm[i])); - int16_t v = (int16_t) std::lrintf(c * 32767.0f); - std::fwrite(&v, 2, 1, stdout); + float c = std::max(-1.0f, std::min(1.0f, pcm[i])); + buf[i] = (int16_t) std::lrintf(c * 32767.0f); } + std::fwrite(buf.data(), sizeof(int16_t), samples, stdout); std::fflush(stdout); } diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index f44b62d6ca4..24b1f35e066 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -112,18 +112,21 @@ class numpy_random_state { // Heuristic: does this chunk end at a natural sentence terminator? // Used by streaming to decide whether to skip the auto-appended period -// (continuation chunks) or keep it (complete-sentence chunks). -// Comma / clause punctuation are NOT sentence terminators here — -// chunks ending in commas still want is_continuation=true so the model -// hears them as a continuation, not a mini-sentence. +// (continuation chunks) or keep it (complete-sentence chunks). Commas +// and other clause punctuation are NOT counted here — chunks ending in +// a comma still want is_continuation=true so the model hears them as +// a continuation, not a mini-sentence. +// +// Trims trailing whitespace, then decodes the final UTF-8 code point +// and delegates to the chunker's `is_sentence_end_cp` so the +// terminator table is defined in exactly one place (see +// supertonic_chunker.cpp). bool chunk_ends_with_sentence_term(const std::string & s) { - // Trim trailing ASCII whitespace. 
size_t i = s.size(); while (i > 0 && (s[i - 1] == ' ' || s[i - 1] == '\t' || s[i - 1] == '\n' || s[i - 1] == '\r')) --i; if (i == 0) return false; - if (s[i - 1] == '.' || s[i - 1] == '?' || s[i - 1] == '!') return true; - // Decode the final UTF-8 code point: scan back to the leading byte. + // Walk back to the leading byte of the final UTF-8 sequence. size_t pos = i - 1; while (pos > 0 && ((uint8_t) s[pos] & 0xC0) == 0x80) --pos; const size_t bytes = i - pos; @@ -137,15 +140,7 @@ bool chunk_ends_with_sentence_term(const std::string & s) { ((s[pos + 1] & 0x3F) << 12) | ((s[pos + 2] & 0x3F) << 6) | (s[pos + 3] & 0x3F); - switch (cp) { - case 0x3002: case 0xFF1F: case 0xFF01: // 。 ? ! - case 0x203C: case 0x2047: case 0x2048: case 0x2049: // ‼ ⁇ ⁈ ⁉ - case 0x0964: case 0x0965: // । ॥ - case 0x06D4: // ۔ - return true; - default: - return false; - } + return detail::is_sentence_end_cp(cp); } } // namespace From 16c2cd2af9c6d448c3e462b480da13d207faa3f3 Mon Sep 17 00:00:00 2001 From: ogad-tether Date: Mon, 18 May 2026 12:51:57 +0100 Subject: [PATCH 6/6] =?UTF-8?q?tts-cpp:=20supertonic=20streaming=20?= =?UTF-8?q?=E2=80=94=20log=20per-chunk=20is=5Fcontinuation=20under=20SUPER?= =?UTF-8?q?TONIC=5FLOG=5FCHUNKS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds one line per chunk to the existing SUPERTONIC_LOG_CHUNKS env-var trace, showing the is_continuation flag the engine resolved before handing the chunk to run_single_chunk: chunk[0] (44 bytes): The quick brown fox jumps over the lazy dog. chunk[0] is_continuation=0 chunk[1] (64 bytes): Then she said hello to the world, ... 
chunk[1] is_continuation=0 Useful for validating that the engine's per-chunk continuation detector and the chunker's boundary search agree on what counts as a sentence terminator across UTF-8 — they share the same detail::is_sentence_end_cp table, but the engine reaches it via a UTF-8-decode of the final code point in the chunk string, so the two paths can in principle disagree on a malformed input. The log makes that observable in one place. Co-Authored-By: Claude Opus 4.7 (1M context) --- tts-cpp/src/supertonic_engine.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index 24b1f35e066..58be89d0c0d 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -394,6 +394,11 @@ struct Engine::Impl { // (un-punctuated) tail so the model treats them as a // continuation. const bool is_continuation = !chunk_ends_with_sentence_term(chunks[k]); + if (const char * env = std::getenv("SUPERTONIC_LOG_CHUNKS"); + env && env[0] == '1') { + std::fprintf(stderr, "chunk[%d] is_continuation=%d\n", + k, (int) is_continuation); + } SynthesisResult chunk_res = run_single_chunk(chunks[k], opts.seed, is_continuation);