diff --git a/tts-cpp/.gitignore b/tts-cpp/.gitignore index ca1d3c4c339..ba5670bf11a 100644 --- a/tts-cpp/.gitignore +++ b/tts-cpp/.gitignore @@ -1,5 +1,8 @@ # Vendored ggml (cloned separately at setup time; see README) -ggml/ +/ggml/ +# (We DO commit cmake/vcpkg-overlay-ports/ggml/ — it's the QVAC ggml port +# overlay carrying our Supertonic custom-op patches. The `/ggml/` above is +# anchored to the tts-cpp root only.) # Build artifacts build/ diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt index b1521db83c0..d404842c064 100644 --- a/tts-cpp/CMakeLists.txt +++ b/tts-cpp/CMakeLists.txt @@ -115,23 +115,23 @@ if (NOT TARGET ggml) endif() add_library(ggml ALIAS ggml::ggml) else() - # In-tree subtree of qvac-ext-lib-whisper.cpp: the standalone - # patches/ folder + scripts/setup-ggml.sh tooling is intentionally - # absent here. Without them, an add_subdirectory(ggml) build - # would silently miss the ggml-backend-reg-filename-prefix patch - # that GGML_BACKEND_DL_PROJECT_PREFIX="speech-" depends on, so - # libspeech-ggml-*.so files would exist on disk but the runtime - # loader would still search for libggml-*.so under - # GGML_BACKEND_DL=ON. Reject up front with a pointer at the - # right consumption path. - if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/patches") + # Bundled-ggml dev build path (TTS_CPP_USE_SYSTEM_GGML=OFF). + # Expects `tts-cpp/ggml/` to be a checkout of the + # tetherto/qvac-ext-ggml repo on the `speech` branch — the QVAC + # fork carrying every infrastructure patch + the Supertonic 2 + # fused custom op family as commits (not as a patches/ overlay). + # + # Run `bash tts-cpp/scripts/setup-ggml.sh` first to clone + + # check out the pinned commit. No patches/ directory is + # consulted: the speech branch is already pre-patched at the + # commit level. + if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ggml/CMakeLists.txt") message(FATAL_ERROR - "tts-cpp: this in-tree subtree does not ship the patches/ " - "directory. 
Pass -DTTS_CPP_USE_SYSTEM_GGML=ON to consume " - "the QVAC speech-stack `ggml-speech` vcpkg port (which " - "carries the pre-applied patches), or use the standalone " - "github.com/gianni-cor/chatterbox.cpp repo for a " - "bundled-ggml dev build with patches/ present.") + "tts-cpp: bundled-ggml build requires tts-cpp/ggml/ to be " + "a checkout of tetherto/qvac-ext-ggml@speech. Run " + "`bash tts-cpp/scripts/setup-ggml.sh` first, or pass " + "-DTTS_CPP_USE_SYSTEM_GGML=ON to consume the QVAC " + "speech-stack `ggml-speech` vcpkg port.") endif() add_subdirectory(ggml) endif() diff --git a/tts-cpp/PROGRESS_SUPERTONIC.md b/tts-cpp/PROGRESS_SUPERTONIC.md index 6f0b34b122e..3c65019810e 100644 --- a/tts-cpp/PROGRESS_SUPERTONIC.md +++ b/tts-cpp/PROGRESS_SUPERTONIC.md @@ -612,9 +612,908 @@ spelled out (most TDD, written before the implementation lands). `flash_attn_f32_f16` enabled) to confirm the Supertonic bottleneck shifts from custom CPU ops to `kernel_mul_mm_f32_f32` and the same convnext block shape that chatterbox already profiled. +- ~~Evaluate GPU backends after CPU graph structure is fully stable.~~ — initial + Metal port landed 2026-05-11; see "Metal baseline (2026-05-11)" below. - Add CI coverage for converter help/setup syntax and portable Supertonic build targets. +## Metal baseline (2026-05-11) + +First end-to-end Metal run of the Supertonic 2 pipeline. Approach mirrors +Chatterbox's pattern: single `ggml_backend_metal_init()` at model load, no +backend scheduler, and CPU-only `ggml_custom_4d` fast paths switched off when +`!ggml_backend_is_cpu(model.backend)` so the same graph builders fall through +to stock `ggml_im2col` + `ggml_mul_mat` (etc.) when the backend is Metal. + +Implementation: + +- `model_prefers_cpu_kernels(const supertonic_model &)` added in + `src/supertonic_internal.h`. Returns `true` when `model.backend == nullptr` + or `ggml_backend_is_cpu(model.backend)`. 
+- Per-stage helpers (`conv1d_f32`, `depthwise_same_ggml`, `layer_norm_ggml`, + `dense_matmul_time_ggml`, `bias_gelu_ggml`, `pw2_residual_ggml`, + `conv1d_causal_ggml`, `depthwise_conv1d_causal_ggml`, plus the tail-update + custom op in `vector_estimator.cpp`) now take a `bool use_cpu_fastpath` and + AND it into the existing dtype/shape gates. +- Per-stage builders inject + `const bool use_cpu_fastpath = model_prefers_cpu_kernels(model);` at the top + and pass it down through `vector_convnext_ggml`, `convnext_block_ggml`, the + text/vector/style attention cache builders, the tail graph builder, and the + trace builder. +- `text_encoder.cpp` and `duration.cpp` accept the flag for call-site + uniformity but mark it `[[maybe_unused]]` — those stages have always built + their graphs via stock ggml ops and are Metal-safe at HEAD. +- `supertonic_bench.cpp` gains `--n-gpu-layers N` (passed through to + `load_supertonic_gguf`) so the same harness drives CPU and Metal. + +Smoke test (`supertonic-cli --n-gpu-layers 1`) produces a 1.44 s WAV that is +byte-length-identical to the CPU output, confirming the graph builders run +end-to-end on Metal. A `GGML_ASSERT([rsets->data count] == 0)` fires inside +`ggml_metal_device_free` at process exit (atexit ordering with Metal's +residency-set finaliser) — same shape as the Chatterbox `t3_stack_registry` +atexit issue; cosmetic, fires after the WAV is fully written. Mitigation TBD. 
+ +Benchmark (Apple M2, q8_0 GGUF, 4 threads, 3.204 s of audio, 5-step CFM, 5 runs ++ 1 warmup, same flags as `supertonic-cpp.json` / `supertonic-onnx-cpu.json`): + +| Stage | CPU q8_0 | Metal q8_0 | Δ vs CPU | ONNX CPU f32 | +|-----------------------------|-----------:|-----------:|---------:|-------------:| +| preprocess | 0.01 ms | 0.01 ms | — | 0.06 ms | +| duration | 1.76 ms | 2.50 ms | +0.74 | 1.48 ms | +| text_encoder | 13.44 ms | 13.83 ms | +0.39 | 9.04 ms | +| vector_estimator (5 steps) | 94.86 ms | 173.08 ms | +78.22 | 82.65 ms | +| vocoder | 43.44 ms | 59.74 ms | +16.30 | 51.32 ms | +| **total** | **153.5** | **249.9** | **+96.4 (+63%)** | **144.9** | +| RTF | 0.048 | 0.078 | | 0.045 | +| real-time multiplier | 20.9× | 12.8× | | 22.1× | + +Verdict: the Metal port is **correctness-validated but slower than CPU at this +graph shape**. Two ggml-side stages dominate the regression: + +- **`vector_estimator` +82 %** (94.9 → 173.1 ms median). The 5 denoising steps + build many small ConvNeXt graphs (depthwise + pointwise + norm + GELU + + pointwise, repeated across blocks). On M2 these become Metal kernel + launches that are too short to amortise launch overhead; the CPU fast paths + (cblas-backed `pointwise_op` / unrolled depthwise K=5) had a real lead. +- **`vocoder` +38 %** (43.4 → 59.7 ms median). Same kernel-launch-bound + pattern, smaller deficit because the vocoder graph is a single persistent + cgraph that's reused across calls (less per-step overhead than the + vector-estimator's per-block cgraphs). + +`text_encoder` and `duration` are unchanged within noise — expected, those +already used the stock-op path on CPU. + +`supertonic-bench --runs 8 --warmup 3 --n-gpu-layers 1` drifted to ~288 ms +median (up from ~250 ms at runs=5 / warmup=1), suggesting Metal residency +sets accumulate across calls in this harness; investigate before drawing +percentile-style conclusions from longer Metal runs. 
+ +Artifacts: `artifacts/bench/supertonic-cpu.json`, +`artifacts/bench/supertonic-cpu-after.json` (post-gating CPU regression +check, median 158.2 ms / +3 % vs the pre-port baseline — within noise), +`artifacts/bench/supertonic-metal.json`, +`artifacts/bench/supertonic-onnx-cpu.json`, +`artifacts/bench/supertonic-onnx-coreml.json`, +`artifacts/bench/metal-phase-a.txt` (the Phase A failure-mode trace before +gating). + +### Next: Metal optimisation passes (Phase E in the plan) + +Backlog **revised after the 2026-05-11 dispatch-count profile** (see +"Dispatch-count profile" below). The pre-profile working hypothesis +(step batching, QKV stacking, f16 weights) turned out to be wrong on +multiple counts. Revised priority order: + +1. **Single-graph consolidation per CFM step (THE PR).** The diagnostic + shows ~21 separate `graph_compute` calls per step (front prep + + text-attention + style-qkv + style-attention + style-residual-norm + inline × 4 groups + tail). On M2 each call carries ~1.86 ms of fixed + command-buffer overhead regardless of node count. Consolidating into + ONE `ggml_cgraph` per step (5 dispatches per synth, projected total + Metal ~46 ms) is by far the biggest win available; the rest of the + backlog only matters if this leaves residual gap. Specific work + below. +2. **(Was step batching across CFM iterations.)** Closed: the CFM step + loop has a sequential dependency (`latent.swap(next)` at + `supertonic_engine.cpp:240`), so Chatterbox-style batching along + `ne[2]` doesn't apply here. The win from item 1 above is bigger + anyway; revisit only if a future flow-matching variant decouples the + steps. +3. **(Was QKV stacking on text-attention.)** Deprioritised. With item 1 + the QKV matmuls live inside the same dispatch as everything else — + stacking saves 3 in-graph nodes per attention but doesn't reduce + dispatch count. 
Only worth doing if Metal frame capture shows the + three per-attention `kernel_mul_mm` launches are individually + expensive after consolidation. +4. **(Was f16 weights for Metal.)** Closed: f16 GGUF is *slower* than + q8_0 on both CPU and Metal (see "f16 GGUF experiment (2026-05-11)" + below). q8_0's weight-bandwidth win beats f16's no-dequant on this + graph shape. +5. **Custom Metal depthwise kernel.** Standby — only revisit if item 1 + leaves ConvNeXt depthwise as the residual hotspot. The `im2col + + mul_mat` fallback would be replaceable with a single + `kernel_depthwise_conv_1d` per call; `test/test_metal_ops.cpp` is + the parity harness. +6. **Metal `rsets` keep-alive tuning** for long-running daemons. + Cosmetic for benchmarks; investigate if a hosted-service user + reports memory growth. + +### Plan for item 1 — per-step graph consolidation + +Architecture: introduce a `vector_step_full_cache` (per-shape +thread_local) that owns ONE `ggml_context`, ONE `ggml_cgraph`, ONE +`ggml_gallocr`. Build the entire per-step computation (proj_in → +4 × (ConvNeXt blocks + time-add + ConvNeXt + Q/K/V projection + RoPE + +flash-attention + out_fc + residual + layer-norm + style Q/K/V +projection + flash-attention + out_fc + residual + layer-norm) + +last_convnext × 4 + proj_out + mask + noise add) as one graph. ONE +`ggml_backend_graph_compute` per step. + +The existing `build_text_attention_cache`, `build_group_graph_cache`, +`build_res_style_qkv_cache`, and `build_tail_graph_cache` get refactored +into **graph-builder helpers** that accept `(ggml_context*, ggml_cgraph*, +...input ggml_tensor*...)` and return output `ggml_tensor*`, instead of +owning their own contexts. The CPU path keeps the cache-of-subgraphs +architecture (parity, trace mode); only Metal routes through the +consolidated path. Detection via `!ggml_backend_is_cpu(model.backend)` +at the top of `supertonic_vector_step_ggml`. + +**Critical sub-tasks** (the order matters for parity validation): + +1. 
**In-graph RoPE.** Replace the CPU `apply_rope` call with + `ggml_rope_ext` configured for Supertonic's `(t/L) * theta[d]` + formula: `freq_base = 1.0`, `freq_scale = 1.0`, `freq_factors[d] = + L / theta[d]`, `mode = GGML_ROPE_TYPE_NEOX` (split-pairs layout + matches `apply_rope`'s `(i1, i2) = (offset+d, offset+D/2+d)` pattern + per `supertonic_vector_estimator.cpp:1416`). Positions are an + int32 `arange(L_q)` for Q and `arange(L_kv)` for K, set once at + build time. ggml-metal's `kernel_rope_norm`/`kernel_rope_neox` + already compile. + +2. **In-graph layout conversion.** Replace + `tensor_to_time_channel`/`pack_time_channel_for_ggml` host calls + with `ggml_cont(ctx, ggml_transpose(ctx, x))` at the inter-stage + boundaries. + +3. **Compose the orchestrator** so all stages share one ctx/gf. Walk + the existing `supertonic_vector_trace_proj_ggml` flow (lines + 2050–2585) and inline each `run_*_cache` call as graph-builder + helper invocations. + +4. **Parity test.** Add a `test_supertonic_vector_metal_consolidated` + CTest target that compares the consolidated Metal path to the CPU + reference for one step at a representative L (137-ish). Tolerance + ~1e-2 (loose because of float-order effects across the merged + graph). + +5. **Bench.** Re-run `supertonic-bench --n-gpu-layers 1` and target + `SUPERTONIC_COUNT_DISPATCHES=1` to verify total dispatches drop + from 120 to ~10 and total wall to ~46 ms. + +**Size estimate.** ~600–1000 new lines (mostly the consolidated build +function); the existing trace path stays untouched. Trace-mode tests +keep using the old multi-cache orchestrator. + +**Risk.** The two non-trivial pieces are (a) `ggml_rope_ext` parameter +mapping matching CPU `apply_rope` to within 1e-3 — verify before +inlining everything else — and (b) memory budget for one big graph +across all groups (`MAX_NODES=2048` may not be enough; estimate ~3500 +nodes for the full per-step graph). 
+ +Each commit on the consolidation branch should land in a single PR; +the work is too coupled to split cleanly. + +Backlog items 2–6 above stay as separate per-PR follow-ups in their +listed priority. Do not bundle. + +### Dispatch-count profile (2026-05-11) + +Instrumented `supertonic_graph_compute` with a wall-time + node-count +printout gated on the `SUPERTONIC_COUNT_DISPATCHES` env var. Re-running +`supertonic-cli --n-gpu-layers 1 --text "Hello."` on the same M2: + +- **120 graph_compute dispatches per single synth** (entire pipeline, + vector estimator + vocoder + text encoder + duration). +- **Cumulative graph_compute wall: 222.8 ms** out of the ~250 ms total + Metal synth — i.e. graph_compute IS the cost; CPU-side data marshalling + is the residual ~30 ms. +- **Mean per-dispatch wall: 1.86 ms.** Even 17-node tiny dispatches cost + ~770 µs each; 170-node mid graphs cost 1.1–1.7 ms. The fixed + per-dispatch Metal overhead (command-buffer setup + pipeline lookup + + encode + commit + wait) dominates. + +Dispatch distribution (counts × node-size, sorted by frequency): + + 40 × 18 nodes (the 5×8 text-attention sub-graphs per step) + 20 × 12 nodes + 20 × 90 nodes + 15 × 262 nodes (the 5×3 group-prep graphs) + ~25 misc + +The 80 small (≤90 nodes) dispatches account for an estimated ~120 ms of +Metal time. Consolidating them into the larger per-step graphs would +likely halve the gap to the CPU baseline. + +### f16 GGUF experiment (2026-05-11) + +Hypothesis: q8_0 dequant in the per-`mul_mat` path was the Metal +bottleneck. Tested by converting the bundle with `--ftype f16` (132 MB +GGUF vs 252 MB for q8_0) and re-benching: + + Metal q8_0 total median: 249.9 ms + Metal f16 total median: 286.5 ms (+15 %, worse) + CPU q8_0 total median: 153.5 ms + CPU f16 total median: 168.7 ms (+10 %, worse) + +f16 is uniformly *slower* than q8_0, on both CPU and Metal. 
q8_0 +dequant is not the bottleneck — ggml-metal's q8_0 `mul_mat` kernel is +well-tuned for these tensor shapes and the smaller weight bandwidth +helps. Phase E.3 closed; do not pursue an f16-on-Metal variant. + +### Dispatch profiling hook + +`SUPERTONIC_COUNT_DISPATCHES=1 ./build/supertonic-cli ...` prints one +line per `ggml_backend_graph_compute` call: + + supertonic_graph_compute #N nodes=K wall=W us cumul=C ms + +Zero-overhead when the env var is unset (single env var read + +branch-predicted skip). + +## Per-step graph consolidation (landed 2026-05-11) + +Landed `supertonic_vector_step_one_graph_ggml` at the end of +`src/supertonic_vector_estimator.cpp` plus the helpers +`apply_supertonic_rope_ggml`, `append_text_attention_subgraph`, and +the `vector_step_one_graph_cache` struct. Routing in +`supertonic_vector_step_ggml` enables this path **by default on +any non-CPU backend** (Metal, CUDA, Vulkan, OpenCL). CPU keeps +the multi-cache trace_proj path — its CPU fast-paths and +`thread_local` sub-graph caches stay competitive on CPU and trace +mode for parity tests still uses the per-stage outputs. Override +via `SUPERTONIC_DISABLE_ONE_GRAPH=1` if needed. + +### Dispatch + bench numbers (Apple M2, q8_0, 4 threads, 5-step CFM) + +`SUPERTONIC_COUNT_DISPATCHES=1 ./build/supertonic-cli --n-gpu-layers 1` +shows the dispatch profile collapsing from **120 → 20 total +dispatches** per synth (5 of which are 1886-node consolidated +per-step graphs). Mean per-dispatch wall climbs from 1.86 ms to +7.9 ms — more real work per kernel batch, less time burned on +command-buffer setup — and total `graph_compute` wall drops from +222.8 ms to 157.7 ms (-29 %). 
+ +`supertonic-bench` on Metal, 5 runs + 1 warmup, identical flags to +`supertonic-cpu.json` / `supertonic-onnx-cpu.json`: + + | Stage | trace_proj (B) | one-graph (E.cons) | + |-----------------------------|---------------:|-------------------:| + | preprocess | 0.01ms | 0.02ms | + | duration | 2.50ms | 3.87ms | + | text_encoder | 13.83ms | 16.58ms | + | vector_estimator (5 steps) | 173.08ms | 147.83ms | + | vocoder | 59.74ms | 60.51ms | + | **total** | **249.92ms**| **229.06ms**| + | RTF | 0.078 | 0.071 | + | real-time multiplier | 12.82× | 13.99× | + +Net: **-15 % on the dominant vector_estimator stage, -8 % on the +total**. Correctness validated: `cpu-ref` vs `metal-one-graph` for +the same text+seed gives correlation **1.0000**, max abs diff 101 +LSB (CPU peak amplitude 6639, so ~1.5 % — normal Metal-vs-CPU +floating-order noise). No regression vs the Phase B port. + +### Why the win is smaller than projected + +Pre-implementation projection was ~46 ms total (saving the full +~204 ms of dispatch overhead at 1.86 ms × ~110 saved dispatches). +Reality: the per-dispatch overhead estimate (1.86 ms) was an +*average*, not a constant. The new 1886-node consolidated graphs +are big enough that the GPU is actually doing real compute work +during the dispatch — kernel-launch overhead is no longer the +bottleneck; the compute work itself now dominates. + +The bench tells the story: per-step wall time dropped from +~35 ms (= 173/5) to ~30 ms (= 148/5). The Metal device now spends +most of its time actually computing matmuls rather than waiting +on command-buffer plumbing. Further wins now require *less work*, +not *fewer dispatches* — that's items 2-5 of the remaining +backlog (QKV stacking, op fusion, custom depthwise kernel). 
+ +### Implementation notes + +- **`apply_supertonic_rope_ggml`** translates Supertonic's + `angle = (t/L) * theta[d]` formula to `ggml_rope_ext` with + `freq_base=1.0, freq_scale=1.0, freq_factors[d] = L / theta[d]`, + `mode=GGML_ROPE_TYPE_NEOX` (split-pairs rotation matches + `apply_rope`'s `(i1=offset+d, i2=offset+D/2+d)` layout at + `supertonic_vector_estimator.cpp:1416`). Positions are int32 + `arange(q_len)` for Q and `arange(text_len)` for K, set per + call when L or text_len change. ggml-metal's + `kernel_rope_norm`/`kernel_rope_neox` already compile. + +- **Layout invariant: the GGML tensors take channel-major buffers + raw.** The trace_proj_ggml path at lines 2143/2151 sets `x_in` + directly from `noisy_latent` (no host transpose) and `text_in` + directly from `text_emb`; the ne=[L, Cin] / ne=[text_len, 256] + tensors interpret that channel-major buffer as their natural + layout (innermost dim = time = fast-in-memory). My initial + consolidation tried to "helpfully" transpose the inputs into + (t, c) layout, which corrupted the tensor data and produced + correlation 0.0034 garbage on every backend. Fix: direct + `ggml_backend_tensor_set` from raw caller buffers, matching the + existing path exactly. Same fix on the output path + (`ggml_backend_tensor_get` straight into `next_latent_out`). + +- **Cache invalidation:** keyed on `(model.generation_id, L, + text_len, total_steps)`. Rebuilt when any of them changes. The + `vector_step_one_graph_cache` is a single `thread_local` + instance — different Engines / synths share it via the + generation_id key. + +### Remaining Phase E backlog + +**Tier 1 status (2026-05-11):** + +- ✅ **Per-step vector_estimator consolidation** (this PR) — biggest + Tier 1 win, -8 % on total Metal, parity 1.0000. +- ✅ **Vocoder already a single dispatch** (461-node graph) — + no consolidation needed. 
+- ⏸ **text_encoder + duration consolidation** — measured + contribution: ~22 ms cold-start dispatch wall across the 14 + small dispatches that come before the vector_estimator graphs. + Post-warmup the bench shows text_encoder ≈ 17 ms and + duration ≈ 4 ms — most of which is the dispatches themselves; + consolidating to 1 dispatch each would save ~5-10 ms + steady-state. Deferred because relpos_attention has 9 + per-shape mask tensors + intricate + `ggml_view_3d`/`ggml_permute`/`ggml_sum_rows` plumbing that's + not a straight copy of the vector_step pattern — needs its + own focused 2-3 hour session with parity validation harness + before re-enabling on the GPU dispatcher. +- ⏸ **QKV stacking** — once `vector_estimator` is already in + one graph, stacking the three `dense_matmul_time_ggml` calls + saves in-graph nodes but does not reduce the dispatch count. Metal frame + capture didn't show the QKV matmuls as the hot path, so the + expected win is tiny. Pursue only if Tier 2 hits diminishing + returns. +- ⏸ **`ggml_cont` elimination** — the consolidated path does + `ggml_cont(ggml_transpose(...))` for Q/K/V before rope, and + again inside `apply_supertonic_rope_ggml`. These could be + avoided by views with custom strides, but ggml's `view_3d` + doesn't expose `nb0` (only `nb1`/`nb2`), so the cont copies + are required for the rope kernel's expected layout. Could + use `ggml_permute` + careful 4D views to remove some, but + the win is small and the layout-bug risk is high. + +## Tier 2 progress (2026-05-11) — op-level reductions before custom kernels + +Before sinking time into custom .metal kernels via the QVAC +ggml-speech port patches (the original Tier 2 plan), there are +op-level reductions inside the consolidated per-step graph that +trim dispatch count without touching ggml's kernel set. Each +landed as its own commit in PR #15. + +### Diagnostic: `SUPERTONIC_DUMP_OP_HISTOGRAM=1` + +Added an env-var-gated dump of per-graph op-type histograms to +`supertonic_graph_compute`. 
Zero overhead unset. Lets us see +exactly which ggml ops dominate the consolidated graph and which +are pure-metadata (RESHAPE/VIEW/PERMUTE/TRANSPOSE — confirmed +no-op in ggml-metal-ops.cpp:186-195). + +**Consolidated per-step graph at HEAD (post-Tier-2 commits):** + + | op | count | dispatch on Metal? | + |-------------------|------:|--------------------| + | RESHAPE | 580 | no (metadata only) | + | ADD | 197 | yes (often fused) | + | CONT | 148 | yes (memcpy) | + | MUL_MAT | 122 | yes (matmul) | + | IM2COL | 118 | yes (memrearrange) | + | VIEW | 88 | no | + | PERMUTE | 72 | no | + | MUL | 70 | yes (often fused) | + | TRANSPOSE | 68 | no | + | REPEAT | 56 | yes | + | CONCAT | 56 | yes | + | NORM | 36 | yes | + | UNARY | 32 | yes (GELU/SiLU) | + | ROPE | 8 | yes | + | FLASH_ATTN_EXT | 8 | yes | + | SCALE | 1 | yes | + | **total** | **1660** | **852 dispatched** | + +808 of 1660 nodes are metadata-only no-ops — what looks like a +large graph is really ~852 real Metal dispatches per per-step +graph (down from ~1078 dispatched ops in the pre-Tier-2 layout). + +### Landed wins + +1. **`repeat_like` returns the broadcast-compatible reshape + without `ggml_repeat`** — ggml_add/ggml_mul broadcast natively + when one operand has dim==1 in a position the other has dim==N, + so the explicit ggml_repeat was redundant work. All four + supertonic files (vector_estimator, vocoder, text_encoder, + duration) had the same pattern; same fix applied to each. + **-226 REPEAT ops** per step graph. Override via + `SUPERTONIC_FORCE_EXPLICIT_REPEAT=1`. + +2. **`apply_supertonic_rope_ggml` drops the defensive + `ggml_cont`** — the [D, H, q_len] view onto a contiguous + [H*D, q_len] tensor is itself contiguous (nb[0]=elem_size, + nb[1]=D*elem_size, nb[2]=H*D*elem_size = ne[0]*ne[1]*elem_size), + so `ggml_rope_ext` accepts the view directly. **8 fewer + kernel_cpy dispatches per per-step graph** × 5 = 40 saved per + synth. 
+ +### Bench delta + +Apple M2, q8_0, 4 threads, 5-step CFM, 3.20 s of audio, 5 runs + +1 warmup, identical flags to the existing JSON artifacts: + + | Stage | Phase B | post-cons | post-repeat | post-rope-cont | + |-----------------------------|--------:|----------:|------------:|---------------:| + | preprocess | 0.01 ms | 0.02 ms | 0.01 ms | 0.02 ms | + | duration | 2.50 ms | 3.87 ms | 4.15 ms | 4.44 ms | + | text_encoder | 13.83 ms | 16.58 ms | 15.80 ms | 14.97 ms | + | vector_estimator (5 steps) | 173.08 ms | 147.83 ms | 129.23 ms | 123.94 ms | + | vocoder | 59.74 ms | 60.51 ms | 53.91 ms | 53.99 ms | + | **total** | **249.92ms** | **229.06ms** | **203.04ms** | **199.90ms** | + | RTF | 0.078 | 0.071 | 0.063 | 0.062 | + | real-time multiplier | 12.82× | 13.99× | 15.78× | 16.03× | + +**Cumulative Tier 1 + early-Tier-2: -50 ms total (-20 %) vs the +Phase B Metal baseline.** Parity vs CPU reference preserved at +correlation 0.9999, max abs diff 249 LSB (~3.7 % of peak +amplitude 6639 — within the float-order tolerance the +consolidation already trades for one-graph-per-step). Still ~50 +ms behind CPU q8_0 (153 ms) and ONNX CPU (145 ms), but the gap +is closing. + +### Remaining op-level reductions + +- **118 IM2COL ops** are almost all K=1 1×1 convs (called from + `dense_matmul_time_ggml` via the existing `conv1d_f32` graph + fallback). For K=1 the im2col is a transpose; could be + replaced with a direct `ggml_mul_mat` on the transposed + weight/input. Projected ~3-6 ms saved. Tricky to get right + without breaking layout assumptions of consumers. +- **148 CONT ops** — 32 are weight-transpose conts in + `dense_matmul_time_ggml` (per call, but the weight is constant + per shape; could cache the transposed copy at engine + construction). Projected ~5-8 ms saved. +- **56 CONCAT + 56 REPEAT (remaining)** come from + `edge_clamp_pad_1d` materialising the replicate padding. 
A + custom Metal `kernel_supertonic_pad_edge` would collapse these + into one dispatch per padding call. + +### Tier 2 custom Metal kernels + load-time weight prep — landed (2026-05-11) + +Four fused Metal kernels shipped through the local +`tts-cpp/cmake/vcpkg-overlay-ports/ggml/` overlay (chained on top +of the QVAC ggml port via `VCPKG_OVERLAY_PORTS`). Each adds a +new `GGML_OP_SUPERTONIC_*` op with a CPU forward as parity +backstop and a Metal kernel as the production path. Override +each individually with the listed env var. + +1. **`kernel_supertonic_depthwise_1d`** (commit aa4f65c3) — + fuses edge-clamp pad + im2col + mul_mat + add into one Metal + dispatch for K ∈ {3, 5}. Used by every ConvNeXt block in + vector_estimator, vocoder, text_encoder, duration. Override: + `SUPERTONIC_DISABLE_FUSED_DEPTHWISE=1`. +2. **`kernel_supertonic_layer_norm_channel`** (commit 55adf87b) + — fuses permute + cont + ggml_norm + mul + add + permute + + cont into one dispatch. Per time-step, one threadgroup with + simd_sum reductions for mean/var. Override: + `SUPERTONIC_DISABLE_FUSED_LAYER_NORM=1`. +3. **`kernel_supertonic_pw2_residual`** (commit 7a5c0393) — + fuses `add(bias) + mul(gamma) + add(residual)` (3 ops) into + one dispatch at the tail of each vector ConvNeXt block. + Override: `SUPERTONIC_DISABLE_FUSED_PW2_RESIDUAL=1`. +4. **`kernel_supertonic_bias_gelu`** (commit df20115d) — fuses + `add(bias) + gelu_erf` between pw1 and pw2 of every vector + ConvNeXt block. Uses the same `erf_approx` template + as the stock `kernel_gelu_erf_f32` so the fused output is + bit-identical to the unfused chain. Override: + `SUPERTONIC_DISABLE_FUSED_BIAS_GELU=1`. + +Plus a load-time optimization: + +5. **Pre-transposed matmul weights** (commits e935ffb7, + da9553e3) — materialize transposed copies of every + `:onnx::MatMul_*` source weight at engine load time on + non-CPU backends. 
Eliminates the runtime + `cont(transpose(w))` dispatch that `dense_matmul_time_ggml` + (and the direct `ggml_mul_mat` time-projection sites) used + to emit on every graph compute — ~24 cont sites × 5 CFM + steps = 120 dispatches saved per synth. Override: + `SUPERTONIC_DISABLE_WEIGHT_PRETRANSPOSE=1`. + +6. **Vocoder pw1 fused bias_gelu** (commit 64efe99a) — extends + the bias_gelu fusion to the vocoder's ConvNeXt blocks. + `conv1d_causal_ggml(..., b=nullptr, ...)` skips the internal + bias-add and feeds the matmul output to the fused op + directly. CPU keeps its existing cblas-inside path. ~10 + dispatches saved per vocoder pass. + +Also investigated but **not landed**: + +- **Vocoder pw2_residual fusion** (commit 53a58f5b explains + why) — the vocoder stores its block scale as + `gamma.ne[0] == 1` (a single learnable scalar), while + `pw2_residual_ggml` requires `gamma.ne[0] == C`. Shapes + incompatible, would need a new vocoder-specific scalar-gamma + variant op for a ~0.4 ms projected gain — below the noise + floor of the current bench. Skipped. 
+ +### Final Tier 2 bench + +Apple M2, q8_0, 4 threads, 5-step CFM, 3.20 s of audio, 10 +runs + 2 warmup, `--n-gpu-layers 1` (numbers from +`artifacts/bench/supertonic-cpp-metal-final.json`): + + | Stage | Phase B Metal | Tier 2 final | CPU q8_0 ref | + |-----------------------------|--------------:|-------------:|-------------:| + | preprocess | 0.01 ms | 0.02 ms | 0.01 ms | + | duration | 2.50 ms | 6.03 ms | 1.97 ms | + | text_encoder | 13.83 ms | 18.47 ms | 13.44 ms | + | vector_estimator (5 steps) | 173.08 ms | 97.76 ms | 94.86 ms | + | vocoder | 59.74 ms | 52.02 ms | 43.44 ms | + | **total** | **249.92ms** | **174.49ms**| **153.52ms** | + | RTF | 0.078 | 0.054 | 0.048 | + | real-time multiplier | 12.82× | 18.4× | 20.8× | + +**Cumulative Tier 1 + Tier 2 wins: -75 ms total (-30%) vs the +Phase B Metal baseline.** Parity vs CPU q8_0 reference holds +at correlation 0.9999 / L∞ ≈ 1.7e-3 across the whole sequence +— bit-identical pipeline output before/after the optimizations +on Metal. + +The pretranspose A/B (env-var off vs on, same machine state) +is the cleanest single-knob signal: total 182.75 → 174.38 ms +(-8.37 ms), vec_est 108.61 → 100.45 ms (-8.16 ms). + +### Where the remaining 21 ms gap-to-CPU lives + + | Stage | Metal Tier 2 | CPU q8_0 | Gap | + |-----------------------------|-------------:|---------:|-------------:| + | vector_estimator (5 steps) | 97.76 ms | 94.86 ms | 2.90 ms | + | vocoder | 52.02 ms | 43.44 ms | 8.58 ms | + | text_encoder | 18.47 ms | 13.44 ms | 5.03 ms | + | duration / other | ~6 ms | ~1.7 ms | ~4 ms | + | **total** | **174.49ms** | **153.52ms** | **20.97 ms** | + +Vector estimator is now Metal's strongest stage in absolute +terms (within 3 ms of CPU on its 100-ms budget); vocoder is at +parity with ONNX-CPU (52.0 vs 51.3 ms) and is now the dominant +remaining gap-to-CPU. 
Vocoder uses `conv1d_causal_ggml` not +`dense_matmul_time_ggml`, so neither the pretranspose +optimization nor (until 64efe99a) the fused bias_gelu applied +there — the weights are already in conv1d-kernel `[K, IC, OC]` +layout from the GGUF. + +### What's still pursuable post-Tier-2 (not in this round) + +1. **KV stacking on cross-attention** — concat W_key and + W_value along out-dim at load time so the two text-side + matmuls become one (Q stays separate, different input). + ~30 invocations per synth × ~0.1-0.2 ms each ≈ 3-6 ms + projected, but the small matmul size means this might be + noise-bound. Could combine with pretranspose: stack the + pretransposed K+V into one wider weight. +2. **Vocoder `pw2_residual_scalar_gamma` op** — new + vocoder-specific fused op handling `gamma.ne[0]==1`. ~10 + dispatches saved per vocoder pass ≈ 0.4 ms. Below noise + floor; skip unless other wins are found first. +3. **Full ConvNeXt block fusion** (the original T2.3 plan) — + deferred because pw1/pw2 weights are 4C×C ≈ 1MB each, + vastly exceeding M2's 32KB threadgroup memory budget. Would + need to call out to `ggml_mul_mat` for the matmuls, which + defeats most of the fusion benefit. +4. **Activation layout change** — eliminate the 32 remaining + `cont(transpose(activation))` calls on Q/K/V activations per + per-step graph. Would require touching the whole attention + pipeline (rope, flash_attn, output projection) — too + invasive for the projected ~3-5 ms win. +5. **CFM step batching (B=2)** — N/A for Supertonic. The CFM + loop in `supertonic_engine.cpp` is a sequential ODE solver + (each step depends on the previous output), unlike + chatterbox's CFG cond+uncond pairs which fit naturally into + `ne[2]` batching. 
+ +### Tier 2 closing the loop + +The Tier 2 PR (`feat/metal-optimization-supertonic` on +tetherto/qvac-ext-lib-whisper.cpp) lands as: +- 4 custom Metal kernels behind individual env-var gates +- Load-time pretranspose mechanism + helper APIs + (`try_pretransposed_weight`, `dense_matmul_time_pretransposed_ggml`) +- All under a local `tts-cpp/cmake/vcpkg-overlay-ports/ggml/` + port that chains on top of the QVAC ggml port via + `VCPKG_OVERLAY_PORTS`. +- CPU q8_0 perf unchanged (the fused-kernel + pretranspose + paths are all gated on `!use_cpu_fastpath`). +- Parity vs CPU reference: corr 0.9999 / L∞ 1.7e-3 throughout. + +## Phase A + B follow-up (2026-05-11) + +### Landed on this PR after Tier 2 closed + +| Commit | Change | Bench delta (M2, 10 runs) | +|------------|--------|---------------------------| +| `bfb44092` | Phase 0: `--precision {f32,f16,q8_0}` flag + parity harness | 0 ms (infra) | +| `8f0be955` | A1+A2: single command buffer per synth + on-GPU latent through 5-step CFM loop | –1.37 ms total | +| `1b7496f6` | A3 step 1: enable `--precision q8_0` storage on Metal (asymmetric load) | –6.17 ms total | + +Cumulative on top of Tier 2: total **174.49 ms → 166.39 ms** (–4.6%). +Real-time multiplier 18.4× → 19.3×. + +### Why the wins are smaller than the original Phase A+B projection + +The Phase A roadmap projected 30+ ms of cumulative gains. Reality on M2 +delivered ~8 ms. Three things drove the gap: + +1. **Metal command-buffer submission on M2 is much cheaper than I + estimated.** I cited "~1-2 ms fixed overhead per dispatch" based on + an earlier diagnostic; actual cost is closer to 0.1-0.3 ms. A1+A2's + "single command buffer per synth" win (eliminating 4 inter-step + dispatches) was projected –15 to –20 ms, landed at –1.4 ms. +2. **Unified memory makes `tensor_get`/`tensor_set` between stages + nearly free.** There's no PCIe transfer cost to amortize. The + "on-GPU latent" win that's a big deal on discrete-GPU x86 doesn't + apply on Apple silicon. +3. 
**`kernel_mul_mm_q8_0_f32` never fires.** A3's projected –20 to –30 ms + was the matmul-bandwidth win from running ggml's optimized quantized + matmul kernel. But the kernel only dispatches when the quantized + weight is `src0` (a) of `ggml_mul_mat`. Supertonic's `[T, IC]` + activation layout forces the weight into `src1` (b) via the + `conv1d_f32` im2col wrapper, and ggml-metal falls back to a path + that dequantizes to f32 first. **The full A3 win is unlocked by + B2 (activation layout permutation) — and only by it.** + +### A4 (text_encoder + duration consolidation) — deferred + +Analyzed but not implemented: text_encoder currently fires ~10 separate +`ggml_backend_graph_compute` calls (1 ConvNeXt front + 4 relpos attn ++ 4 ffn + 2 speech_prompted_attn × 2-graph pattern). Duration adds +~4 small dispatches. + +Full consolidation into 1-2 graphs would require: +- Extracting each sub-builder (`relpos_attention_ggml`, `ffn_block_ggml`, + `speech_prompted_attention_ggml`) into append-to-graph helpers (the + same shape of refactor that A1+A2 did for the per-CFM-step subgraph). +- Converting the host-side residual + layer_norm + tanh-key-packing + work between sub-graphs into ggml ops. +- Engineering: 4-8 focused hours. +- Realistic return based on A1+A2's measured ratio: **–2 to –4 ms total**. + +Deferred because: (a) ROI per hour is now smaller than B1/B2, (b) the +text_encoder + duration combined budget is only ~21 ms — even a perfect +collapse to 1 dispatch each saves ~5-7 ms maximum, with no compounding +effect on the other stages, (c) it doesn't unlock anything else +downstream (unlike B2 which unlocks A3 step 2). + +Re-evaluate after B2 lands. If the team needs every ms (e.g. for a +constrained-device target), this is the next item to revisit. 
+ +### Next levers on the table + +| Phase | Projected (post-A1+A2 calibration) | Unblocks | Cost | +|-------|-----------------------------------:|----------|------| +| B1 — f16 activations end-to-end | –5 to –10 ms | nothing | medium | +| **B2 — activation layout permutation** | –3 to –5 ms direct, **+ unlocks A3 step 2 (–15 to –25 ms)** | A3 step 2 | high (invasive, touches rope + flash_attn + every attention site) | +| A3 step 2 — q8_0 matmul kernel firing (after B2) | –15 to –25 ms (theoretical) | — | medium-low (B2 does the heavy lifting) | +| B3 — argument buffer reuse | –2 to –5 ms | nothing | high (Metal backend internals) | +| A4 — text_encoder + duration consolidation | –2 to –4 ms | nothing | medium-high | + +**The highest-leverage move now is B2.** Without it, A3's matmul win is +unreachable. The combined B2 + A3-step-2 stack is the only realistic +path to "Metal beats CPU outright on M2." + +### B1 / B2 / B3 status after attempted continuation (2026-05-11) + +After A4 deferred, attempted B1 (f16 end-to-end) and scoped B2. Both +proved bigger than scoped to a single follow-up session. Documented +here for the next round. + +**B1 (f16 activations) — partially scaffolded, deferred:** +- Storage already worked from Phase 0 (load logic converts q8_0 → f16 + correctly in f16 mode). +- Lifting the rejection at load time made compute reach the graph + stage, then fail at `ggml-metal-ops.cpp:2818` (`ggml_metal_op_bin`'s + assertion that both srcs are f32). A non-f32 tensor is flowing into + a `ggml_add` / `ggml_mul` somewhere in the graph — likely an + auto-fused add after a matmul where ggml-metal picks the matmul + output type as f16 instead of f32. +- The cleanup pass needed (audit every binary op's input types and + force-cast where required) is the same kind of work B2 does + comprehensively for activation layout. Pair them in a "graph-wide + type/layout consistency pass" PR. 
+ +**B2 (activation layout permutation) — fully scoped, deferred:** +The 24 `cont(transpose(activation))` calls per per-step graph (3 per +QKV in 8 attention sites = 24, plus the post-attn out projection +transpose) come from converting matmul output `[T, A]` into +`[A, L]` for rope + flash_attn. Eliminating them requires: + +1. **Matmul output layout flip** — output `[A=OC, T]` directly via + `ggml_mul_mat(pretransposed_w_[IC,OC], activation_[IC,T])`. + Requires the activation already in `[IC, T]` format — which + requires every upstream op to produce `[IC, T]`. +2. **New `layer_norm_channel_[C,T]` Metal kernel** — the current + fused kernel assumes `[T, C]` and dispatches one threadgroup per + time step, threads stride over channels. For `[C, T]` the + threadgroup decomposition flips: one threadgroup per channel, + threads stride over time, OR one threadgroup per time step with + different stride math. Roughly 4-8 hours of Metal kernel work. +3. **Audit every `ggml_add` / `ggml_mul` site** for broadcast + compatibility under the new layout (most should work via + `repeat_like`'s native broadcast, but every site needs a check). +4. **Verify rope still works on `[D, L, H]` view** of the new + `[A, L]` activation (likely fine — rope's input is already + width-major). + +The unblocked A3 step 2 win (Metal dispatches +`kernel_mul_mm_q8_0_f32` natively) is what makes B2 worth the work. +Together they target ~25-30 ms of additional Metal speedup vs +current 166 ms. Without A3 step 2, B2 alone delivers ~-3 to -5 ms +(eliminating the cont(transpose) dispatches), which is below the +maintenance cost of the kernel rewrite. + +Realistic estimate: 3-5 focused days as a dedicated PR. Worth doing +when the goal is "Metal beats CPU on M2" — which is currently still +13 ms away (Metal 166 / CPU 153).
+ +**B3 (argument buffer reuse) — scoped, deferred:** +Metal's `MTLIndirectCommandBuffer` lets the host pre-encode a command +buffer once and bind new input arguments per call, eliminating the +per-call command-buffer encoding cost. Equivalent to CUDA Graph +Capture. + +Requires changes inside the ggml-metal backend (the `ggml_metal_op_*` +encode functions, the residency-set lifecycle). Cross-cutting work +touching files outside `tts-cpp/cmake/vcpkg-overlay-ports/ggml/`'s +current patches — could grow the overlay considerably. + +Realistic estimate: ~1 week including upstream-friendly design, +since the right shape of this change is "improve ggml-metal for all +users" not "patch ggml just for Supertonic." Better as a contribution +to the ggml-org project than a Supertonic-private optimization. + +### Closing the loop on Phase A+B follow-up + +Cumulative Metal perf trajectory across this PR: +- Phase B baseline (correctness port): **249.92 ms** +- Tier 2 final (4 fused kernels + pretranspose): **174.49 ms** +- Phase A+B follow-up (A1+A2 + A3 step 1): **166.39 ms** + +That's **-83 ms / -33% total** on Metal vs the starting baseline. +Real-time multiplier 12.82× → 19.34×. CPU q8_0 still wins by 13 ms; +ONNX-CPU by 21 ms. Closing those final gaps requires B2 + A3 step 2 +as outlined above — substantial work, but the path is clear. + +Parity vs CPU reference held at corr ≥ 0.998 / L∞ ≤ 0.05 throughout +every commit. Multi-precision harness (`--precision f32|f16|q8_0`) +ready to validate B1 + A3 step 2 wins when they land. + +### B2 partial landed (2026-05-11) — Metal vec_est beats CPU + +Investigated a smaller-scope B2 implementation and found that the +"swap `ggml_mul_mat` arg order at Q/K/V projection sites" trick +captures most of B2's direct win without any layer_norm kernel +rewrite or full activation-layout permutation. + +The mechanism: `conv1d_f32(im2col, kernel)` produces `[T, A]` (because +mul_mat(im2col_[IC,T], kernel_[IC,OC]) yields [T, OC]). 
The Q/K/V +projection sites then have to `cont(transpose(q_tc))` to get the +`[A, L]` shape that rope + flash_attn want. By calling +`mul_mat(kernel, im2col)` instead — kernel as src0 — the result +lands in `[A, T]` directly. Both operands are still non-transposed +so the assertion passes. + +Shipped as a new `dense_matmul_time_wt_pretransposed_ggml` helper. +Eight call sites updated: 4 text-attention Q/K/V/out + 4 +style-attention Q/K/V/out across all per-step graph groups. ~24 +cont(transpose) dispatches × 5 CFM steps = ~120 ops eliminated +per synth. + +Bench (Apple M2, 10 runs + 2 warmup): +- pre-B2 f32: total 172.56 ms / vec_est 99.07 ms +- **B2 partial f32: total 160.88 ms / vec_est 91.61 ms** +- delta: -11.68 ms total / -7.46 ms vec_est + +**This is the first time Metal vec_est beats CPU baseline** (91.61 +vs 94.86 ms). Total Metal 160.88 ms now within 7.4 ms of CPU's +153.52 ms, and within 16 ms of ONNX's 144.89 ms. + +Cumulative trajectory: +- Phase B baseline: 249.92 ms (12.8× real-time) +- Tier 2 final: 174.49 ms (18.4×) +- Phase A+B + B2 partial: **160.88 ms (19.9×)** ← -36% from start + +**The A3 step 2 unlock (q8_0 matmul kernel dispatch) requires +pretransposing q8_0 weights at load time.** Attempted, but the +`ggml_reshape_3d(w_pre, 1, IC, OC)` call inside the helper produces +an invalid q8_0 tensor when ne[0]=1 (q8_0 requires 32-element +block alignment on the inner dim). A clean q8_0 path needs either +a different reshape strategy (skip the K=1 conv1d framing entirely +and call `ggml_mul_mat(w_pre_q8, im2col_via_a_different_path)`), +or an in-graph `ggml_im2col` that accepts a 2D kernel directly. +Either is a focused half-day's work for ~10-20 ms more savings +(matmul kernel bandwidth). Deferred to a separate session.
+ +### Full B2 + vocoder CT landed (2026-05-12) — Metal fastest end-to-end (vector_estimator + vocoder) + +Built on the B2-partial trick by parameterising every fused custom +Metal kernel on per-axis element strides (`sxt`, `sxc`, `syt`, `syc`) +so the same compiled kernel handles both `[T, C]` and `[C, T]` +activations. ggml overlay-port bumped 12 → 13. Added `_ct` +constructors for `layer_norm_channel`, `depthwise_1d`, `pw2_residual`, +`bias_gelu`, `edge_pad_1d`. + +In `supertonic_vector_estimator.cpp`: new `vector_convnext_ggml_ct` +runs the full ConvNeXt block on `[C, T]` activations. Pointwise +K=1 Conv1d becomes a direct `ggml_mul_mat(w[IC,OC], x[IC,T])` (no +im2col, no transpose). All 16 ConvNeXt blocks in the per-step +graph (prologue × 4 + 3 group_prep × 4 + tail × 4) wrap a single +entry permute and a single exit permute around the chain. + +In `supertonic_vocoder.cpp`: same pattern for the 10-block vocoder +ConvNeXt chain. Vocoder differences vs vector_estimator: (1) +depthwise is causal (left-only pad), no `_ct` causal kernel yet — +stays on `[T, C]` with two intra-block permutes; (2) gamma is +scalar `[1]`, so the `pw2_residual_ct` fused op doesn't fit, keep +unfused `mul(scalar gamma) + add(residual)` tail; (3) `norm_g` / +`norm_b` ship as `[1, C]` — same flatten-with-`ggml_reshape_1d` +quirk as `.gamma` in vector_estimator. + +Discovered along the way: the legacy `pw2_residual_ggml` wrapper's +`gamma->ne[0] == x->ne[1]` gate was silently rejecting the fused +path for ConvNeXt all along (GGUF ships `.gamma` as `[1, C, 1, 1]` +not `[C]`). The `_ct` wrapper flattens it once with +`ggml_reshape_1d`, so this is the first time the fused +`pw2_residual` op actually runs on the ConvNeXt residual.
+ +Bench (Apple M2, q8_0 GGUF, 4 threads, 5-step CFM, 5 runs + 1 warmup, +all four backends benched in sequence on the same machine state): + +| Stage (ms median) | **ggml Metal** | ggml CPU | ONNX CPU | ONNX CoreML | +|------------------------------|---------------:|---------:|---------:|------------:| +| preprocess | 0.02 | 0.01 | 0.05 | 0.05 | +| duration | 3.27 | 1.49 | 1.26 | 8.17 | +| text_encoder | 12.11 | 11.70 | 8.22 | 16.26 | +| **vector_estimator** (5 step)| **57.87** | 90.36 | 77.04 | 177.89 | +| **vocoder** | **17.11** | 39.38 | 49.55 | 50.29 | +| **total** | **91.37** | 142.92 | 136.32 | 255.90 | +| RTF (lower is faster) | **0.029** | 0.045 | 0.043 | 0.080 | +| **real-time multiplier** | **35.1×** | 22.4× | 23.5× | 12.5× | + +Cumulative trajectory: +- Phase B baseline: 249.92 ms (12.8× real-time) +- Tier 2 final: 174.49 ms (18.4×) +- Phase A+B + B2 partial: 160.88 ms (19.9×) +- **Full B2 + vocoder CT: 91.37 ms (35.1×)** ← −63% from Phase B start + +Overrides: `SUPERTONIC_DISABLE_CT_CONVNEXT=1` (vector_estimator), +`SUPERTONIC_DISABLE_CT_VOCODER=1` (vocoder). + +Open follow-ups (small ROI, separate PR): +- Causal-pad mode on `depthwise_1d_ct` → single chain-level + permute for the vocoder (currently 2 intra-block permutes per + block). Projected -1 to -3 ms vocoder. +- B1 — f16 activations end-to-end. Storage loads today; + compute hits `ggml_metal_op_bin`'s f32 assertion. Needs a + graph-wide binary-op type cleanup. +- B3 — argument buffer reuse via `MTLIndirectCommandBuffer`. + Better as an upstream ggml-metal contribution than a + Supertonic-private patch. + +### Out of scope for this baseline + +- CUDA/Vulkan paths (host is Apple silicon; address Metal first). +- Multilingual / non-English voice perf — voice-agnostic. 
+ ### Distribution - Publish generated GGUFs externally if reviewers/users should avoid local diff --git a/tts-cpp/README.md b/tts-cpp/README.md index 9a8d2286c99..b46c1ed4ea9 100644 --- a/tts-cpp/README.md +++ b/tts-cpp/README.md @@ -338,28 +338,38 @@ target_link_libraries(my_app PRIVATE tts-cpp::tts-cpp) ``` For development out of this in-tree subtree (running the parity -harnesses, prototyping API changes, etc.) the canonical build is: +harnesses, prototyping API changes, etc.) the canonical build is the +**bundled-ggml dev flow**: + +```bash +bash tts-cpp/scripts/setup-ggml.sh # clones qvac-ext-ggml@speech into tts-cpp/ggml/ +cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \ + -DTTS_CPP_USE_SYSTEM_GGML=OFF +cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu) +``` + +`setup-ggml.sh` checks out the pinned tetherto/qvac-ext-ggml@speech +commit (which already carries every QVAC infrastructure patch + the +Supertonic 2 fused custom op family — no `patches/` overlay needed). +CMakeLists's `add_subdirectory(ggml)` path then consumes it directly +with `GGML_NATIVE=ON` for native ARM/SIMD codegen — typically ~10% +faster on M-series than the vcpkg-port flavor's portable build. + +Downstream production builds use the system-installed `ggml` instead: ```bash -# Install the speech-stack ggml port via vcpkg first; then: cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_TOOLCHAIN_FILE=/scripts/buildsystems/vcpkg.cmake cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu) ``` -`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` here so the build picks -up the patched ggml from vcpkg automatically; flipping it `OFF` in -this subtree is rejected at configure time (no `patches/` to apply). 
-GPU acceleration is selected at the ggml-port level - the -`ggml-speech` port already carries the Metal / Vulkan / OpenCL -backend support its consumers ask for; pass `--n-gpu-layers 99` at -runtime to actually use the compiled GPU backend. - -If you need a bundled-ggml dev build (`add_subdirectory(ggml)` with -patches applied locally rather than coming from vcpkg), use the -standalone [`chatterbox.cpp`](https://github.com/gianni-cor/chatterbox.cpp) -repo - the source-of-truth this subtree was copied from - which keeps -`scripts/setup-ggml.sh` + `patches/` for that flow. +`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` for this flow, finding +the `ggml-speech` port from qvac-registry-vcpkg (which pulls +qvac-ext-ggml@speech with patches as commits). GPU acceleration is +selected at the ggml-port level — the port already carries the +Metal / Vulkan / OpenCL backend support its consumers ask for; pass +`--n-gpu-layers 99` at runtime to actually use the compiled GPU +backend. ### Useful CMake options diff --git a/tts-cpp/include/tts-cpp/supertonic/engine.h b/tts-cpp/include/tts-cpp/supertonic/engine.h index 6b50491720f..fad8fffd14d 100644 --- a/tts-cpp/include/tts-cpp/supertonic/engine.h +++ b/tts-cpp/include/tts-cpp/supertonic/engine.h @@ -14,7 +14,15 @@ // // EngineOptions opts; // opts.model_gguf_path = "models/supertonic.gguf"; -// opts.n_gpu_layers = 0; // CPU only today +// opts.n_gpu_layers = 0; // 0 = CPU; >0 enables Metal +// // on macOS / CUDA / Vulkan / +// // OpenCL when compiled in. +// // Metal on Apple silicon is the +// // fastest backend as of 2026-05-12 +// // (~35× realtime on M2, beats +// // ggml-CPU, ONNX-CPU and ONNX-CoreML +// // on every stage that matters). +// // See PROGRESS_SUPERTONIC.md. // // Engine engine(opts); // for (const auto & line : lines) { @@ -43,6 +51,26 @@ namespace tts_cpp::supertonic { +// Compute precision for matmul weights inside the model buffer. 
Selects +// how the GGUF's stored q8_0 weights are loaded into the resident model: +// - F32 (default): expand q8_0 to f32 at load time. CPU path uses +// cblas/AMX f32 matmul. Metal path uses kernel_mul_mat_f32_f32. +// Highest accuracy + simplest, but on Metal misses the 4× +// weight-bandwidth win of running the native q8_0 matmul kernel. +// - F16 (Phase B1): expand q8_0 to f16 at load time, run f16 matmul +// with f32 accumulator. ~2× less activation bandwidth on Metal, +// may drift slightly across the 5 CFM steps (parity tolerance +// relaxed to ~1e-2 L_inf). +// - Q8_0 (Phase A3): keep weights as q8_0 in the model buffer, let +// ggml's quantized matmul kernels dispatch directly. Metal-only +// (Phase A3 makes the load logic asymmetric: q8_0 on Metal, f32 +// on CPU). +enum class Precision { + F32, + F16, + Q8_0, +}; + struct EngineOptions { // Required. std::string model_gguf_path; @@ -56,6 +84,11 @@ struct EngineOptions { int n_threads = 0; int n_gpu_layers = 0; + // Compute precision for matmul weights — see Precision enum above. + // Default F32 is the current behaviour (load q8_0 GGUF, expand to f32). + // F16 / Q8_0 are non-default GPU paths (Metal-validated). + Precision precision = Precision::F32; + // F16 K/V flash-attention in the vector estimator. When -1, the // engine auto-enables this on GPU backends (non-CPU) and disables // it on CPU; pass 1 / 0 to force the setting regardless of the @@ -72,6 +105,9 @@ struct EngineOptions { // Halves the GPU read bandwidth into those ops with a small // (≤ 2e-3 abs / 5e-3 cosine) numerical drift on the end-to-end // synth. Mirrors chatterbox's CHATTERBOX_F16_CFM gate. + // Orthogonal to `precision`: this is a per-op runtime selector for + // the OpenCL hot-weight materialisation, while `precision` decides + // the storage type of all matmul weights uniformly. 
int f16_weights = -1; // Optional path to a .npy file containing the initial noise tensor of diff --git a/tts-cpp/scripts/setup-ggml.sh b/tts-cpp/scripts/setup-ggml.sh new file mode 100755 index 00000000000..656d0b61f24 --- /dev/null +++ b/tts-cpp/scripts/setup-ggml.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# +# setup-ggml.sh — clone the qvac-ext-ggml@speech branch into tts-cpp/ggml/ +# +# The bundled-ggml dev build path for tts-cpp out of this in-tree subtree. +# Replaces the vcpkg-port consumption when you want a fast iteration loop +# without going through vcpkg installs. +# +# Pinned to the head of the `speech` branch (a tetherto/qvac-ext-ggml fork +# of ggml-org/ggml carrying all QVAC infrastructure patches + the +# Supertonic 2 fused custom op family pre-applied as commits — no +# patches/ directory needed at this layer). +# +# Usage: +# bash tts-cpp/scripts/setup-ggml.sh +# cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF +# cmake --build tts-cpp/build -j +# +# To update to a newer pin: bump GGML_REF below and re-run. The script +# is idempotent — re-running checks out the right ref into the existing +# tts-cpp/ggml/ clone without re-cloning. + +set -euo pipefail + +GGML_REPO_URL="https://github.com/tetherto/qvac-ext-ggml.git" +GGML_REF="60a172e48f699bd0a00575ef911feed9473b2187" # merge of qvac-ext-ggml#8 (speech HEAD) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TTS_CPP_DIR="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +GGML_DIR="${TTS_CPP_DIR}/ggml" + +if [ -d "${GGML_DIR}/.git" ]; then + echo "setup-ggml: existing clone at ${GGML_DIR} — fetching + checking out pin ${GGML_REF:0:10}" + git -C "${GGML_DIR}" fetch --depth 1 origin "${GGML_REF}" + git -C "${GGML_DIR}" checkout --detach "${GGML_REF}" +else + echo "setup-ggml: cloning qvac-ext-ggml @ ${GGML_REF:0:10} into ${GGML_DIR}" + rm -rf "${GGML_DIR}" + git clone --depth 1 --no-tags "${GGML_REPO_URL}" "${GGML_DIR}" + git -C "${GGML_DIR}" fetch --depth 1 origin "${GGML_REF}" + git -C "${GGML_DIR}" checkout --detach "${GGML_REF}" +fi + +echo "setup-ggml: tts-cpp/ggml/ ready at $(git -C "${GGML_DIR}" rev-parse --short HEAD)" +echo "setup-ggml: next: cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF" diff --git a/tts-cpp/scripts/validate-precision-parity.sh b/tts-cpp/scripts/validate-precision-parity.sh new file mode 100755 index 00000000000..ce6c29208c8 --- /dev/null +++ b/tts-cpp/scripts/validate-precision-parity.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +# Multi-precision parity + bench harness for Supertonic 2. +# +# For each supported precision (f32, f16, q8_0): +# 1. Synthesizes a reference WAV on CPU at that precision. +# 2. Synthesizes the same WAV on Metal at the same precision. +# 3. Reports parity (corr, L_inf, RMS) between the two. +# 4. Optionally runs supertonic-bench at the same precision and emits +# a per-precision JSON artifact alongside. +# +# Usage: +# bash scripts/validate-precision-parity.sh [--bench] [--text TEXT] [--model PATH] +# [--precisions f32,f16,q8_0] +# +# Precisions not yet wired through the graph builders fail at load with +# a clear "scaffolded but not yet supported" message and are skipped (not +# counted as a parity failure). This lets the harness be useful right +# now while Phase A3 / B1 work lands. + +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +MODEL="$ROOT/models/supertonic2.gguf" +TEXT="The quick brown fox jumps over the lazy dog." 
+PRECISIONS="f32,f16,q8_0" +DO_BENCH=0 +RUNS=10 +WARMUP=2 +THREADS=4 +ARTIFACT_DIR="$ROOT/artifacts/bench/parity-matrix" + +while [[ $# -gt 0 ]]; do + case "$1" in + --bench) DO_BENCH=1; shift ;; + --text) TEXT="$2"; shift 2 ;; + --model) MODEL="$2"; shift 2 ;; + --precisions) PRECISIONS="$2"; shift 2 ;; + --runs) RUNS="$2"; shift 2 ;; + --warmup) WARMUP="$2"; shift 2 ;; + --threads) THREADS="$2"; shift 2 ;; + --artifact-dir) ARTIFACT_DIR="$2"; shift 2 ;; + -h|--help) + sed -n '2,/^set -euo/p' "$0" | sed 's/^# //; s/^#//; /^set -euo/d' + exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 2 ;; + esac +done + +CLI="$ROOT/build/supertonic-cli" +BENCH="$ROOT/build/supertonic-bench" +PY="$ROOT/.venv/bin/python3" +if [[ ! -x "$CLI" ]]; then + echo "build/supertonic-cli not found. Run 'cmake --build build --target supertonic-cli' first." >&2 + exit 1 +fi +if [[ "$DO_BENCH" -eq 1 && ! -x "$BENCH" ]]; then + echo "--bench requested but build/supertonic-bench not found." >&2 + exit 1 +fi +if [[ ! -x "$PY" ]]; then + echo "$PY not found. Activate a venv with numpy + wave installed." >&2 + exit 1 +fi + +mkdir -p "$ARTIFACT_DIR" +TMP="$(mktemp -d)" +trap 'rm -rf "$TMP"' EXIT + +printf "\nSupertonic 2 multi-precision parity + bench harness\n" +printf " model: %s\n" "$MODEL" +printf " text: %.60s%s\n" "$TEXT" "$([[ ${#TEXT} -gt 60 ]] && echo '...')" +printf " precisions: %s\n" "$PRECISIONS" +printf " bench: %s\n\n" "$([[ "$DO_BENCH" -eq 1 ]] && echo 'yes' || echo 'no')" + +OVERALL_RC=0 +IFS=',' read -r -a PREC_ARR <<< "$PRECISIONS" +for P in "${PREC_ARR[@]}"; do + P_TRIM="$(echo "$P" | xargs)" + CPU_WAV="$TMP/cpu-$P_TRIM.wav" + MTL_WAV="$TMP/mtl-$P_TRIM.wav" + + printf "=== %s ===\n" "$P_TRIM" + + set +e + CPU_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 0 \ + --precision "$P_TRIM" --out "$CPU_WAV" 2>&1)" + CPU_RC=$? + MTL_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 1 \ + --precision "$P_TRIM" --out "$MTL_WAV" 2>&1)" + MTL_RC=$? 
+ set -e + + if echo "$CPU_LOG$MTL_LOG" | grep -qE "scaffolded but not yet|partially scaffolded"; then + printf " SKIP: precision %s not yet wired through graph builders (Phase A3/B1)\n\n" "$P_TRIM" + continue + fi + # Tolerate the harmless post-write atexit `GGML_ASSERT([rsets->data count] == 0)` + # that fires on Metal cleanup AFTER the WAV is fully written. Treat the run as + # successful iff the WAV file exists and is at least 1 KB (covers a synthesized + # signal, well above an empty/header-only file). + cpu_ok=1; mtl_ok=1 + [[ -s "$CPU_WAV" ]] || cpu_ok=0 + [[ -s "$MTL_WAV" ]] || mtl_ok=0 + if [[ -f "$CPU_WAV" ]]; then + size=$(wc -c < "$CPU_WAV") + [[ $size -lt 1024 ]] && cpu_ok=0 + fi + if [[ -f "$MTL_WAV" ]]; then + size=$(wc -c < "$MTL_WAV") + [[ $size -lt 1024 ]] && mtl_ok=0 + fi + if [[ $cpu_ok -eq 0 || $mtl_ok -eq 0 ]]; then + printf " FAIL: synthesis errored. cpu_rc=%d mtl_rc=%d wav_ok cpu=%d mtl=%d\n" \ + "$CPU_RC" "$MTL_RC" "$cpu_ok" "$mtl_ok" + printf " --- cpu tail ---\n%s\n --- metal tail ---\n%s\n\n" \ + "$(echo "$CPU_LOG" | tail -3)" "$(echo "$MTL_LOG" | tail -3)" + OVERALL_RC=1 + continue + fi + + "$PY" - <= {tol_corr}) L_inf={linf:.6f} (tol <= {tol_linf}) RMS={rms:.6f}") +ok = corr >= tol_corr and linf <= tol_linf +print(" PASS" if ok else " FAIL parity") +sys.exit(0 if ok else 1) +PY + PY_RC=$? 
+ if [[ $PY_RC -ne 0 ]]; then OVERALL_RC=1; fi + + if [[ "$DO_BENCH" -eq 1 ]]; then + JSON="$ARTIFACT_DIR/supertonic-mtl-${P_TRIM}.json" + printf " bench --> %s\n" "$JSON" + "$BENCH" --model "$MODEL" --text "$TEXT" \ + --voice M1 --language en --steps 5 --speed 1.05 --seed 42 \ + --runs "$RUNS" --warmup "$WARMUP" --threads "$THREADS" \ + --n-gpu-layers 1 --precision "$P_TRIM" \ + --json-out "$JSON" 2>&1 | grep -E '^\s*(vector_estimator|vocoder|text_encoder|total|RTF|Real-time)' || true + fi + printf "\n" +done + +if [[ $OVERALL_RC -eq 0 ]]; then + printf "All wired-up precisions pass parity.\n" +else + printf "One or more precisions failed parity (or errored).\n" >&2 +fi +exit $OVERALL_RC diff --git a/tts-cpp/src/supertonic_bench.cpp b/tts-cpp/src/supertonic_bench.cpp index a410fd8cedc..08ac71fb398 100644 --- a/tts-cpp/src/supertonic_bench.cpp +++ b/tts-cpp/src/supertonic_bench.cpp @@ -46,10 +46,29 @@ void usage(const char * argv0) { " [--voice M1] [--language en] [--steps 5] [--speed 1.05]\n" " [--seed 42] [--noise-npy /path/to/noise.npy]\n" " [--runs 5] [--warmup 1] [--threads N] [--n-gpu-layers N]\n" - " [--f16-attn 0|1] [--json-out FILE]\n", + " [--f16-attn 0|1] [--precision f32|f16|q8_0] (default: f32)\n" + " [--json-out FILE]\n", argv0); } +tts_cpp::supertonic::detail::supertonic_precision parse_bench_precision(const std::string & s) { + using P = tts_cpp::supertonic::detail::supertonic_precision; + if (s == "f32" || s == "F32") return P::F32; + if (s == "f16" || s == "F16") return P::F16; + if (s == "q8_0" || s == "Q8_0" || s == "q8") return P::Q8_0; + throw std::runtime_error("unknown --precision value: " + s + " (expected f32|f16|q8_0)"); +} + +const char * precision_to_string(tts_cpp::supertonic::detail::supertonic_precision p) { + using P = tts_cpp::supertonic::detail::supertonic_precision; + switch (p) { + case P::F32: return "f32"; + case P::F16: return "f16"; + case P::Q8_0: return "q8_0"; + } + return "f32"; +} + double percentile(std::vector v, 
double p) { if (v.empty()) return 0.0; std::sort(v.begin(), v.end()); @@ -123,6 +142,7 @@ int main(int argc, char ** argv) { // Phase 2A — F16 load-time materialization of the hot matmul / // pwconv weights. -1 auto / 0 / 1 force. int f16_weights = -1; + supertonic_precision precision = supertonic_precision::F32; for (int i = 1; i < argc; ++i) { std::string a = argv[i]; @@ -144,6 +164,7 @@ int main(int argc, char ** argv) { else if (a == "--n-gpu-layers") n_gpu_layers = std::stoi(next("--n-gpu-layers")); else if (a == "--f16-attn") f16_attn = std::stoi(next("--f16-attn")); else if (a == "--f16-weights") f16_weights = std::stoi(next("--f16-weights")); + else if (a == "--precision") precision = parse_bench_precision(next("--precision")); else if (a == "--json-out") json_out = next("--json-out"); else if (a == "-h" || a == "--help") { usage(argv[0]); return 0; } else { fprintf(stderr, "unknown arg: %s\n", a.c_str()); usage(argv[0]); return 2; } @@ -151,7 +172,8 @@ int main(int argc, char ** argv) { if (model_path.empty() || text.empty()) { usage(argv[0]); return 2; } supertonic_model model; - if (!load_supertonic_gguf(model_path, model, n_gpu_layers, /*verbose=*/false, f16_weights)) { + if (!load_supertonic_gguf(model_path, model, n_gpu_layers, + /*verbose=*/false, f16_weights, precision)) { fprintf(stderr, "failed to load model\n"); return 1; } @@ -291,7 +313,8 @@ int main(int argc, char ** argv) { printf(" text length: %zu chars\n", text.size()); printf(" voice: %s, language: %s, steps: %d, speed: %.2f\n", voice.c_str(), language.c_str(), steps, speed); - printf(" threads: %d\n", model.n_threads); + printf(" threads: %d, n_gpu_layers: %d, precision: %s\n", + model.n_threads, n_gpu_layers, precision_to_string(precision)); printf(" backend: %s%s\n", ggml_backend_name(model.backend) ? ggml_backend_name(model.backend) : "(unknown)", model.use_f16_attn ? 
" (f16_attn=on)" : ""); @@ -326,6 +349,8 @@ int main(int argc, char ** argv) { os << " \"steps\": " << steps << ",\n"; os << " \"speed\": " << speed << ",\n"; os << " \"threads\": " << model.n_threads << ",\n"; + os << " \"n_gpu_layers\": " << n_gpu_layers << ",\n"; + os << " \"precision\": \"" << precision_to_string(precision) << "\",\n"; os << " \"audio_s\": " << last_audio_s << ",\n"; os << " \"runs\": " << runs << ",\n"; os << " \"warmup\": " << warmup << ",\n"; diff --git a/tts-cpp/src/supertonic_cli.cpp b/tts-cpp/src/supertonic_cli.cpp index eff4309a5b7..0705fa696b5 100644 --- a/tts-cpp/src/supertonic_cli.cpp +++ b/tts-cpp/src/supertonic_cli.cpp @@ -20,10 +20,18 @@ void usage(const char * argv0) { " [--f16-weights 0|1] (load-time F16 materialization for the\n" " audit-identified hot matmul / pwconv weights;\n" " defaults to auto: on for GPU, off for CPU)\n" + " [--precision f32|f16|q8_0] (default: f32)\n" " [--noise-npy /path/to/noise.npy]\n", argv0); } +tts_cpp::supertonic::Precision parse_precision(const std::string & s) { + if (s == "f32" || s == "F32") return tts_cpp::supertonic::Precision::F32; + if (s == "f16" || s == "F16") return tts_cpp::supertonic::Precision::F16; + if (s == "q8_0" || s == "Q8_0" || s == "q8") return tts_cpp::supertonic::Precision::Q8_0; + throw std::runtime_error("unknown --precision value: " + s + " (expected f32|f16|q8_0)"); +} + void write_wav(const std::string & path, const std::vector & wav, int sr) { FILE * f = std::fopen(path.c_str(), "wb"); if (!f) throw std::runtime_error("cannot open output wav: " + path); @@ -72,6 +80,7 @@ int main(int argc, char ** argv) { else if (arg == "--n-gpu-layers") opts.n_gpu_layers = std::stoi(next("--n-gpu-layers")); else if (arg == "--f16-attn") opts.f16_attn = std::stoi(next("--f16-attn")); else if (arg == "--f16-weights") opts.f16_weights = std::stoi(next("--f16-weights")); + else if (arg == "--precision") opts.precision = parse_precision(next("--precision")); else if (arg == "--noise-npy") 
opts.noise_npy_path = next("--noise-npy"); else if (arg == "-h" || arg == "--help") { usage(argv[0]); return 0; } else { fprintf(stderr, "unknown arg: %s\n", arg.c_str()); usage(argv[0]); return 2; } diff --git a/tts-cpp/src/supertonic_duration.cpp b/tts-cpp/src/supertonic_duration.cpp index 936b986065c..68825f68687 100644 --- a/tts-cpp/src/supertonic_duration.cpp +++ b/tts-cpp/src/supertonic_duration.cpp @@ -78,7 +78,14 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik if (!ggml_can_repeat(v, like)) { throw std::runtime_error("cannot repeat tensor in duration graph"); } - return ggml_repeat(ctx, v, like); + // Every caller feeds this into ggml_add/ggml_mul which broadcast natively; + // skip the explicit ggml_repeat dispatch. + static const bool force_explicit_repeat = + std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr; + if (force_explicit_repeat) { + return ggml_repeat(ctx, v, like); + } + return v; } ggml_tensor * conv1d_f32(ggml_context * ctx, @@ -87,6 +94,7 @@ ggml_tensor * conv1d_f32(ggml_context * ctx, int stride, int padding, int dilation) { + // duration uses the pure-graph path unconditionally; no CPU fast path. 
ggml_tensor * im2col = ggml_im2col(ctx, kernel, input, stride, 0, padding, 0, dilation, 0, false, GGML_TYPE_F32); ggml_tensor * result = ggml_mul_mat(ctx, ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1]), @@ -95,6 +103,15 @@ ggml_tensor * conv1d_f32(ggml_context * ctx, } ggml_tensor * edge_clamp_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left, int pad_right) { + if (pad_left == 0 && pad_right == 0) return x; + static const bool disable_fused_edge_pad = + std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr; + if (!disable_fused_edge_pad && + x->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + ggml_is_contiguous(x)) { + return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, pad_right); + } const int64_t L = x->ne[0]; const int64_t C = x->ne[1]; ggml_tensor * out = x; @@ -117,6 +134,16 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx, ggml_tensor * b, int dilation) { const int K = (int) w->ne[0]; + static const bool disable_fused = + std::getenv("SUPERTONIC_DISABLE_FUSED_DEPTHWISE") != nullptr; + if (!disable_fused && (K == 3 || K == 5) && + x->type == GGML_TYPE_F32 && w->type == GGML_TYPE_F32 && + b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && w->ne[1] == 1 && w->ne[3] == 1 && + w->ne[2] == x->ne[1] && b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(w) && ggml_is_contiguous(b)) { + return ggml_supertonic_depthwise_1d(ctx, x, w, b, dilation); + } const int pad_left = ((K - 1) * dilation) / 2; const int pad_right = (K - 1) * dilation - pad_left; ggml_tensor * padded = edge_clamp_pad_1d(ctx, x, pad_left, pad_right); @@ -128,6 +155,15 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx, } ggml_tensor * layer_norm_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * g, ggml_tensor * b) { + static const bool disable_fused_layer_norm = + std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr; + if (!disable_fused_layer_norm && + x->type == GGML_TYPE_F32 && 
g->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + g->ne[0] == x->ne[1] && b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(g) && ggml_is_contiguous(b)) { + return ggml_supertonic_layer_norm_channel(ctx, x, g, b, 1e-6f); + } ggml_tensor * xt = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); xt = ggml_norm(ctx, xt, 1e-6f); xt = ggml_mul(ctx, xt, repeat_like(ctx, g, xt)); diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp index 5007f83e839..b4be7f27ea0 100644 --- a/tts-cpp/src/supertonic_engine.cpp +++ b/tts-cpp/src/supertonic_engine.cpp @@ -122,9 +122,17 @@ struct Engine::Impl { if (!std::filesystem::exists(opts.model_gguf_path)) { throw std::runtime_error(supertonic_setup_hint(opts.model_gguf_path)); } + // Map the public Precision enum onto the internal one (separate + // declaration so the engine header doesn't pull in internal.h). + supertonic_precision internal_precision = supertonic_precision::F32; + switch (opts.precision) { + case Precision::F32: internal_precision = supertonic_precision::F32; break; + case Precision::F16: internal_precision = supertonic_precision::F16; break; + case Precision::Q8_0: internal_precision = supertonic_precision::Q8_0; break; + } if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, /*verbose=*/false, - opts.f16_weights)) { + opts.f16_weights, internal_precision)) { throw std::runtime_error("Supertonic Engine: failed to load GGUF: " + opts.model_gguf_path); } @@ -238,20 +246,24 @@ struct Engine::Impl { std::vector latent_mask((size_t) latent_len, 1.0f); - std::vector next; - for (int step = 0; step < steps; ++step) { - if (cancel_flag.load(std::memory_order_acquire)) { - throw std::runtime_error("Supertonic Engine: cancelled at vector step " - + std::to_string(step)); - } - if (!supertonic_vector_step_ggml(model, latent.data(), latent_len, - text_emb.data(), (int) text_ids.size(), - style_ttl.data(), 
latent_mask.data(), - step, steps, next, &error)) { - throw std::runtime_error("Supertonic Engine: vector estimator failed: " + error); - } - latent.swap(next); + if (cancel_flag.load(std::memory_order_acquire)) { + throw std::runtime_error("Supertonic Engine: cancelled before vector estimator"); + } + // Phase A1+A2: run all CFM steps as ONE ggml graph on non-CPU + // backends. Latent flows step-to-step in GPU memory; on CPU this + // falls back to a per-step loop over `supertonic_vector_step_ggml`. + // Override via SUPERTONIC_DISABLE_LOOP_GRAPH=1. + // NOTE: cancellation granularity is now per-synth on the GPU path + // (worst-case cancel latency = whole CFM loop). CPU keeps per-step + // cancellation via the fallback. + std::vector final_latent; + if (!supertonic_vector_loop_ggml(model, latent.data(), latent_len, + text_emb.data(), (int) text_ids.size(), + style_ttl.data(), latent_mask.data(), + steps, final_latent, &error)) { + throw std::runtime_error("Supertonic Engine: vector estimator failed: " + error); } + latent = std::move(final_latent); if (cancel_flag.load(std::memory_order_acquire)) { throw std::runtime_error("Supertonic Engine: cancelled before vocoder"); diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp index feec5ab7ff7..4f6dd4d5513 100644 --- a/tts-cpp/src/supertonic_gguf.cpp +++ b/tts-cpp/src/supertonic_gguf.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,89 @@ ggml_tensor * get_tensor_or_null(const supertonic_model & model, const std::stri return it == model.tensors.end() ? nullptr : it->second; } +// Compute the storage type for a model tensor given the source type from +// the GGUF and the engine's compute-precision selector. Non-matmul tensors +// (biases, norms, embeddings — stored as f32 in the GGUF) are unaffected; +// only quantized matmul weights actually change destination type. 
+// +// Truth table: +// precision \ src_type | F32 | F16 | Q8_0 +// --------------------------+------+------+------ +// F32 (default) | F32 | F32 | F32 +// F16 (Phase B1) | F32 | F16 | F16 +// Q8_0 (Phase A3) | F32 | F32 | Q8_0 <-- key win: Metal keeps q8_0 +// +// F32 row preserves the historical behaviour exactly. +// Predicate: is `tensor_name` a true matmul weight that lands in a +// `ggml_mul_mat(weight, activation)` call (weight as src0) where Metal +// can dispatch `kernel_mul_mm_q8_0_f32` directly? +// +// Today this is only the vector_estimator's per-step matmul weights — +// those go through `dense_matmul_time_wt_pretransposed_ggml` (the +// B2-partial helper) which uses the pretransposed weight as src0 and +// dispatches the optimised q8_0 mat-mat kernel. +// +// Other GGUF q8_0 sources (text_encoder, duration, speech-prompted +// attention) still flow through `dense_matmul_time_ggml`, which does +// `ggml_cont(ggml_transpose(w))` at compute time — and Metal has no +// CONT kernel for q8_0, so we'd crash. Phase A3 follow-up: extend +// the pretranspose-aware helper to those sites and broaden this +// predicate. +bool is_supertonic_matmul_weight_name(const std::string & name) { + return name.find("vector_estimator:onnx::MatMul_") != std::string::npos; +} + +ggml_type target_supertonic_storage_type(const std::string & name, + enum ggml_type src_type, + supertonic_precision precision, + bool backend_is_cpu) { + // Only quantized matmul-weight tensors are subject to the precision + // selector. Everything else (biases, norms, scales, the unicode + // indexer i32 lookup, etc.) is passed through unchanged so we don't + // attempt a dequant on types that don't have a to_float trait. 
+ const bool is_quantized_weight = + (src_type == GGML_TYPE_Q8_0) || (src_type == GGML_TYPE_F16); + if (!is_quantized_weight) return src_type; + + switch (precision) { + case supertonic_precision::F32: return GGML_TYPE_F32; + case supertonic_precision::F16: + // Asymmetric like q8_0: on CPU dequant everything to f32 (AMX + // cblas takes f32). On non-CPU keep f16 ONLY for true matmul- + // weight tensors that flow through dense_matmul_time_pretransposed_* + // — these dispatch ggml-metal's `kernel_mul_mm_f16_f32` directly. + // Other quantized GGUF tensors (relpos embeddings, conv1d + // kernels, per-channel scales used in plain ggml_mul) flow into + // ggml_metal_op_bin which asserts f32 on both srcs, so we dequant + // them at load. + if (!backend_is_cpu && is_supertonic_matmul_weight_name(name)) { + return GGML_TYPE_F16; + } + return GGML_TYPE_F32; + case supertonic_precision::Q8_0: + // Asymmetric: on CPU, ALWAYS dequant to f32 so cblas/AMX takes + // the weights (q8_0 path on CPU is NEON-only and loses the AMX + // advantage; not worth the parity drift). On non-CPU backends, + // keep q8_0 ONLY for true matmul-weight tensors that flow + // through `dense_matmul_time_wt_pretransposed_ggml`'s + // weight-as-src0 ordering — other quantized GGUF tensors + // (relpos embeddings, conv1d kernels) use op patterns that + // Metal lacks q8_0 kernels for. 
+ if (!backend_is_cpu && + src_type == GGML_TYPE_Q8_0 && + is_supertonic_matmul_weight_name(name)) { + return GGML_TYPE_Q8_0; + } + return GGML_TYPE_F32; + } + return GGML_TYPE_F32; +} + +bool needs_supertonic_tensor_conversion(enum ggml_type src_type, + enum ggml_type dst_type) { + return src_type != dst_type; +} + bool should_expand_supertonic_tensor(enum ggml_type type) { return type == GGML_TYPE_F16 || type == GGML_TYPE_Q8_0; } @@ -97,6 +181,54 @@ std::vector expand_supertonic_tensor_to_f32(const ggml_tensor * src) { return out; } +// Convert a GGUF tensor's data into `out_buf`, which the caller has sized +// to `ggml_row_size(dst_type, n_elems) * (n_rows ...)` — i.e. ggml_nbytes +// for the destination tensor shape. Supports any pair the ggml type +// traits cover: F32 ↔ F16 ↔ Q8_0. Always converts via f32 as the pivot +// because that's the only API surface ggml exports publicly. +void convert_supertonic_tensor_data(const ggml_tensor * src, + enum ggml_type dst_type, + std::vector & out_buf) { + const int64_t n = ggml_nelements(src); + const void * src_data = ggml_get_data(src); + + if (src->type == dst_type) { + // No conversion needed — caller should ideally have skipped this path + // and uploaded the raw GGUF bytes, but handle it for completeness. + const size_t bytes = ggml_nbytes(src); + out_buf.resize(bytes); + std::memcpy(out_buf.data(), src_data, bytes); + return; + } + + // Pivot through f32 using the public ggml_get_type_traits() API. + // `ggml_get_type_traits_cpu()->from_float` is also public for the + // reverse direction (f32 → quantized). 
+ std::vector f32_pivot((size_t) n); + const ggml_type_traits * src_tr = ggml_get_type_traits(src->type); + if (!src_tr || !src_tr->to_float) { + throw std::runtime_error(std::string("Supertonic load: missing to_float for ") + + ggml_type_name(src->type)); + } + src_tr->to_float(src_data, f32_pivot.data(), n); + + if (dst_type == GGML_TYPE_F32) { + out_buf.resize(f32_pivot.size() * sizeof(float)); + std::memcpy(out_buf.data(), f32_pivot.data(), out_buf.size()); + return; + } + + const size_t dst_bytes = ggml_row_size(dst_type, n); + out_buf.resize(dst_bytes); + + const ggml_type_traits_cpu * dst_tr = ggml_get_type_traits_cpu(dst_type); + if (!dst_tr || !dst_tr->from_float) { + throw std::runtime_error(std::string("Supertonic load: missing from_float for ") + + ggml_type_name(dst_type)); + } + dst_tr->from_float(f32_pivot.data(), out_buf.data(), n); +} + ggml_backend_t init_supertonic_backend(int n_gpu_layers, bool verbose) { #ifdef GGML_USE_CUDA if (n_gpu_layers > 0) { @@ -497,6 +629,19 @@ ggml_tensor * require_source_tensor(const supertonic_model & model, const std::s return it->second; } +ggml_tensor * try_source_tensor(const supertonic_model & model, const std::string & source_name) { + auto it = model.source_tensors.find(source_name); + if (it == model.source_tensors.end()) return nullptr; + return it->second; +} + +ggml_tensor * try_pretransposed_weight(const supertonic_model & model, const ggml_tensor * w) { + if (!w) return nullptr; + auto it = model.pretransposed_weights.find(w); + if (it == model.pretransposed_weights.end()) return nullptr; + return it->second; +} + void supertonic_set_n_threads(supertonic_model & model, int n_threads) { configure_supertonic_blas_threads_once(); if (n_threads <= 0) { @@ -510,6 +655,38 @@ void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * grap if (ggml_backend_is_cpu(model.backend) && model.n_threads > 0) { ggml_backend_cpu_set_n_threads(model.backend, model.n_threads); } + static const bool 
count_dispatches = std::getenv("SUPERTONIC_COUNT_DISPATCHES") != nullptr; + static const bool dump_op_histogram = std::getenv("SUPERTONIC_DUMP_OP_HISTOGRAM") != nullptr; + if (dump_op_histogram) { + static thread_local int hist_call = 0; + ++hist_call; + const int n = ggml_graph_n_nodes(graph); + std::map hist; + for (int i = 0; i < n; ++i) { + ggml_tensor * t = ggml_graph_node(graph, i); + hist[ggml_op_name(t->op)] += 1; + } + fprintf(stderr, "=== supertonic_graph_compute #%d op histogram (n_nodes=%d) ===\n", hist_call, n); + std::vector> sorted; + for (auto & kv : hist) sorted.emplace_back(kv.second, kv.first); + std::sort(sorted.rbegin(), sorted.rend()); + for (auto & p : sorted) { + fprintf(stderr, " %4d %s\n", p.first, p.second.c_str()); + } + } + if (count_dispatches) { + static thread_local int n_calls = 0; + static thread_local double total_us = 0.0; + ++n_calls; + const auto t0 = std::chrono::steady_clock::now(); + ggml_backend_graph_compute(model.backend, graph); + const auto t1 = std::chrono::steady_clock::now(); + const double us = std::chrono::duration(t1 - t0).count(); + total_us += us; + fprintf(stderr, "supertonic_graph_compute #%d nodes=%d wall=%.1fus cumul=%.2fms\n", + n_calls, ggml_graph_n_nodes(graph), us, total_us / 1000.0); + return; + } ggml_backend_graph_compute(model.backend, graph); } @@ -547,8 +724,22 @@ bool load_supertonic_gguf(const std::string & path, supertonic_model & model, int n_gpu_layers, bool verbose, - int f16_weights) { + int f16_weights, + supertonic_precision precision) { model.generation_id = next_supertonic_generation_id(); + model.precision_id = static_cast(precision); + // The load path supports F32 / F16 / Q8_0 destination types. + // - F32: fully wired. + // - Q8_0: storage on Metal only for `:onnx::MatMul_*` weights (the + // optimised `kernel_mul_mm_q8_0_f32` dispatches via the swapped- + // args `dense_matmul_time_wt_pretransposed_ggml` helper). Other + // tensors expand to f32. 
On CPU everything expands to f32 so + // cblas/AMX keeps the lead. + // - F16: same asymmetric scheme as Q8_0 — `:onnx::MatMul_*` weights + // stay f16 on Metal (dispatches `kernel_mul_mm_f16_f32`), other + // GGUF-f16 tensors (relpos embeddings, per-channel scales used in + // plain `ggml_mul`) expand to f32 so they don't trip `ggml_metal_op_bin`'s + // f32-only assertion. Pretranspose pass covers f16 alongside f32/q8_0. ggml_context * tmp_ctx = nullptr; gguf_init_params gp = { /*.no_alloc=*/ false, /*.ctx=*/ &tmp_ctx }; gguf_context * gguf_ctx = gguf_init_from_file(path.c_str(), gp); @@ -651,58 +842,76 @@ bool load_supertonic_gguf(const std::string & path, // we use `uint16_t` storage to avoid a public-header dep on // ggml's f16 typedef. std::unordered_map> f16_materialised_tensors; + // Tensors that need a Metal-specific type conversion (e.g. + // f32 → q8_0 for `--precision q8_0`) keep their converted + // bytes here, held alive until the backend upload loop runs. + std::unordered_map> converted_tensors; + + // Ensure the source-alias map is populated even when the + // Phase 2A `use_f16_weights` path didn't already build it — + // the precision-driven decision below also needs it to + // recognise `:onnx::MatMul_` sources for Metal asymmetric load. + if (tensor_to_source_for_alloc.empty()) { + int64_t id_tn = gguf_find_key(gguf_ctx, "supertonic.tensor_names"); + int64_t id_sn = gguf_find_key(gguf_ctx, "supertonic.source_names"); + if (id_tn >= 0 && id_sn >= 0) { + const size_t n_tn = gguf_get_arr_n(gguf_ctx, id_tn); + const size_t n_sn = gguf_get_arr_n(gguf_ctx, id_sn); + if (n_tn == n_sn) { + for (size_t i = 0; i < n_tn; ++i) { + tensor_to_source_for_alloc[gguf_get_arr_str(gguf_ctx, id_tn, i)] = + gguf_get_arr_str(gguf_ctx, id_sn, i); + } + } + } + } // Decide per-tensor destination type: - // - F16 / Q8_0 sources: expand to F32 (legacy behaviour; - // `should_expand_supertonic_tensor`). 
- // - F32 sources on the F16-weights hot-path roster: - // materialise as F16 (Phase 2A). - // - Everything else: preserve the source type via dup. + // 1. F32 sources on the F16-weights hot-path roster + + // `use_f16_weights` on → materialise as F16 (Phase 2A). + // 2. Else fall through to the precision-driven path: + // `target_supertonic_storage_type` returns F32 / F16 / Q8_0 + // depending on `--precision` and whether the source name is + // a `:onnx::MatMul_` weight on a non-CPU backend. + // 3. Anything else preserves the source type via dup. for (int64_t i = 0; i < num_tensors; ++i) { const char * name = gguf_get_tensor_name(gguf_ctx, i); ggml_tensor * src = ggml_get_tensor(tmp_ctx, name); if (!src) throw std::runtime_error(std::string("missing tmp tensor: ") + name); - // Phase 2A predicate check. Only fires when - // `use_f16_weights` was on and the source resolved to - // a hot-roster name AND its current GGML type is - // either F32 or one of the expand-to-F32 types - // (otherwise the source already carries narrower - // precision than F16 and we don't widen). + auto src_it = tensor_to_source_for_alloc.find(name); + const std::string & decision_name = + (src_it != tensor_to_source_for_alloc.end()) ? src_it->second : std::string(name); + + // Phase 2A predicate check (master). 
bool f16_materialise = false; - if (model.use_f16_weights) { - auto sit = tensor_to_source_for_alloc.find(name); - if (sit != tensor_to_source_for_alloc.end() && - should_materialise_f16_weight(sit->second) && - (src->type == GGML_TYPE_F32 || - should_expand_supertonic_tensor(src->type))) { - f16_materialise = true; - } + if (model.use_f16_weights && + should_materialise_f16_weight(decision_name) && + (src->type == GGML_TYPE_F32 || + should_expand_supertonic_tensor(src->type))) { + f16_materialise = true; } ggml_type dst_type; if (f16_materialise) { dst_type = GGML_TYPE_F16; - } else if (should_expand_supertonic_tensor(src->type)) { - dst_type = GGML_TYPE_F32; } else { - dst_type = src->type; + // Precision-driven path (ours): F32 / F16 / Q8_0 per + // the `--precision` flag. Returns src->type unchanged + // for tensors that don't need conversion. + dst_type = target_supertonic_storage_type( + decision_name, src->type, precision, + /*backend_is_cpu=*/ ggml_backend_is_cpu(model.backend)); } - ggml_tensor * dst = ggml_new_tensor(model.ctx_w, dst_type, - ggml_n_dims(src), src->ne); + ggml_tensor * dst = (dst_type == src->type) + ? ggml_dup_tensor(model.ctx_w, src) + : ggml_new_tensor(model.ctx_w, dst_type, ggml_n_dims(src), src->ne); ggml_set_name(dst, name); model.tensors[name] = dst; if (f16_materialise) { - // Materialise F32 → F16 host-side. When src was - // originally F16/Q8_0 we expand to F32 first via - // the existing helper, then convert back to F16 - // — round-trip is lossless for the F16 case (the - // original 16-bit pattern is preserved) and a - // one-shot rounding loss for the Q8_0 case - // (acceptable; matches what Q4_0 + F16 down-quant - // does in chatterbox). + // Phase 2A F16 materialise path. 
std::vector src_f32; if (should_expand_supertonic_tensor(src->type)) { src_f32 = expand_supertonic_tensor_to_f32(src); @@ -716,7 +925,13 @@ bool load_supertonic_gguf(const std::string & path, ggml_fp32_to_fp16_row(src_f32.data(), reinterpret_cast(f16.data()), (int64_t) src_f32.size()); + } else if (needs_supertonic_tensor_conversion(src->type, dst_type)) { + // Precision-driven conversion (ours). Covers f32 → q8_0, + // q8_0 → f32, f16 → f32 etc. Buffered here, uploaded later. + convert_supertonic_tensor_data(src, dst_type, converted_tensors[name]); } else if (should_expand_supertonic_tensor(src->type)) { + // Legacy fallback: f16/q8_0 src with f32 dst that + // didn't go through the conversion helper above. expanded_f32_tensors[name] = expand_supertonic_tensor_to_f32(src); } } @@ -779,16 +994,24 @@ bool load_supertonic_gguf(const std::string & path, continue; } // Phase 2A: F16-materialised tensors take precedence over - // the F32 expansion path (they may have been promoted - // from either F32 or F16/Q8_0 sources). + // the precision-converted / F32-expanded paths (they may + // have been promoted from either F32 or F16/Q8_0 sources). auto f16_mat = f16_materialised_tensors.find(ggml_get_name(cur)); if (f16_mat != f16_materialised_tensors.end()) { ggml_backend_tensor_set(cur, f16_mat->second.data(), 0, f16_mat->second.size() * sizeof(uint16_t)); continue; } - auto expanded = expanded_f32_tensors.find(ggml_get_name(cur)); - if (expanded != expanded_f32_tensors.end()) { + // Precision-driven conversion (`--precision q8_0`/f16 etc.) — + // bytes are already in dst-type representation. 
+ auto converted = converted_tensors.find(ggml_get_name(cur)); + if (converted != converted_tensors.end()) { + ggml_backend_tensor_set(cur, converted->second.data(), 0, + converted->second.size()); + } else if (auto expanded = expanded_f32_tensors.find(ggml_get_name(cur)); + expanded != expanded_f32_tensors.end()) { + // Legacy f16/q8_0 → f32 expansion (used when the + // conversion helper didn't run). ggml_backend_tensor_set(cur, expanded->second.data(), 0, expanded->second.size() * sizeof(float)); } else { @@ -802,14 +1025,21 @@ bool load_supertonic_gguf(const std::string & path, ggml_backend_tensor_get(unicode, model.unicode_indexer.data(), 0, ggml_nbytes(unicode)); } - std::vector tensor_names = get_string_array(gguf_ctx, "supertonic.tensor_names"); - std::vector source_names = get_string_array(gguf_ctx, "supertonic.source_names"); - if (tensor_names.size() != source_names.size()) { - throw std::runtime_error("supertonic tensor/source metadata length mismatch"); - } - for (size_t i = 0; i < tensor_names.size(); ++i) { - ggml_tensor * t = require_tensor(model, tensor_names[i]); - model.source_tensors[source_names[i]] = t; + // Populate the model's source_tensors lookup from the + // GGUF's `supertonic.tensor_names` / `supertonic.source_names` + // pair (the `tensor_to_source_for_alloc` map above only carries + // the same data for the pre-alloc decision; we re-read here so + // we don't have to widen its scope). 
+ { + std::vector tensor_names = get_string_array(gguf_ctx, "supertonic.tensor_names"); + std::vector source_names = get_string_array(gguf_ctx, "supertonic.source_names"); + if (tensor_names.size() != source_names.size()) { + throw std::runtime_error("supertonic.tensor_names / source_names length mismatch"); + } + for (size_t i = 0; i < tensor_names.size(); ++i) { + ggml_tensor * t = require_tensor(model, tensor_names[i]); + model.source_tensors[source_names[i]] = t; + } } for (const std::string & voice_name : get_string_array(gguf_ctx, "supertonic.voice_names")) { @@ -983,6 +1213,128 @@ bool load_supertonic_gguf(const std::string & path, 0, ggml_nbytes(it->second)); } } + + // Materialize pre-transposed copies of matmul weights to drop the + // runtime `cont(transpose(w))` dispatch that `dense_matmul_time_ggml` + // emits on every graph compute (~32 sites × 5 CFM steps per synth). + // CPU's `cblas_sgemm` already handles the transpose via its `Trans` + // flag, so this is a Metal-perf-only optimization — skip the extra + // memory + load-time cost on CPU. Override via + // `SUPERTONIC_DISABLE_WEIGHT_PRETRANSPOSE=1` to debug the unpacked + // path. + // + // Coexists with the F6 pre-transposed t_proj pass above: that one + // handles 4 specific `[512, 64]` `t_proj` weights and registers + // them under the `__T` suffix; this one handles every other + // `:onnx::MatMul_` weight under the `:T` suffix. No collisions. + static const bool disable_pretranspose = + std::getenv("SUPERTONIC_DISABLE_WEIGHT_PRETRANSPOSE") != nullptr; + if (!disable_pretranspose && model.backend && + !ggml_backend_is_cpu(model.backend)) { + std::vector> to_pretranspose; + for (const auto & [src_name, t] : model.source_tensors) { + if (!t) continue; + if (src_name.find(":onnx::MatMul_") == std::string::npos) continue; + if (ggml_n_dims(t) != 2) continue; + // Pretranspose f32 weights (default precision) AND q8_0 / f16 + // weights (asymmetric load modes). 
For q8_0 / f16 we + // dequant→transpose→requantize through f32; the round-trip + // introduces tiny rounding within the type's existing noise + // tolerance. This is what unlocks A3 step 2 + // (kernel_mul_mm_q8_0_f32 / kernel_mul_mm_f16_f32 dispatches + // when both (a) the pretransposed weight is available as + // src0 and (b) the new dense_matmul_time_wt_pretransposed_ggml + // swaps the mul_mat args so the weight is src0). + if (t->type != GGML_TYPE_F32 && + t->type != GGML_TYPE_F16 && + t->type != GGML_TYPE_Q8_0) continue; + to_pretranspose.push_back({src_name, t}); + } + if (!to_pretranspose.empty()) { + ggml_init_params extra_params = { + /*.mem_size=*/ ggml_tensor_overhead() * to_pretranspose.size(), + /*.mem_buffer=*/ nullptr, + /*.no_alloc=*/ true, + }; + model.ctx_w_extra = ggml_init(extra_params); + if (!model.ctx_w_extra) { + throw std::runtime_error("ggml_init ctx_w_extra failed"); + } + std::vector> orig_to_pre; + orig_to_pre.reserve(to_pretranspose.size()); + for (const auto & [src_name, t] : to_pretranspose) { + // Pre tensor has same type as orig (f32 stays f32, + // q8_0 stays q8_0); only the shape swaps. + ggml_tensor * tt = ggml_new_tensor_2d(model.ctx_w_extra, + t->type, t->ne[1], t->ne[0]); + const std::string tt_name = std::string(ggml_get_name(t)) + ":T"; + ggml_set_name(tt, tt_name.c_str()); + model.source_tensors[src_name + ":T"] = tt; + orig_to_pre.push_back({t, tt}); + } + model.buffer_w_extra = + ggml_backend_alloc_ctx_tensors(model.ctx_w_extra, model.backend); + if (!model.buffer_w_extra) { + throw std::runtime_error( + "ggml_backend_alloc_ctx_tensors ctx_w_extra failed"); + } + // Upload the transposed data. For f32 weights this is a + // straight host-side reorder. For q8_0 weights we dequant + // to f32, transpose in f32, then requantize via from_float + // into the pretransposed q8_0 tensor. Both directions go + // through the public ggml type-traits APIs. 
+ for (const auto & [orig, pre] : orig_to_pre) { + const int OC = (int) orig->ne[0]; + const int IC = (int) orig->ne[1]; + const size_t n = (size_t) OC * IC; + + // Step 1: download `orig` data, dequantize to f32 if needed. + std::vector host_orig_f32(n); + if (orig->type == GGML_TYPE_F32) { + ggml_backend_tensor_get(orig, host_orig_f32.data(), 0, + n * sizeof(float)); + } else { + std::vector raw(ggml_nbytes(orig)); + ggml_backend_tensor_get(orig, raw.data(), 0, raw.size()); + const ggml_type_traits * tr = ggml_get_type_traits(orig->type); + if (!tr || !tr->to_float) { + throw std::runtime_error( + std::string("pretranspose: missing to_float for ") + + ggml_type_name(orig->type)); + } + tr->to_float(raw.data(), host_orig_f32.data(), (int64_t) n); + } + + // Step 2: transpose in f32. + std::vector host_pre_f32(n); + for (int oc = 0; oc < OC; ++oc) { + for (int ic = 0; ic < IC; ++ic) { + host_pre_f32[(size_t) ic + (size_t) oc * IC] = + host_orig_f32[(size_t) oc + (size_t) ic * OC]; + } + } + + // Step 3: upload (requantizing if needed). 
+ if (pre->type == GGML_TYPE_F32) { + ggml_backend_tensor_set(pre, host_pre_f32.data(), 0, + n * sizeof(float)); + } else { + const size_t dst_bytes = ggml_row_size(pre->type, n); + std::vector raw(dst_bytes); + const ggml_type_traits_cpu * dtr = + ggml_get_type_traits_cpu(pre->type); + if (!dtr || !dtr->from_float) { + throw std::runtime_error( + std::string("pretranspose: missing from_float for ") + + ggml_type_name(pre->type)); + } + dtr->from_float(host_pre_f32.data(), raw.data(), (int64_t) n); + ggml_backend_tensor_set(pre, raw.data(), 0, raw.size()); + } + model.pretransposed_weights[orig] = pre; + } + } + } } catch (const std::exception & e) { fprintf(stderr, "load_supertonic_gguf: %s\n", e.what()); gguf_free(gguf_ctx); @@ -1009,6 +1361,10 @@ void free_supertonic_model(supertonic_model & model) { if (model.generation_id != 0) { unregister_supertonic_alive(model.generation_id); } + if (model.buffer_w_extra) { + ggml_backend_buffer_free(model.buffer_w_extra); + model.buffer_w_extra = nullptr; + } if (model.buffer_w) { ggml_backend_buffer_free(model.buffer_w); model.buffer_w = nullptr; @@ -1017,10 +1373,15 @@ void free_supertonic_model(supertonic_model & model) { ggml_backend_free(model.backend); model.backend = nullptr; } + if (model.ctx_w_extra) { + ggml_free(model.ctx_w_extra); + model.ctx_w_extra = nullptr; + } if (model.ctx_w) { ggml_free(model.ctx_w); model.ctx_w = nullptr; } + model.pretransposed_weights.clear(); model.tensors.clear(); model.source_tensors.clear(); model.vocoder = {}; diff --git a/tts-cpp/src/supertonic_internal.h b/tts-cpp/src/supertonic_internal.h index 97ae58a3813..d18e84ec131 100644 --- a/tts-cpp/src/supertonic_internal.h +++ b/tts-cpp/src/supertonic_internal.h @@ -9,6 +9,7 @@ #include #include "ggml-backend.h" +#include "ggml-cpu.h" #include "ggml.h" namespace tts_cpp::supertonic::detail { @@ -131,10 +132,29 @@ struct supertonic_model { // Override via `EngineOptions::f16_weights` / `--f16-weights`. 
bool use_f16_weights = false; + // The compute precision the model was loaded with — set by + // `load_supertonic_gguf`. Lets graph builders dispatch precision- + // specific code paths (e.g. asymmetric q8_0 load on Metal). + // Orthogonal to `use_f16_weights` above (that's a per-op runtime + // selector for the OpenCL hot-weight materialisation; this is the + // global storage-type selector). + int precision_id = 0; // holds a supertonic_precision value as int; 0 == supertonic_precision::F32 + std::map tensors; std::unordered_map source_tensors; std::unordered_map voices; + // Pre-transposed copies of matmul weights, materialized at load time + // to eliminate the per-call `cont(transpose(w))` dispatch that + // `dense_matmul_time_ggml` issues on every graph compute. Keyed by + // the ORIGINAL weight tensor pointer (i.e. the value in + // `source_tensors[]`); the mapped value is the transposed + // copy (f32, or requantized by the loader when its type is not f32) with `ne = [IC, OC]`; it lives in `ctx_w_extra` / + // `buffer_w_extra`. Lookup via `try_pretransposed_weight(model, w)`. + ggml_context * ctx_w_extra = nullptr; + ggml_backend_buffer_t buffer_w_extra = nullptr; + std::unordered_map pretransposed_weights; + std::vector unicode_indexer; std::vector languages; std::string tts_json; @@ -217,17 +237,48 @@ struct supertonic_model { // regardless of backend). // See Phase 2A in `aiDocs/PLAN_SUPERTONIC_OPENCL.md` for the // roster + auto-policy rationale. +// +// `precision` (separate concern): selects the storage type for +// matmul weights at GGUF load time. Mirrors the public +// `tts_cpp::supertonic::Precision` enum. F32 is the historical +// default; Q8_0 / F16 trigger asymmetric loads on Metal.
+enum class supertonic_precision { + F32 = 0, + F16 = 1, + Q8_0 = 2, +}; + bool load_supertonic_gguf(const std::string & path, supertonic_model & model, int n_gpu_layers = 0, bool verbose = false, - int f16_weights = -1); + int f16_weights = -1, + supertonic_precision precision = supertonic_precision::F32); void free_supertonic_model(supertonic_model & model); void supertonic_set_n_threads(supertonic_model & model, int n_threads); void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph); +// True when the model's compute backend supports the per-stage CPU fast paths +// (the `ggml_custom_4d` callbacks in conv1d_f32 / depthwise_same_ggml / +// layer_norm_ggml etc.). ggml custom ops are CPU-only by design; on Metal / +// CUDA / Vulkan the helpers must fall through to their stock-ggml-op paths. +// Mirrors the `!ggml_backend_is_cpu(backend)` idiom Chatterbox uses to gate +// its Metal-only batched-CFG path. A null `model.backend` also counts as CPU. +inline bool model_prefers_cpu_kernels(const supertonic_model & model) { + return model.backend == nullptr || ggml_backend_is_cpu(model.backend); +} + ggml_tensor * require_tensor(const supertonic_model & model, const std::string & name); ggml_tensor * require_source_tensor(const supertonic_model & model, const std::string & source_name); +ggml_tensor * try_source_tensor(const supertonic_model & model, const std::string & source_name); + +// Look up a pre-transposed copy of a matmul weight. Returns nullptr if no +// pre-transposed copy was materialized for `w` at load time (e.g. CPU backend +// — pre-transposition is a Metal-perf-only optimization). When non-null, the +// returned tensor has `ne = [IC, OC]` (the swapped layout of `w`) and is +// contiguous in `model.buffer_w_extra`. It is f32 unless the loader requantized it, so callers must check `type`: only an f32 copy may be reshaped as the +// conv1d kernel `[K=1, IC, OC]` directly to skip the cont(transpose(w)).
+ggml_tensor * try_pretransposed_weight(const supertonic_model & model, const ggml_tensor * w); std::string supertonic_preprocess_text(const std::string & text, const std::string & language, @@ -401,6 +452,24 @@ void supertonic_profile_csv_record(const char * stage, const char * island, void supertonic_profile_csv_flush(); void supertonic_profile_csv_set_path(const char * path); +// Phase A1+A2 (Metal): run ALL `total_steps` CFM denoising steps inside +// ONE ggml_cgraph, dispatched with a single ggml_backend_graph_compute +// call. On non-CPU backends this replaces the engine's per-step loop +// entirely (latent stays in GPU memory step-to-step, no host round-trip). +// On CPU it falls back to a per-step loop over `supertonic_vector_step_ggml` +// so the cblas fastpaths still apply. Override the GPU path with +// SUPERTONIC_DISABLE_LOOP_GRAPH=1 to A/B against the per-step path. +bool supertonic_vector_loop_ggml(const supertonic_model & model, + const float * initial_noisy_latent, + int latent_len, + const float * text_emb, + int text_len, + const float * style_ttl, + const float * latent_mask, + int total_steps, + std::vector & final_latent_out, + std::string * error = nullptr); + bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, const float * noisy_latent, const float * text_emb, diff --git a/tts-cpp/src/supertonic_text_encoder.cpp b/tts-cpp/src/supertonic_text_encoder.cpp index 80ee1f44f87..2ea17f4bd93 100644 --- a/tts-cpp/src/supertonic_text_encoder.cpp +++ b/tts-cpp/src/supertonic_text_encoder.cpp @@ -116,7 +116,14 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik else if (like->ne[1] == v->ne[0]) v = ggml_reshape_2d(ctx, v, 1, v->ne[0]); } if (!ggml_can_repeat(v, like)) throw std::runtime_error("cannot repeat tensor in text encoder graph"); - return ggml_repeat(ctx, v, like); + // Every caller feeds this into ggml_add/ggml_mul which broadcast natively; + // skip the explicit ggml_repeat dispatch. 
+ static const bool force_explicit_repeat = + std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr; + if (force_explicit_repeat) { + return ggml_repeat(ctx, v, like); + } + return v; } ggml_tensor * conv1d_f32(ggml_context * ctx, @@ -125,6 +132,8 @@ ggml_tensor * conv1d_f32(ggml_context * ctx, int stride, int padding, int dilation) { + // text_encoder uses the pure-graph path unconditionally; no CPU fast path + // here so no use_cpu_fastpath plumbing. ggml_tensor * im2col = ggml_im2col(ctx, kernel, input, stride, 0, padding, 0, dilation, 0, false, GGML_TYPE_F32); ggml_tensor * result = ggml_mul_mat(ctx, ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1]), @@ -133,6 +142,15 @@ ggml_tensor * conv1d_f32(ggml_context * ctx, } ggml_tensor * edge_clamp_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left, int pad_right) { + if (pad_left == 0 && pad_right == 0) return x; + static const bool disable_fused_edge_pad = + std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr; + if (!disable_fused_edge_pad && + x->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + ggml_is_contiguous(x)) { + return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, pad_right); + } const int64_t L = x->ne[0], C = x->ne[1]; ggml_tensor * out = x; if (pad_left > 0) { @@ -151,6 +169,16 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx, ggml_tensor * w, ggml_tensor * b) { const int K = (int)w->ne[0]; + static const bool disable_fused = + std::getenv("SUPERTONIC_DISABLE_FUSED_DEPTHWISE") != nullptr; + if (!disable_fused && (K == 3 || K == 5) && + x->type == GGML_TYPE_F32 && w->type == GGML_TYPE_F32 && + b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && w->ne[1] == 1 && w->ne[3] == 1 && + w->ne[2] == x->ne[1] && b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(w) && ggml_is_contiguous(b)) { + return ggml_supertonic_depthwise_1d(ctx, x, w, b, 1); + } const int pad_left = (K - 1) / 2; const int pad_right = (K - 1) - 
pad_left; ggml_tensor * padded = edge_clamp_pad_1d(ctx, x, pad_left, pad_right); @@ -162,6 +190,15 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx, } ggml_tensor * layer_norm_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * g, ggml_tensor * b) { + static const bool disable_fused_layer_norm = + std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr; + if (!disable_fused_layer_norm && + x->type == GGML_TYPE_F32 && g->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + g->ne[0] == x->ne[1] && b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(g) && ggml_is_contiguous(b)) { + return ggml_supertonic_layer_norm_channel(ctx, x, g, b, 1e-6f); + } ggml_tensor * xt = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); xt = ggml_norm(ctx, xt, 1e-6f); xt = ggml_mul(ctx, xt, repeat_like(ctx, g, xt)); @@ -683,6 +720,9 @@ void speech_prompted_attention(const supertonic_model & m, int idx, dense_time_matmul(merged, L, C, out_w, out_b, C, out_lc); } +// `speech_attention_cache` + `build_speech_attention_cache` own the +// second-of-two graph caches `speech_prompted_attention_ggml` runs +// (flash-attn + out-proj after host-side q/k/v_pack work). 
struct speech_attention_cache { const supertonic_model * model = nullptr; uint64_t generation_id = 0; @@ -700,19 +740,19 @@ struct speech_attention_cache { ggml_tensor * v = nullptr; }; -void free_speech_attention_cache(speech_attention_cache & cache) { +inline void free_speech_attention_cache(speech_attention_cache & cache) { supertonic_safe_gallocr_free(cache.allocr, cache.generation_id); if (cache.ctx) ggml_free(cache.ctx); cache = {}; } -void build_speech_attention_cache(speech_attention_cache & cache, - const supertonic_model & m, - int idx, - int L, - int Lctx, - const std::string & out_w_source, - const std::string & out_b_source) { +inline void build_speech_attention_cache(speech_attention_cache & cache, + const supertonic_model & m, + int idx, + int L, + int Lctx, + const std::string & out_w_source, + const std::string & out_b_source) { free_speech_attention_cache(cache); cache.model = &m; cache.generation_id = m.generation_id; @@ -748,6 +788,123 @@ void build_speech_attention_cache(speech_attention_cache & cache, ggml_gallocr_alloc_graph(cache.allocr, cache.gf); } +// Phase A4: speech_prompted_attention as ONE merged ggml graph. +// +// Pre-A4 this function built two separate graphs (QKV proj, then +// flash-attn+out-proj) with host-side q_pack/v_pack/k_pack head-split +// work between them. The merged version does the head-split in-graph +// via reshape + permute + cont (or relies on ggml's view semantics +// where it's free), feeds straight into flash_attn, and runs the out +// projection — all in one `ggml_backend_graph_compute` call. +// +// Per call savings: 1 graph dispatch (one fewer command buffer) + +// host-side pack work (3 round-trips of q/v/k_pack data eliminated). +// Two calls per synth = 2 dispatches saved. 
+struct speech_prompted_merged_cache { + const supertonic_model * model = nullptr; + uint64_t generation_id = 0; + int idx = -1; + int L = 0; + int Lctx = 0; + std::string out_w_source; + std::string out_b_source; + std::vector buf; + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t allocr = nullptr; + ggml_tensor * x_in = nullptr; // [L, C] + ggml_tensor * style_in = nullptr; // [Lctx, C] + ggml_tensor * out = nullptr; // [L, C] result +}; + +// Releases the allocator and graph context (if any) and resets `cache` to its +// default-constructed state. +void free_speech_prompted_merged_cache(speech_prompted_merged_cache & cache) { + supertonic_safe_gallocr_free(cache.allocr, cache.generation_id); + if (cache.ctx) ggml_free(cache.ctx); + cache = {}; +} + +// Builds (or rebuilds) the merged Q/V projection + flash-attn + out-projection +// graph for speech-prompted layer `idx` and stores it in `cache`. Any previous +// contents of `cache` are freed first. If any step throws (missing tensor, +// context or allocator failure) the cache is reset to its empty state before +// the exception propagates, so a half-built cache never keeps a valid-looking key. +void build_speech_prompted_merged_cache(speech_prompted_merged_cache & cache, + const supertonic_model & m, + int idx, + int L, + int Lctx, + const std::string & q_w_source, + const std::string & v_w_source, + const std::string & out_w_source, + const std::string & out_b_source, + const std::string & tanh_k_source, + const std::string & q_b_source, + const std::string & v_b_source) { + const int C = 256; + const int half = 128; + const int H = 2; + static_assert(half * H == C, "head split must cover every channel"); + free_speech_prompted_merged_cache(cache); + try { + cache.model = &m; + cache.generation_id = m.generation_id; + cache.idx = idx; + cache.L = L; + cache.Lctx = Lctx; + cache.out_w_source = out_w_source; + cache.out_b_source = out_b_source; + + constexpr int NODES = 512; + const size_t buf_size = ggml_tensor_overhead() * NODES + ggml_graph_overhead_custom(NODES, false); + cache.buf.assign(buf_size, 0); + ggml_init_params gp = { buf_size, cache.buf.data(), true }; + cache.ctx = ggml_init(gp); + if (!cache.ctx) throw std::runtime_error("ggml_init speech_prompted_merged failed"); + cache.gf = ggml_new_graph_custom(cache.ctx, NODES, false); + + cache.x_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, C); + ggml_set_name(cache.x_in, "spm_x_in"); ggml_set_input(cache.x_in); + cache.style_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, Lctx, C); + ggml_set_name(cache.style_in, "spm_style_in"); ggml_set_input(cache.style_in); + + // Q
proj. Output ne=[L, C]. Head-split: reshape to [L, half, H] + // then permute(1, 0, 2, 3) → cont gives [half, L, H] — the layout + // flash_attn views as [head_dim, q_len, n_heads]. + ggml_tensor * q_tc = dense_matmul_time_ggml(cache.ctx, cache.x_in, + require_source_tensor(m, q_w_source), + require_source_tensor(m, q_b_source)); + ggml_tensor * q_3d = ggml_reshape_3d(cache.ctx, q_tc, L, half, H); + ggml_tensor * q_dlh = ggml_cont(cache.ctx, ggml_permute(cache.ctx, q_3d, 1, 0, 2, 3)); + + // V proj on style. Same head-split into [half, Lctx, H]. + ggml_tensor * v_tc = dense_matmul_time_ggml(cache.ctx, cache.style_in, + require_source_tensor(m, v_w_source), + require_source_tensor(m, v_b_source)); + ggml_tensor * v_3d = ggml_reshape_3d(cache.ctx, v_tc, Lctx, half, H); + ggml_tensor * v_dlh = ggml_cont(cache.ctx, ggml_permute(cache.ctx, v_3d, 1, 0, 2, 3)); + + // K is the precomputed tanh_k model tensor. Stored as ne=[Lctx, C]. + // Same head-split: reshape to [Lctx, half, H] then permute to + // [half, Lctx, H] and cont. No per-call host work needed since + // K is constant per model. + ggml_tensor * k_orig = require_source_tensor(m, tanh_k_source); + ggml_tensor * k_3d = ggml_reshape_3d(cache.ctx, k_orig, Lctx, half, H); + ggml_tensor * k_dlh = ggml_cont(cache.ctx, ggml_permute(cache.ctx, k_3d, 1, 0, 2, 3)); + + // Flash attention. Same call shape as the pre-A4 path. Note the scale 1/16 equals 1/sqrt(C), not 1/sqrt(half). + ggml_tensor * attn = ggml_flash_attn_ext(cache.ctx, q_dlh, k_dlh, v_dlh, + nullptr, 1.0f / 16.0f, 0.0f, 0.0f); + attn = ggml_reshape_2d(cache.ctx, attn, C, L); + ggml_tensor * ctx_tc = ggml_cont(cache.ctx, ggml_transpose(cache.ctx, attn)); + + // Output projection.
+ cache.out = dense_matmul_time_ggml(cache.ctx, ctx_tc, + require_source_tensor(m, out_w_source), + require_source_tensor(m, out_b_source)); + ggml_set_name(cache.out, "spm_out"); ggml_set_output(cache.out); + ggml_build_forward_expand(cache.gf, cache.out); + + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend)); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new speech_prompted_merged failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve speech_prompted_merged failed"); + } + if (!ggml_gallocr_alloc_graph(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_alloc_graph speech_prompted_merged failed"); + } + } catch (...) { + // Drop the partially built graph so the key fields cannot match a later lookup. + free_speech_prompted_merged_cache(cache); + throw; + } +} + // F14 — cached speech-prompted attention QKV graph. // // Pre-audit, `speech_prompted_attention_ggml` allocated a fresh @@ -787,6 +944,8 @@ void speech_prompted_attention_ggml(const supertonic_model & m, int idx, const std::string q_w = "text_encoder:" + std::string(idx == 0 ? "onnx::MatMul_3678" : "onnx::MatMul_3682"); const std::string v_w = "text_encoder:" + std::string(idx == 0 ? "onnx::MatMul_3680" : "onnx::MatMul_3684"); const std::string o_w = "text_encoder:" + std::string(idx == 0 ? "onnx::MatMul_3681" : "onnx::MatMul_3685"); + const std::string tanh_k_src = "text_encoder:/speech_prompted_text_encoder/attention" + std::to_string(attn_num) + "/tanh/Tanh_output_0"; + (void) tanh_k_src; // master's path uses model.speech_tanh_k_cache; tanh_k_src kept for symbolic parity with read_f32 fallback below. + // F14: per-(model, idx, L) cached QKV graph.
Two thread-local // slots so the two speech-prompted layers don't fight over a @@ -879,8 +1038,9 @@ void speech_prompted_attention_ggml(const supertonic_model & m, int idx, speech_attention_cache & cache = caches[idx]; if (cache.model != &m || cache.generation_id != m.generation_id || cache.idx != idx || cache.L != L || cache.Lctx != Lctx || - cache.out_w_source != o_w || cache.out_b_source != p + ".out_fc.linear.bias") { - build_speech_attention_cache(cache, m, idx, L, Lctx, o_w, p + ".out_fc.linear.bias"); + cache.out_w_source != o_w) { + build_speech_attention_cache(cache, m, idx, L, Lctx, o_w, + p + ".out_fc.linear.bias"); } ggml_backend_tensor_set(cache.q, q_pack.data(), 0, q_pack.size()*sizeof(float)); ggml_backend_tensor_set(cache.k, k_pack.data(), 0, k_pack.size()*sizeof(float)); diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp index 597957a06f3..42e12f5ff01 100644 --- a/tts-cpp/src/supertonic_vector_estimator.cpp +++ b/tts-cpp/src/supertonic_vector_estimator.cpp @@ -155,7 +155,18 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik std::to_string(like->ne[0]) + "," + std::to_string(like->ne[1]) + "," + std::to_string(like->ne[2]) + "," + std::to_string(like->ne[3]) + "]"); } - return ggml_repeat(ctx, v, like); + // Every call site in this file feeds the return value straight into + // ggml_add / ggml_mul, both of which broadcast natively in ggml. Skip + // the explicit ggml_repeat node so the downstream op handles the + // broadcast — saves ~282 REPEAT ops per consolidated per-step graph. + // Override with SUPERTONIC_FORCE_EXPLICIT_REPEAT=1 if this regresses + // on a backend that doesn't broadcast (none observed today). 
+ static const bool force_explicit_repeat = + std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr; + if (force_explicit_repeat) { + return ggml_repeat(ctx, v, like); + } + return v; } ggml_tensor * conv1d_f32(ggml_context * ctx, @@ -217,6 +228,19 @@ ggml_tensor * conv1d_f32(ggml_context * ctx, } ggml_tensor * edge_clamp_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left, int pad_right) { + if (pad_left == 0 && pad_right == 0) return x; + // Fused fast path via supertonic_edge_pad_1d. Same kernel handles + // both sides; the legacy view + repeat_4d + concat chain (2 ops + // per side) becomes 1 dispatch total. Override: + // SUPERTONIC_DISABLE_FUSED_EDGE_PAD=1. + static const bool disable_fused_edge_pad = + std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr; + if (!disable_fused_edge_pad && + x->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + ggml_is_contiguous(x)) { + return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, pad_right); + } const int64_t L = x->ne[0]; const int64_t C = x->ne[1]; ggml_tensor * out = x; @@ -337,6 +361,23 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx, return custom; } const int K = (int) w->ne[0]; + // Fused-op fast path (any backend that registers GGML_OP_SUPERTONIC_DEPTHWISE_1D + // — Metal does via the local ggml port overlay; CPU's + // ggml_compute_forward_supertonic_depthwise_1d is the parity backstop). + // Replaces the edge_clamp_pad + im2col + mul_mat + add chain with one + // dispatch. Currently supports K in {3, 5}; the existing graph path is + // the fallback for K outside that set. Override with + // SUPERTONIC_DISABLE_FUSED_DEPTHWISE=1 to force the stock-op chain. 
+ static const bool disable_fused = + std::getenv("SUPERTONIC_DISABLE_FUSED_DEPTHWISE") != nullptr; + if (!disable_fused && (K == 3 || K == 5) && + x->type == GGML_TYPE_F32 && w->type == GGML_TYPE_F32 && + b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && w->ne[1] == 1 && w->ne[3] == 1 && + w->ne[2] == x->ne[1] && b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(w) && ggml_is_contiguous(b)) { + return ggml_supertonic_depthwise_1d(ctx, x, w, b, dilation); + } const int pad_left = ((K - 1) * dilation) / 2; const int pad_right = (K - 1) * dilation - pad_left; ggml_tensor * padded = edge_clamp_pad_1d(ctx, x, pad_left, pad_right); @@ -351,6 +392,19 @@ ggml_tensor * layer_norm_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * g, ggml_tensor * b) { + // Fused-op fast path on non-CPU backends (Metal/Vulkan/CUDA/OpenCL): + // GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL collapses the + // permute + cont + ggml_norm + mul + add + permute + cont chain into + // a single dispatch. Override with SUPERTONIC_DISABLE_FUSED_LAYER_NORM=1. + static const bool disable_fused_layer_norm = + std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr; + if (!supertonic_use_cpu_custom_ops() && !disable_fused_layer_norm && + x->type == GGML_TYPE_F32 && g->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + g->ne[0] == x->ne[1] && b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(g) && ggml_is_contiguous(b)) { + return ggml_supertonic_layer_norm_channel(ctx, x, g, b, 1e-6f); + } // CPU-only direct row-wise layer-norm; falls through to permute + // ggml_norm on non-CPU backends so the graph stays GPU-executable. if (supertonic_use_cpu_custom_ops() && @@ -465,6 +519,13 @@ ggml_tensor * dense_matmul_time_ggml(ggml_context * ctx, // tensors are loaded as ne=[OC, IC]. 
Make that transpose contiguous, then // view it as a Conv1d kernel [K=1, IC, OC] so it can consume the repo's // standard time-major activation layout [T, IC]. + // + // Tried replacing this conv1d_f32 wrapper with a direct ggml_mul_mat on + // 2026-05-11 — it requires cont on BOTH operands to satisfy mul_mat's + // !ggml_is_transposed(A) assertion, which yields the SAME dispatch count + // (cont + cont + mul_mat + add) as the current conv1d path (cont + + // im2col + mul_mat + add). Net wash; keeping conv1d_f32 because it's + // already battle-tested with the CPU fastpath. ggml_tensor * wt = ggml_cont(ctx, ggml_transpose(ctx, w)); ggml_tensor * kernel = ggml_reshape_3d(ctx, wt, 1, w->ne[1], w->ne[0]); ggml_tensor * y = conv1d_f32(ctx, kernel, x, 1, 0, 1); @@ -472,9 +533,146 @@ ggml_tensor * dense_matmul_time_ggml(ggml_context * ctx, return y; } +// dense_matmul_time_pretransposed_ggml (defined after the forward declaration that follows): same as dense_matmul_time_ggml, but `model` is consulted for a pre- +// transposed copy of `w` (built at load time for `:onnx::MatMul_*` weights +// on non-CPU backends). When available, the runtime `cont(transpose(w))` +// dispatch is skipped — the pre-transposed tensor already has the +// `[IC, OC]` layout that the conv1d_f32 K=1 kernel expects. CPU callers +// fall through to the original path (the cblas pointwise fast path takes +// the loaded `[OC, IC]` weight directly). +// Forward decl of the width-major variant, needed by the non-f32 branch — defined below. +ggml_tensor * dense_matmul_time_wt_pretransposed_ggml(ggml_context * ctx, + const supertonic_model & model, + ggml_tensor * x, + ggml_tensor * w, + ggml_tensor * b); + +ggml_tensor * dense_matmul_time_pretransposed_ggml(ggml_context * ctx, + const supertonic_model & model, + ggml_tensor * x, + ggml_tensor * w, + ggml_tensor * b) { + if (!supertonic_use_cpu_custom_ops()) { + if (ggml_tensor * w_pre = try_pretransposed_weight(model, w)) { + if (w_pre->type == GGML_TYPE_F32) { + // f32 fast path: reshape w_pre into the conv1d kernel + // [K=1, IC, OC] and dispatch via the existing wrapper.
+ // mul_mat(im2col_f32, kernel_f32) hits the optimised + // kernel_mul_mm_f32_f32. + ggml_tensor * kernel = ggml_reshape_3d(ctx, w_pre, 1, w_pre->ne[0], w_pre->ne[1]); + ggml_tensor * y = conv1d_f32(ctx, kernel, x, 1, 0, 1); + if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y)); + return y; + } + // Non-f32 w_pre (e.g. q8_0) — every type other than f32 lands here: the f32 fast path's + // mul_mat(im2col_f32, kernel_quant) would need a + // kernel_mul_mm_f32_q8_0 variant which ggml-metal doesn't ship. + // Route through the wt helper (kernel as src0 — dispatches + // kernel_mul_mm_q8_0_f32) and transpose the [A, T] result back + // to [T, A] so the caller's downstream code (residual adds, + // [T, C]-shaped intermediate state) doesn't have to change. This costs one extra cont dispatch. + ggml_tensor * y_wt = dense_matmul_time_wt_pretransposed_ggml( + ctx, model, x, w, b); + return ggml_cont(ctx, ggml_transpose(ctx, y_wt)); + } + } + return dense_matmul_time_ggml(ctx, x, w, b); +} + +// Phase B2 partial: like dense_matmul_time_pretransposed_ggml but emits +// the result in *width-major* `[OC, T]` layout instead of `[T, OC]`. +// +// The trick is to swap the `ggml_mul_mat` operand order from +// `mul_mat(im2col_[IC,T], kernel_[IC,OC]) -> [T, OC]` to +// `mul_mat(kernel_[IC,OC], im2col_[IC,T]) -> [OC, T]`. Both operands +// stay non-transposed so the assertion on `a`/`b` is satisfied. The +// kernel-as-`src0` ordering is also what `kernel_mul_mm_q8_0_f32` +// requires, so this single change *also* unlocks A3 step 2 (the +// optimized quantized matmul kernel will dispatch when `w_pre` is +// q8_0 — see the asymmetric load logic in supertonic_gguf.cpp). +// +// Used at the Q/K/V projection sites in the per-step graph: the +// downstream rope + flash_attn expect `[A, L]` layout, so the cont +// (transpose) that used to flip `[L, A]` -> `[A, L]` becomes dead +// code. Eliminates ~24 cont dispatches per per-step graph × 5 +// steps = ~120 ops per synth.
+// +// Bias add: `b` (shape `[OC]`) broadcasts naturally against the +// new `[OC, T]` output via `repeat_like`'s 1-d → 2-d reshape on the +// `ne[0]` match. +// +// Falls through to the legacy `[T, OC]` matmul followed by a runtime cont(transpose) +// of its result when no pretransposed weight is available +// (e.g. weight not on the `:onnx::MatMul_` allowlist) or when CPU custom ops are in use. +ggml_tensor * dense_matmul_time_wt_pretransposed_ggml(ggml_context * ctx, + const supertonic_model & model, + ggml_tensor * x, + ggml_tensor * w, + ggml_tensor * b) { + if (!supertonic_use_cpu_custom_ops()) { + if (ggml_tensor * w_pre = try_pretransposed_weight(model, w)) { + const int IC = (int) w_pre->ne[0]; + const int OC = (int) w_pre->ne[1]; + + // ggml_im2col only reads the kernel's SHAPE (ne[0..3]); it never + // touches the kernel data — the output buffer holds the + // rearranged activation. So for the SHAPE we can use: + // - a reshape of w_pre when w_pre is f32 (cheap, just metadata) + // - a phantom f32 tensor of shape [1, IC, OC] created in the graph context + // when w_pre is quantized (because reshape_3d(q8_0, 1, IC, OC) + // would set ne[0]=1 < q8_0's 32-element block size and break + // the type's invariants). The phantom is never read. NOTE(review): it is still a graph leaf with IC*OC elements — confirm the allocator does not reserve a full buffer for it. + ggml_tensor * shape_kernel; + if (w_pre->type == GGML_TYPE_F32) { + shape_kernel = ggml_reshape_3d(ctx, w_pre, 1, IC, OC); + } else { + shape_kernel = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, IC, OC); + // No data needs binding — im2col only consults ne[0..3]. + } + + ggml_tensor * im2col = ggml_im2col(ctx, shape_kernel, x, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F32); + // im2col has ne=[IC, T, 1, 1]. Reshape to 2D for mul_mat. + ggml_tensor * im2col_2d = ggml_reshape_2d(ctx, im2col, + im2col->ne[0], im2col->ne[2] * im2col->ne[1]); + // Swapped order: w_pre first (src0 = the quantized/f32 weight), + // im2col second (src1 = f32 activation). Result is [M=OC, N=T].
+ // For w_pre=q8_0 this dispatches kernel_mul_mm_q8_0_f32 — the + // bandwidth-optimised quantized matmul kernel — which is the + // A3 step 2 unlock. + ggml_tensor * w_2d = ggml_reshape_2d(ctx, w_pre, IC, OC); + ggml_tensor * y = ggml_mul_mat(ctx, w_2d, im2col_2d); + // y has ne=[OC, T] — already the wt layout. + if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y)); + return y; + } + } + // Fallback: legacy [T, OC] matmul + explicit cont(transpose) to + // produce [OC, T] for the caller. CPU also lands here (and gets + // the cblas fast path for free via dense_matmul_time_ggml). + ggml_tensor * y_tc = dense_matmul_time_ggml(ctx, x, w, b); + return ggml_cont(ctx, ggml_transpose(ctx, y_tc)); +} + ggml_tensor * bias_gelu_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * b) { + const bool use_cpu_custom = supertonic_use_cpu_custom_ops(); + // Fused-op fast path (any backend that registers + // GGML_OP_SUPERTONIC_BIAS_GELU — Metal does via the local ggml port + // overlay; CPU's ggml_compute_forward_supertonic_bias_gelu is the + // parity backstop). Replaces the add(bias) + gelu_erf chain + // (2 dispatches on Metal) with one dispatch. Override by setting + // SUPERTONIC_DISABLE_FUSED_BIAS_GELU (any value — only presence is checked, read once) to force the stock-op chain. + // Skipped on CPU custom-op backends (cblas path below is faster). + static const bool disable_fused_bias_gelu = + std::getenv("SUPERTONIC_DISABLE_FUSED_BIAS_GELU") != nullptr; + if (!use_cpu_custom && !disable_fused_bias_gelu && + x->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + b->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(b)) { + return ggml_supertonic_bias_gelu(ctx, x, b); + } // CPU-only fused bias + GELU; falls back to gelu(add(x, b)) on GPU.
- if (supertonic_use_cpu_custom_ops() && + if (use_cpu_custom && x->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && x->ne[2] == 1 && x->ne[3] == 1) { auto op = [](ggml_tensor * dst, int ith, int nth, void *) { const ggml_tensor * src = dst->src[0]; @@ -507,9 +705,29 @@ ggml_tensor * pw2_residual_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * b, ggml_tensor * gamma) { + const bool use_cpu_custom = supertonic_use_cpu_custom_ops(); + // Fused-op fast path (any backend that registers + // GGML_OP_SUPERTONIC_PW2_RESIDUAL — Metal does via the local ggml port + // overlay; CPU's ggml_compute_forward_supertonic_pw2_residual is the + // parity backstop). Replaces the add(bias) + mul(gamma) + add(residual) + // chain with one dispatch. Override with + // SUPERTONIC_DISABLE_FUSED_PW2_RESIDUAL=1 to force the stock-op chain. + // Skipped on CPU custom-op backends (cblas fast path below is faster). + static const bool disable_fused_pw2_residual = + std::getenv("SUPERTONIC_DISABLE_FUSED_PW2_RESIDUAL") != nullptr; + if (!use_cpu_custom && !disable_fused_pw2_residual && + residual->type == GGML_TYPE_F32 && x->type == GGML_TYPE_F32 && + b->type == GGML_TYPE_F32 && gamma->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + residual->ne[0] == x->ne[0] && residual->ne[1] == x->ne[1] && + b->ne[0] == x->ne[1] && gamma->ne[0] == x->ne[1] && + ggml_is_contiguous(residual) && ggml_is_contiguous(x) && + ggml_is_contiguous(b) && ggml_is_contiguous(gamma)) { + return ggml_supertonic_pw2_residual(ctx, residual, x, b, gamma); + } // CPU-only fused (bias + gamma + residual); falls back to the // 3-step add/mul/add chain on GPU. 
- if (supertonic_use_cpu_custom_ops() && + if (use_cpu_custom && residual->type == GGML_TYPE_F32 && x->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && gamma->type == GGML_TYPE_F32 && x->ne[2] == 1 && x->ne[3] == 1) { @@ -568,6 +786,109 @@ ggml_tensor * vector_convnext_ggml(ggml_context * ctx, require_source_tensor(model, p + ".gamma")); } +// Phase B2 full: [C, T]-layout pointwise (K=1) Conv1d as a direct matmul. +// +// pwconv1/pwconv2 weights load as Conv1d kernels with ne=[K=1, IC, OC, 1]. +// With activations already in [C, T] layout (IC inner-most), the K=1 +// dimension is degenerate and the convolution is just: +// +// y[OC, T] = sum_IC w[IC, OC] * x[IC, T] +// +// which is exactly `ggml_mul_mat(w_2d=[IC, OC], x_2d=[IC, T])` — no +// im2col, no transpose, no pretranspose-cache lookup needed. Result is +// f32 contiguous and directly consumable by the next [C, T] op. +// +// CPU is intentionally NOT routed here: AMX cblas_sgemm in the legacy +// path is faster than the equivalent ggml_mul_mat dispatch on Apple +// CPUs. Caller's `vector_convnext_ggml_ct` already roundtrips on CPU. `b` is optional: pass nullptr to skip the bias add. Asserts K == 1, matching IC and a contiguous `w`. +ggml_tensor * pointwise_matmul_ct(ggml_context * ctx, + ggml_tensor * x_ct, // [IC, T, 1, 1] + ggml_tensor * w, // [1, IC, OC, 1] (Conv1d K=1) + ggml_tensor * b) { + GGML_ASSERT(w->ne[0] == 1); // K=1 + GGML_ASSERT(w->ne[1] == x_ct->ne[0]); // IC match + GGML_ASSERT(ggml_is_contiguous(w)); + ggml_tensor * w_2d = ggml_reshape_2d(ctx, w, w->ne[1], w->ne[2]); + ggml_tensor * x_2d = ggml_reshape_2d(ctx, x_ct, x_ct->ne[0], x_ct->ne[1]); + ggml_tensor * y = ggml_mul_mat(ctx, w_2d, x_2d); // [OC, T] + if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y)); + return y; +} + +// Phase B2 full: ConvNeXt block operating on `[C, T]` activations end-to-end. +// All five fused custom Metal kernels have layout-flag plumbing landed in +// port-version 13; this block strings their `_ct` variants together so the +// activation tensor never needs to flip layout mid-block.
Used by callers +// that fuse a chain of N convnext blocks with a single entry permute +// `[T, C] -> [C, T]` before the loop and a single exit permute after — net +// savings = (N - 1) intra-block transposes per chain × 5 CFM steps. +// +// Input x: [C, T, 1, 1] f32 contiguous +// Output : [C, T, 1, 1] f32 contiguous +// +// CPU backends fall through to the legacy `[T, C]` path: the `_ct` ops have +// CPU forward implementations but they would force AMX-cblas off, so on +// CPU we permute in/out around the legacy block to keep AMX engaged. +ggml_tensor * vector_convnext_ggml_ct(ggml_context * ctx, + const supertonic_model & model, + const std::string & p, + ggml_tensor * x_ct, + int dilation) { + if (model_prefers_cpu_kernels(model)) { + // CPU: roundtrip to [T, C], run legacy block (AMX cblas fast path), + // roundtrip back. Cheap on CPU because the permute is just a copy. NOTE(review): this gate is model_prefers_cpu_kernels(model) while the sibling helpers test supertonic_use_cpu_custom_ops() — confirm the two always agree. + ggml_tensor * x_tc = ggml_cont(ctx, ggml_permute(ctx, x_ct, 1, 0, 2, 3)); + ggml_tensor * y_tc = vector_convnext_ggml(ctx, model, p, x_tc, dilation); + return ggml_cont(ctx, ggml_permute(ctx, y_tc, 1, 0, 2, 3)); + } + + // Helper: flatten leading-1 dims so per-channel tensors come out as [C]. + // Supertonic GGUFs ship bias/gamma/norm parameters as [C, 1, 1, 1] or + // [1, C, 1, 1] depending on which PyTorch broadcast view they were + // exported from. The `_ct` ctors all assert `param->ne[0] == C_dim`, so + // unflattened tensors break them. This is the same shape mismatch that + // has been silently disabling the legacy `pw2_residual_ggml` fused path + // for ConvNeXt blocks all along. + auto flatten_1d = [&](ggml_tensor * t) -> ggml_tensor * { + const int64_t n = ggml_nelements(t); + // Skip reshape only when already a literal 1-d view with ne[0] == n. + // The shape is tested explicitly instead of via `ggml_n_dims`. + // NOTE(review): an earlier note claimed ggml_n_dims returns 1 for a [1, C, 1, 1] tensor; upstream ggml appears to return 2 there — confirm before relying on either.
+ if (t->ne[0] == n && t->ne[1] == 1 && t->ne[2] == 1 && t->ne[3] == 1) { + return t; + } + return ggml_reshape_1d(ctx, t, n); + }; + + ggml_tensor * residual = x_ct; + // depthwise_1d_ct: [C, T] -> [C, T] + ggml_tensor * y = ggml_supertonic_depthwise_1d_ct(ctx, x_ct, + require_source_tensor(model, p + ".dwconv.weight"), + flatten_1d(require_source_tensor(model, p + ".dwconv.bias")), + dilation); + // layer_norm_channel_ct: [C, T] -> [C, T] + y = ggml_supertonic_layer_norm_channel_ct(ctx, y, + flatten_1d(require_source_tensor(model, p + ".norm.norm.weight")), + flatten_1d(require_source_tensor(model, p + ".norm.norm.bias")), + 1e-6f); + // pw1 matmul: [IC=C, T] -> [OC, T] (no bias here; the pwconv1 bias is passed to bias_gelu_ct below) + y = pointwise_matmul_ct(ctx, y, + require_source_tensor(model, p + ".pwconv1.weight"), + nullptr); + // bias_gelu_ct: [OC, T] -> [OC, T] + y = ggml_supertonic_bias_gelu_ct(ctx, y, + flatten_1d(require_source_tensor(model, p + ".pwconv1.bias"))); + // pw2 matmul: [IC=OC, T] -> [C, T] (restores channel count; the pwconv2 bias is passed to pw2_residual_ct below) + y = pointwise_matmul_ct(ctx, y, + require_source_tensor(model, p + ".pwconv2.weight"), + nullptr); + // pw2_residual_ct: (x[C, T] + bias[C]) * gamma[C] + residual[C, T] -> [C, T]. NOTE(review): arguments are passed as (x, bias, gamma, residual) here, whereas the [T, C] op is called as (residual, x, b, gamma) — confirm against the _ct ctor. + return ggml_supertonic_pw2_residual_ct(ctx, y, + flatten_1d(require_source_tensor(model, p + ".pwconv2.bias")), + flatten_1d(require_source_tensor(model, p + ".gamma")), + residual); +} + std::vector tensor_to_time_channel(ggml_tensor * t) { const int L = (int) t->ne[0]; const int C = (int) t->ne[1]; @@ -730,7 +1051,7 @@ void build_text_attention_cache(vector_text_attention_cache & cache, ggml_set_name(ctx_tc, "vector_attn_ctx"); ggml_set_output(ctx_tc); ggml_build_forward_expand(cache.gf, ctx_tc); - ggml_tensor * out = dense_matmul_time_ggml(cache.ctx, ctx_tc, + ggml_tensor * out = dense_matmul_time_pretransposed_ggml(cache.ctx, model, ctx_tc, require_source_tensor(model, out_w_source), require_source_tensor(model, out_b_source)); ggml_set_name(out, "vector_attn_out"); ggml_set_output(out); @@ -994,18 +1315,20
@@ void build_group_graph_cache(vector_group_graph_cache & cache, } // F6: pre-transposed companion lives in model.ctx_w under // `__T` (populated at load). Falls back to the - // in-graph `ggml_cont(ggml_transpose(W))` rewrite if the - // pre-transpose roster didn't cover this weight (e.g. when - // running against a model whose `matmul_source` shape doesn't - // match the audit's [512, 64] expectation; see the defensive - // check in supertonic_gguf.cpp's F6 hook). + // per-pointer `pretransposed_weights` map (Metal's broader Q/K/V + // pretranspose roster), and finally to an in-graph + // `ggml_cont(ggml_transpose(W))` rewrite if neither covers this + // weight. ggml_tensor * t_proj; { auto pretrans_it = model.source_tensors.find(matmul_source + "__T"); ggml_tensor * w_t = (pretrans_it != model.source_tensors.end()) ? pretrans_it->second : nullptr; if (!w_t) { - w_t = ggml_cont(cache.ctx, ggml_transpose(cache.ctx, - require_source_tensor(model, matmul_source))); + ggml_tensor * t_proj_w_orig = require_source_tensor(model, matmul_source); + w_t = try_pretransposed_weight(model, t_proj_w_orig); + if (!w_t) { + w_t = ggml_cont(cache.ctx, ggml_transpose(cache.ctx, t_proj_w_orig)); + } } t_proj = ggml_mul_mat(cache.ctx, w_t, ggml_reshape_2d(cache.ctx, cache.temb_in, 64, 1)); @@ -1029,13 +1352,13 @@ void build_group_graph_cache(vector_group_graph_cache & cache, ggml_build_forward_expand(cache.gf, cur); const std::string attn_prefix = vector_main_block(post_block + 1) + ".attn."; - ggml_tensor * q = dense_matmul_time_ggml(cache.ctx, cur, + ggml_tensor * q = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cur, require_source_tensor(model, q_matmul_source), require_source_tensor(model, attn_prefix + "W_query.linear.bias")); - ggml_tensor * k = dense_matmul_time_ggml(cache.ctx, cache.text_in, + ggml_tensor * k = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.text_in, require_source_tensor(model, k_matmul_source), require_source_tensor(model, 
attn_prefix + "W_key.linear.bias")); - ggml_tensor * v = dense_matmul_time_ggml(cache.ctx, cache.text_in, + ggml_tensor * v = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.text_in, require_source_tensor(model, v_matmul_source), require_source_tensor(model, attn_prefix + "W_value.linear.bias")); ggml_set_name(q, q_name.c_str()); ggml_set_output(q); ggml_build_forward_expand(cache.gf, q); @@ -1361,14 +1684,14 @@ void build_res_style_qkv_cache(vector_res_style_qkv_cache & cache, ggml_build_forward_expand(cache.gf, post); const std::string style_prefix = vector_main_block(style_block) + ".attention."; - ggml_tensor * sq = dense_matmul_time_ggml(cache.ctx, post, + ggml_tensor * sq = dense_matmul_time_pretransposed_ggml(cache.ctx, model, post, require_source_tensor(model, q_matmul_source), require_source_tensor(model, style_prefix + "W_query.linear.bias")); - ggml_tensor * sk = dense_matmul_time_ggml(cache.ctx, cache.kctx_in, + ggml_tensor * sk = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.kctx_in, require_source_tensor(model, k_matmul_source), require_source_tensor(model, style_prefix + "W_key.linear.bias")); sk = ggml_tanh(cache.ctx, sk); - ggml_tensor * sv = dense_matmul_time_ggml(cache.ctx, cache.style_v_in, + ggml_tensor * sv = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.style_v_in, require_source_tensor(model, v_matmul_source), require_source_tensor(model, style_prefix + "W_value.linear.bias")); ggml_set_name(sq, q_name.c_str()); ggml_set_output(sq); ggml_build_forward_expand(cache.gf, sq); @@ -3150,6 +3473,912 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model, } } +// Apply Supertonic's non-standard RoPE in-graph. +// Supertonic uses angle = (t/L) * theta[d_half], where theta is loaded from +// the GGUF and L is the per-call sequence length. ggml_rope_ext's formula +// expands to angle = (pos / freq_factors[d/2]) * freq_scale * freq_base^(-d/n_dims). 
+// Setting freq_base=1, freq_scale=1, freq_factors[d_half] = L / theta[d_half], +// positions = [0..L) reproduces the Supertonic formula exactly. NEOX mode +// matches apply_rope's split-pairs layout (x[d] rotates with x[d+D/2]) at +// supertonic_vector_estimator.cpp:1416. +// +// x_tc must be a contiguous 2D tensor of shape ne=[H*D, q_len] (width-major). +// `positions` is int32 [q_len], `freq_factors` is f32 [D/2]; both are caller- +// owned input tensors set via ggml_backend_tensor_set before compute. +ggml_tensor * apply_supertonic_rope_ggml(ggml_context * ctx, + ggml_tensor * x_tc, + ggml_tensor * positions, + ggml_tensor * freq_factors, + int q_len, + int H, + int D) { + GGML_ASSERT(x_tc->ne[0] == (int64_t)(H*D)); + GGML_ASSERT(x_tc->ne[1] == (int64_t)q_len); + const size_t row_bytes = (size_t)(H*D) * sizeof(float); + const size_t head_bytes = (size_t)D * sizeof(float); + // View [H*D, q_len] as [D, H, q_len] so rope's outer dim is time. + // Strides: nb1 = head step (D floats), nb2 = time step (H*D floats). + // This view is naturally contiguous (nb[0]=elem_size, nb[1]=D*elem_size, + // nb[2]=H*D*elem_size = ne[0]*ne[1]*elem_size) so we can skip the + // ggml_cont copy that earlier versions inserted defensively. + ggml_tensor * x_view = ggml_view_3d(ctx, x_tc, D, H, q_len, + head_bytes, row_bytes, 0); + ggml_tensor * roped = ggml_rope_ext(ctx, x_view, positions, freq_factors, + D, GGML_ROPE_TYPE_NEOX, 0, + /*freq_base=*/1.0f, + /*freq_scale=*/1.0f, + /*ext_factor=*/0.0f, + /*attn_factor=*/1.0f, + /*beta_fast=*/0.0f, + /*beta_slow=*/0.0f); + return ggml_reshape_2d(ctx, roped, (int64_t) H * D, q_len); +} + +// Append a text-attention subgraph (Q, K, V flash-attention + out projection + +// bias add) to the parent (ctx, gf). Mirrors build_text_attention_cache but +// composes into the caller's context instead of owning one. 
+// +// Inputs: +// q_tc, k_tc, v_tc: contiguous [H*D, *_len] tensors +// out_w_tensor: model tensor for the out projection weight +// out_b_tensor: model tensor for the out projection bias +// Returns: out_tc tensor of shape [out_dim, q_len]. +ggml_tensor * append_text_attention_subgraph(ggml_context * ctx, + const supertonic_model & model, + ggml_tensor * q_tc, + ggml_tensor * k_tc, + ggml_tensor * v_tc, + int q_len, int kv_len, + int n_heads, int head_dim, + ggml_tensor * out_w_tensor, + ggml_tensor * out_b_tensor, + float scale) { + const int width = n_heads * head_dim; + const size_t time_stride = (size_t)width * sizeof(float); + const size_t head_stride = (size_t)head_dim * sizeof(float); + ggml_tensor * q_in = ggml_view_3d(ctx, q_tc, + head_dim, q_len, n_heads, time_stride, head_stride, 0); + ggml_tensor * k_in = ggml_view_3d(ctx, k_tc, + head_dim, kv_len, n_heads, time_stride, head_stride, 0); + ggml_tensor * v_in = ggml_view_3d(ctx, v_tc, + head_dim, kv_len, n_heads, time_stride, head_stride, 0); + ggml_tensor * attn = ggml_flash_attn_ext(ctx, q_in, k_in, v_in, + nullptr, scale, 0.0f, 0.0f); + attn = ggml_reshape_2d(ctx, attn, (int64_t) n_heads * head_dim, q_len); + ggml_tensor * ctx_tc = ggml_cont(ctx, ggml_transpose(ctx, attn)); + return dense_matmul_time_pretransposed_ggml(ctx, model, ctx_tc, out_w_tensor, out_b_tensor); +} + +// Per-group MatMul tensor name suffixes (groups 0..3). See per-group source +// names in trace_proj_ggml; these tables centralise them for the consolidated +// path. 
+struct vector_step_group_names { + int t_linear; // time-linear (matmul for time embedding projection) + int attn_q; + int attn_k; + int attn_v; + int attn_out; + int style_q; + int style_k; + int style_v; + int style_out; +}; + +static const vector_step_group_names kGroupNames[4] = { + {3095, 3101, 3102, 3103, 3110, 3116, 3117, 3118, 3119}, + {3140, 3146, 3147, 3148, 3155, 3161, 3162, 3163, 3164}, + {3185, 3191, 3192, 3193, 3200, 3206, 3207, 3208, 3209}, + {3230, 3236, 3237, 3238, 3245, 3251, 3252, 3253, 3254}, +}; + +static std::string matmul_name(int suffix) { + return "vector_estimator:onnx::MatMul_" + std::to_string(suffix); +} + +// Bundle of input tensors a single CFM step subgraph needs. Used both by +// the per-step cache (one step per ggml_cgraph) and by the +// 5-steps-unrolled-into-one-graph cache (Phase A1+A2). +// +// `x_in` / `noise_in` vary per step (x_in = latent for this step, +// noise_in is the "residual" we add the velocity to — for Supertonic's +// CFM equation `next = noise_in + velocity * (1 / total_steps)` they +// happen to be the same tensor for a single step but become DIFFERENT +// tensors when steps are chained: step N's x_in is step N-1's output, +// while noise_in is still the original noisy latent that step. In the +// per-step path we bind them to the same external buffer; in the +// unrolled-loop path we wire them as graph edges between steps). +// +// `t_emb_in` varies per step (one time embedding per CFM step index). +// All other inputs are constant across the 5 CFM steps and bind to a +// single shared input tensor regardless of which path is used. 
+struct vector_step_inputs { + ggml_tensor * x_in = nullptr; // ne=[L, Cin] f32 + ggml_tensor * mask_in = nullptr; // ne=[L] f32 + ggml_tensor * t_emb_in = nullptr; // ne=[64] f32 (per-step) + ggml_tensor * text_in = nullptr; // ne=[text_len, 256] f32 + ggml_tensor * style_v_raw_in = nullptr; // ne=[50, 256] f32 + ggml_tensor * style_kctx_in = nullptr; // ne=[50, 256] f32 + ggml_tensor * noise_in = nullptr; // ne=[L, Cin] f32 (per-step) + ggml_tensor * pos_q = nullptr; // ne=[L] i32 + ggml_tensor * pos_k = nullptr; // ne=[text_len] i32 + ggml_tensor * freq_factors_q = nullptr; // ne=[D/2] f32 + ggml_tensor * freq_factors_k = nullptr; // ne=[D/2] f32 +}; + +// Append one CFM step's subgraph (proj_in → 4 groups → tail → proj_out +// → velocity → next = noise + velocity / total_steps) to `gf`. All +// inputs are pre-bound by the caller; this function only builds the +// dataflow and returns the `next` tensor (ne=[L, Cin]) so the caller +// can either set it as a graph output or feed it as the next step's +// `x_in`. The function does NOT call `ggml_set_output` / +// `ggml_build_forward_expand` on the result — that's the caller's +// decision. +// +// `L`, `text_len` and `total_steps` are passed explicitly because they're +// used in several places. CPU vs GPU dispatch lives on the thread-local +// `supertonic_use_cpu_custom_ops()` flag set by the outer +// `supertonic_op_dispatch_scope` at the public entry point. +ggml_tensor * append_supertonic_vector_step_subgraph( + ggml_context * gctx, + ggml_cgraph * gf, + const supertonic_model & model, + const vector_step_inputs & inputs, + int L, + int text_len, + int total_steps); + +// Consolidated per-step cache: one ctx, one cgraph, one gallocr for the entire +// per-step computation. Replaces the ~17 sub-graph dispatches the trace_proj +// orchestrator emits with a single ggml_backend_graph_compute call. 
+struct vector_step_one_graph_cache { + const supertonic_model * model = nullptr; + uint64_t generation_id = 0; + int L = 0; + int text_len = 0; + int total_steps = 0; + + std::vector buf; + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t allocr = nullptr; + + // Per-call inputs + ggml_tensor * x_in = nullptr; // noisy_latent (L, Cin) ggml-shape: ne=[L, Cin] + ggml_tensor * mask_in = nullptr; // [L] + ggml_tensor * t_emb_in = nullptr; // [64] + ggml_tensor * text_in = nullptr; // [text_len, 256] + ggml_tensor * style_v_raw_in = nullptr; // [50, 256] (style_ttl repacked) + ggml_tensor * style_kctx_in = nullptr; // [50, 256] (model's /Expand_output_0) + ggml_tensor * noise_in = nullptr; // (L, Cin) (same data as x_in but indep slot for tail) + + // Per-build (rope) inputs + ggml_tensor * pos_q = nullptr; // int32 [L] + ggml_tensor * pos_k = nullptr; // int32 [text_len] + ggml_tensor * freq_factors_q = nullptr; // f32 [32] (head_dim/2) + ggml_tensor * freq_factors_k = nullptr; // f32 [32] + + // Output + ggml_tensor * next_latent_out = nullptr; // ne=[L, Cin] in (t, c) order +}; + +void free_vector_step_one_graph_cache(vector_step_one_graph_cache & cache) { + if (cache.allocr) { + supertonic_safe_gallocr_free(cache.allocr, cache.model ? 
cache.model->generation_id : 0); + cache.allocr = nullptr; + } + if (cache.ctx) { + ggml_free(cache.ctx); + cache.ctx = nullptr; + } + cache.gf = nullptr; + cache.buf.clear(); + cache.model = nullptr; + cache.generation_id = 0; + cache.L = 0; + cache.text_len = 0; + cache.total_steps = 0; + cache.x_in = cache.mask_in = cache.t_emb_in = cache.text_in = nullptr; + cache.style_v_raw_in = cache.style_kctx_in = cache.noise_in = nullptr; + cache.pos_q = cache.pos_k = cache.freq_factors_q = cache.freq_factors_k = nullptr; + cache.next_latent_out = nullptr; +} + +ggml_tensor * append_supertonic_vector_step_subgraph( + ggml_context * gctx, + ggml_cgraph * gf, + const supertonic_model & model, + const vector_step_inputs & inputs, + int L, + int text_len, + int total_steps) { + const bool use_cpu_custom = supertonic_use_cpu_custom_ops(); + // Shape constants that aren't dependent on L / text_len. Mirror the + // values from supertonic_vector_step_one_graph_ggml. + const int C = 512; + const int H = 4; // text-attention heads + const int D = 64; // text-attention head_dim + const int SH = 2; // style-attention heads + const int SD = 128; // style-attention head_dim + const int kv_style = 50; // fixed by /Expand_output_0 + (void)H; (void)D; (void)SH; (void)SD; (void)kv_style; + + // ===== PHASE 0: proj_in + mask ===== + ggml_tensor * cur = conv1d_f32(gctx, + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.proj_in.net.weight"), + inputs.x_in, 1, 0, 1); + cur = ggml_mul(gctx, cur, repeat_like(gctx, inputs.mask_in, cur)); + + // ===== PHASE 1: Group 0 prologue — ConvNeXt × 4 on main_blocks.0 + time_add (1) + ConvNeXt (2) ===== + int dils[4] = {1, 2, 4, 8}; + // Phase B2 full: permute to [C, T] once before the 4-block chain, run + // the chain in [C, T] (which lets each block's two pointwise convs + // become a direct ggml_mul_mat with no im2col), permute back to + // [T, C] for the downstream time-add. 
Saves 2 im2col dispatches per + // block × 4 blocks × 5 steps − 2 permutes per chain × 5 steps = + // 30 dispatches eliminated per synth. Override: + // SUPERTONIC_DISABLE_CT_CONVNEXT=1. + static const bool disable_ct_convnext = + std::getenv("SUPERTONIC_DISABLE_CT_CONVNEXT") != nullptr; + const bool use_ct_convnext = !disable_ct_convnext && !use_cpu_custom; + if (use_ct_convnext) { + ggml_tensor * cur_ct = ggml_cont(gctx, ggml_permute(gctx, cur, 1, 0, 2, 3)); + for (int j = 0; j < 4; ++j) { + cur_ct = vector_convnext_ggml_ct(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks.0.convnext." + std::to_string(j), + cur_ct, dils[j]); + } + cur = ggml_cont(gctx, ggml_permute(gctx, cur_ct, 1, 0, 2, 3)); + } else { + for (int j = 0; j < 4; ++j) { + cur = vector_convnext_ggml(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks.0.convnext." + std::to_string(j), + cur, dils[j]); + } + } + // Time-add for group 0. + { + ggml_tensor * w = require_source_tensor(model, matmul_name(kGroupNames[0].t_linear)); + ggml_tensor * b = require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.1.linear.linear.bias"); + ggml_tensor * w_t = try_pretransposed_weight(model, w); + if (!w_t) w_t = ggml_cont(gctx, ggml_transpose(gctx, w)); + ggml_tensor * t_proj = ggml_mul_mat(gctx, w_t, ggml_reshape_2d(gctx, inputs.t_emb_in, 64, 1)); + t_proj = ggml_add(gctx, t_proj, ggml_reshape_2d(gctx, b, C, 1)); + cur = ggml_add(gctx, cur, repeat_like(gctx, t_proj, cur)); + } + cur = vector_convnext_ggml(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks.2.convnext.0", + cur, 1); + ggml_tensor * block_pre_attn = cur; + + // Per-group attention block. 
+ auto run_group = [&](ggml_tensor * x, int group, ggml_tensor * x_pre_attn) -> ggml_tensor * { + const auto & names = kGroupNames[group]; + const int attn_block = group * 6 + 3; + const int post_attn_block = group * 6 + 4; + const int style_block = group * 6 + 5; + + // Text attention QKV — output directly in [A, T] (width-major) + // layout so the cont(transpose) before rope/flash_attn is gone. + // The kernel-as-src0 ordering also dispatches the optimized + // kernel_mul_mm_q8_0_f32 when weights are q8_0. + ggml_tensor * q_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, x_pre_attn, + require_source_tensor(model, matmul_name(names.attn_q)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(attn_block) + ".attn.W_query.linear.bias")); + ggml_tensor * k_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.text_in, + require_source_tensor(model, matmul_name(names.attn_k)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(attn_block) + ".attn.W_key.linear.bias")); + ggml_tensor * v_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.text_in, + require_source_tensor(model, matmul_name(names.attn_v)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(attn_block) + ".attn.W_value.linear.bias")); + + q_wt = apply_supertonic_rope_ggml(gctx, q_wt, inputs.pos_q, inputs.freq_factors_q, L, H, D); + k_wt = apply_supertonic_rope_ggml(gctx, k_wt, inputs.pos_k, inputs.freq_factors_k, text_len, H, D); + + ggml_tensor * attn_out = append_text_attention_subgraph(gctx, model, + q_wt, k_wt, v_wt, L, text_len, H, D, + require_source_tensor(model, matmul_name(names.attn_out)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." 
+ + std::to_string(attn_block) + ".attn.out_fc.linear.bias"), + 1.0f / 16.0f); + + ggml_tensor * residual = ggml_add(gctx, x_pre_attn, attn_out); + ggml_tensor * normed = layer_norm_ggml(gctx, residual, + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(attn_block) + ".norm.norm.weight"), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(attn_block) + ".norm.norm.bias")); + + ggml_tensor * post = vector_convnext_ggml(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(post_attn_block) + ".convnext.0", + normed, 1); + + ggml_tensor * masked_post = ggml_mul(gctx, post, repeat_like(gctx, inputs.mask_in, post)); + + // Style attention QKV — output directly in [A, T] layout. + ggml_tensor * sq_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, masked_post, + require_source_tensor(model, matmul_name(names.style_q)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(style_block) + ".attention.W_query.linear.bias")); + ggml_tensor * sk_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.style_kctx_in, + require_source_tensor(model, matmul_name(names.style_k)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(style_block) + ".attention.W_key.linear.bias")); + sk_wt = ggml_tanh(gctx, sk_wt); + ggml_tensor * sv_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.style_v_raw_in, + require_source_tensor(model, matmul_name(names.style_v)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." 
+ + std::to_string(style_block) + ".attention.W_value.linear.bias")); + + ggml_tensor * style_out = append_text_attention_subgraph(gctx, model, + sq_wt, sk_wt, sv_wt, L, kv_style, SH, SD, + require_source_tensor(model, matmul_name(names.style_out)), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(style_block) + ".attention.out_fc.linear.bias"), + 1.0f / 16.0f); + + ggml_tensor * style_residual = ggml_add(gctx, post, style_out); + ggml_tensor * style_normed = layer_norm_ggml(gctx, style_residual, + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(style_block) + ".norm.norm.weight"), + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(style_block) + ".norm.norm.bias")); + (void)x; + return style_normed; + }; + + // Group prep for groups 1-3. + auto group_prep = [&](ggml_tensor * x, int group) -> ggml_tensor * { + const int conv_block = group * 6 + 0; + const int linear_block = group * 6 + 1; + const int post_block = group * 6 + 2; + int dils2[4] = {1, 2, 4, 8}; + ggml_tensor * y = x; + if (use_ct_convnext) { + ggml_tensor * y_ct = ggml_cont(gctx, ggml_permute(gctx, y, 1, 0, 2, 3)); + for (int j = 0; j < 4; ++j) { + y_ct = vector_convnext_ggml_ct(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(conv_block) + ".convnext." + std::to_string(j), + y_ct, dils2[j]); + } + y = ggml_cont(gctx, ggml_permute(gctx, y_ct, 1, 0, 2, 3)); + } else { + for (int j = 0; j < 4; ++j) { + y = vector_convnext_ggml(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(conv_block) + ".convnext." + std::to_string(j), + y, dils2[j]); + } + } + ggml_tensor * w = require_source_tensor(model, matmul_name(kGroupNames[group].t_linear)); + ggml_tensor * b = require_source_tensor(model, + "vector_estimator:tts.ttl.vector_field.main_blocks." 
+ + std::to_string(linear_block) + ".linear.linear.bias"); + ggml_tensor * w_t = try_pretransposed_weight(model, w); + if (!w_t) w_t = ggml_cont(gctx, ggml_transpose(gctx, w)); + ggml_tensor * t_proj = ggml_mul_mat(gctx, w_t, ggml_reshape_2d(gctx, inputs.t_emb_in, 64, 1)); + t_proj = ggml_add(gctx, t_proj, ggml_reshape_2d(gctx, b, C, 1)); + y = ggml_add(gctx, y, repeat_like(gctx, t_proj, y)); + y = vector_convnext_ggml(gctx, model, + "vector_estimator:tts.ttl.vector_field.main_blocks." + + std::to_string(post_block) + ".convnext.0", + y, 1); + return y; + }; + + ggml_tensor * x_after_g0 = run_group(cur, 0, block_pre_attn); + ggml_tensor * x_pre_g1 = group_prep(x_after_g0, 1); + ggml_tensor * x_after_g1 = run_group(x_after_g0, 1, x_pre_g1); + ggml_tensor * x_pre_g2 = group_prep(x_after_g1, 2); + ggml_tensor * x_after_g2 = run_group(x_after_g1, 2, x_pre_g2); + ggml_tensor * x_pre_g3 = group_prep(x_after_g2, 3); + ggml_tensor * x_after_g3 = run_group(x_after_g2, 3, x_pre_g3); + + // Tail: last_convnext × 4 + proj_out + mask + noise add. + ggml_tensor * tail = x_after_g3; + if (use_ct_convnext) { + ggml_tensor * tail_ct = ggml_cont(gctx, ggml_permute(gctx, tail, 1, 0, 2, 3)); + for (int j = 0; j < 4; ++j) { + tail_ct = vector_convnext_ggml_ct(gctx, model, + "vector_estimator:tts.ttl.vector_field.last_convnext.convnext." + std::to_string(j), + tail_ct, 1); + } + tail = ggml_cont(gctx, ggml_permute(gctx, tail_ct, 1, 0, 2, 3)); + } else { + for (int j = 0; j < 4; ++j) { + tail = vector_convnext_ggml(gctx, model, + "vector_estimator:tts.ttl.vector_field.last_convnext.convnext." 
+ std::to_string(j), + tail, 1); + } + } + ggml_tensor * velocity = conv1d_f32(gctx, + require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.proj_out.net.weight"), + tail, 1, 0, 1); + ggml_tensor * masked_velocity = ggml_mul(gctx, velocity, repeat_like(gctx, inputs.mask_in, velocity)); + ggml_tensor * scaled = ggml_scale(gctx, masked_velocity, 1.0f / (float)total_steps); + ggml_tensor * next = ggml_add(gctx, inputs.noise_in, scaled); + + // Mark gf as used so the unused-parameter warning doesn't fire — the + // graph build is via the tensors above which inherit gf via ctx. + (void)gf; + return next; +} + + +// Compute one CFM denoising step as ONE ggml graph. Used only when the +// model's backend isn't CPU (Metal / CUDA / Vulkan / OpenCL). Replaces the +// ~21 sub-graph dispatches the trace_proj orchestrator emits with a single +// ggml_backend_graph_compute call. +bool supertonic_vector_step_one_graph_ggml(const supertonic_model & model, + const float * noisy_latent, + int latent_len, + const float * text_emb, + int text_len, + const float * style_ttl, + const float * latent_mask, + int current_step, + int total_steps, + std::vector & next_latent_out, + std::string * error) { + // The outer entry point sets `supertonic_op_dispatch_scope`; this + // function is only called on non-CPU backends, so the thread-local + // `supertonic_use_cpu_custom_ops()` reads false inside the helpers. 
+ try { + const int L = latent_len; + const int Cin = model.hparams.latent_channels; // typically 16 + const int C = 512; + const int text_C = 256; + const int H = 4; // text-attention heads + const int D = 64; // text-attention head_dim + const int A = H * D; // 256 = attention width + const int SH = 2; // style-attention heads + const int SD = 128; // style-attention head_dim + const int kv_style = 50; // style attention kv length (fixed by /Expand_output_0) + + thread_local vector_step_one_graph_cache cache; + const bool need_rebuild = cache.model != &model || + cache.generation_id != model.generation_id || + cache.L != L || + cache.text_len != text_len || + cache.total_steps != total_steps; + if (need_rebuild) { + free_vector_step_one_graph_cache(cache); + cache.model = &model; + cache.generation_id = model.generation_id; + cache.L = L; + cache.text_len = text_len; + cache.total_steps = total_steps; + + // Memory budget for the consolidated graph. The original + // sub-graphs each used 128-512 nodes; the full per-step graph is + // roughly the sum (4 groups x ~700 ops/group + tail + front). + // Round up generously. 
+ constexpr int MAX_NODES = 8192; + const size_t buf_size = ggml_tensor_overhead() * MAX_NODES + + ggml_graph_overhead_custom(MAX_NODES, false); + cache.buf.assign(buf_size, 0); + ggml_init_params p = { buf_size, cache.buf.data(), true }; + cache.ctx = ggml_init(p); + cache.gf = ggml_new_graph_custom(cache.ctx, MAX_NODES, false); + + // --- Per-call inputs --- + cache.x_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, Cin); + ggml_set_name(cache.x_in, "step_x_in"); ggml_set_input(cache.x_in); + cache.mask_in = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, L); + ggml_set_name(cache.mask_in, "step_mask"); ggml_set_input(cache.mask_in); + cache.t_emb_in = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, 64); + ggml_set_name(cache.t_emb_in, "step_temb"); ggml_set_input(cache.t_emb_in); + cache.text_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, text_len, text_C); + ggml_set_name(cache.text_in, "step_text_in"); ggml_set_input(cache.text_in); + cache.style_v_raw_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C); + ggml_set_name(cache.style_v_raw_in, "step_style_v"); ggml_set_input(cache.style_v_raw_in); + cache.style_kctx_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C); + ggml_set_name(cache.style_kctx_in, "step_style_kctx"); ggml_set_input(cache.style_kctx_in); + cache.noise_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, Cin); + ggml_set_name(cache.noise_in, "step_noise_in"); ggml_set_input(cache.noise_in); + + // --- RoPE inputs --- + cache.pos_q = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, L); + ggml_set_name(cache.pos_q, "step_pos_q"); ggml_set_input(cache.pos_q); + cache.pos_k = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, text_len); + ggml_set_name(cache.pos_k, "step_pos_k"); ggml_set_input(cache.pos_k); + cache.freq_factors_q = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2); + ggml_set_name(cache.freq_factors_q, "step_ff_q"); ggml_set_input(cache.freq_factors_q); + cache.freq_factors_k = 
ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2); + ggml_set_name(cache.freq_factors_k, "step_ff_k"); ggml_set_input(cache.freq_factors_k); + + ggml_context * gctx = cache.ctx; + ggml_cgraph * gf = cache.gf; + + vector_step_inputs inputs; + inputs.x_in = cache.x_in; + inputs.mask_in = cache.mask_in; + inputs.t_emb_in = cache.t_emb_in; + inputs.text_in = cache.text_in; + inputs.style_v_raw_in = cache.style_v_raw_in; + inputs.style_kctx_in = cache.style_kctx_in; + inputs.noise_in = cache.noise_in; + inputs.pos_q = cache.pos_q; + inputs.pos_k = cache.pos_k; + inputs.freq_factors_q = cache.freq_factors_q; + inputs.freq_factors_k = cache.freq_factors_k; + + ggml_tensor * next = append_supertonic_vector_step_subgraph( + gctx, gf, model, inputs, L, text_len, total_steps); + + ggml_set_name(next, "step_next_latent"); + ggml_set_output(next); + ggml_build_forward_expand(gf, next); + cache.next_latent_out = next; + + + // Allocate. + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector step one-graph failed"); + if (!ggml_gallocr_reserve(cache.allocr, gf)) { + throw std::runtime_error("ggml_gallocr_reserve vector step one-graph failed"); + } + ggml_gallocr_alloc_graph(cache.allocr, gf); + } + + // ===== Per-call inputs ===== + // The existing trace_proj_ggml at lines 2143/2151 sets these tensors + // DIRECTLY from the caller-provided channel-major buffers (no host + // transpose), and the views downstream interpret memory accordingly. + // Copy that pattern exactly — my earlier transpose loops were a bug + // (correlation 0.003 vs CPU reference; root-caused 2026-05-11). 
+ ggml_backend_tensor_set(cache.x_in, noisy_latent, 0, (size_t)L * Cin * sizeof(float)); + ggml_backend_tensor_set(cache.noise_in, noisy_latent, 0, (size_t)L * Cin * sizeof(float)); + ggml_backend_tensor_set(cache.mask_in, latent_mask, 0, (size_t)L * sizeof(float)); + + std::vector te_host = time_embedding(model, current_step, total_steps); + ggml_backend_tensor_set(cache.t_emb_in, te_host.data(), 0, te_host.size() * sizeof(float)); + + // text_emb is in (C=256, text_len) channel-major; the tensor has + // ne=[text_len, 256] which puts t_len fast in memory. Same raw layout, + // so direct memcpy (matches trace_proj_ggml). + ggml_backend_tensor_set(cache.text_in, text_emb, 0, (size_t)text_len * 256 * sizeof(float)); + + // Style inputs (cached host buffers from existing helper). + const std::vector * style_v_raw_ptr = nullptr; + const std::vector * kctx_raw_ptr = nullptr; + cached_style_layouts(model, style_ttl, style_v_raw_ptr, kctx_raw_ptr); + ggml_backend_tensor_set(cache.style_v_raw_in, style_v_raw_ptr->data(), 0, style_v_raw_ptr->size() * sizeof(float)); + ggml_backend_tensor_set(cache.style_kctx_in, kctx_raw_ptr->data(), 0, kctx_raw_ptr->size() * sizeof(float)); + + // RoPE positions + freq_factors. theta is loaded from the model and + // depends on L (sequence length); recompute per call. 
+ { + std::vector pos_q_host(L); + for (int i = 0; i < L; ++i) pos_q_host[i] = i; + ggml_backend_tensor_set(cache.pos_q, pos_q_host.data(), 0, pos_q_host.size() * sizeof(int32_t)); + std::vector pos_k_host(text_len); + for (int i = 0; i < text_len; ++i) pos_k_host[i] = i; + ggml_backend_tensor_set(cache.pos_k, pos_k_host.data(), 0, pos_k_host.size() * sizeof(int32_t)); + + const int half = 32; // D/2 = 64/2 + f32_tensor theta = read_f32(model, "vector_estimator:tts.ttl.vector_field.main_blocks.3.attn.theta"); + if ((int)theta.data.size() < half) { + throw std::runtime_error("theta tensor has fewer than D/2 elements"); + } + std::vector ff_q(half), ff_k(half); + for (int d = 0; d < half; ++d) { + ff_q[d] = (float)L / theta.data[d]; + ff_k[d] = (float)text_len / theta.data[d]; + } + ggml_backend_tensor_set(cache.freq_factors_q, ff_q.data(), 0, ff_q.size() * sizeof(float)); + ggml_backend_tensor_set(cache.freq_factors_k, ff_k.data(), 0, ff_k.size() * sizeof(float)); + } + + // ===== ONE compute call ===== + supertonic_graph_compute(model, cache.gf); + + // ===== Read output ===== + // The output tensor has ne=[L, Cin] with element (i=t, j=c) at offset + // c*L+t — exactly the (c, t) channel-major layout the caller expects. + // Direct memcpy, no transpose. + next_latent_out.assign((size_t)Cin * L, 0.0f); + ggml_backend_tensor_get(cache.next_latent_out, next_latent_out.data(), 0, + (size_t)Cin * L * sizeof(float)); + if (error) error->clear(); + return true; + } catch (const std::exception & e) { + if (error) *error = e.what(); + return false; + } +} + +// ===================================================================== +// Phase A1+A2 — single-graph CFM loop +// ===================================================================== +// +// Unroll all `total_steps` CFM denoising steps into ONE ggml_cgraph and +// dispatch with a single ggml_backend_graph_compute call. 
Each step's +// `x_in` and `noise_in` is the previous step's output node (no host +// round-trip), and only `t_emb_in` differs per step (N inputs, one +// per CFM step). Replaces the engine's `for (step ...) { +// supertonic_vector_step_ggml(...) }` loop on non-CPU backends. +// +// CPU keeps the per-step path because its cblas fastpaths benefit from +// the cache-per-shape boundary and the host-side rope/style helpers in +// trace_proj_ggml expect to see per-step outputs. + +struct vector_loop_one_graph_cache { + const supertonic_model * model = nullptr; + uint64_t generation_id = 0; + int L = 0; + int text_len = 0; + int total_steps = 0; + + std::vector buf; + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t allocr = nullptr; + + // Shared inputs (constant across CFM steps). + ggml_tensor * x0_in = nullptr; // ne=[L, Cin] initial noisy latent + ggml_tensor * mask_in = nullptr; // ne=[L] + ggml_tensor * text_in = nullptr; // ne=[text_len, 256] + ggml_tensor * style_v_raw_in = nullptr; // ne=[50, 256] + ggml_tensor * style_kctx_in = nullptr; // ne=[50, 256] + + // RoPE inputs (constant across steps). + ggml_tensor * pos_q = nullptr; + ggml_tensor * pos_k = nullptr; + ggml_tensor * freq_factors_q = nullptr; + ggml_tensor * freq_factors_k = nullptr; + + // Per-step time embedding (one tensor per CFM step). + std::vector t_emb_in; + + // Final output — last step's `next` tensor. + ggml_tensor * final_latent_out = nullptr; +}; + +void free_vector_loop_one_graph_cache(vector_loop_one_graph_cache & cache) { + if (cache.allocr) { + supertonic_safe_gallocr_free(cache.allocr, cache.model ? 
cache.model->generation_id : 0); + cache.allocr = nullptr; + } + if (cache.ctx) { + ggml_free(cache.ctx); + cache.ctx = nullptr; + } + cache.gf = nullptr; + cache.buf.clear(); + cache.model = nullptr; + cache.generation_id = 0; + cache.L = 0; + cache.text_len = 0; + cache.total_steps = 0; + cache.x0_in = cache.mask_in = cache.text_in = nullptr; + cache.style_v_raw_in = cache.style_kctx_in = nullptr; + cache.pos_q = cache.pos_k = cache.freq_factors_q = cache.freq_factors_k = nullptr; + cache.t_emb_in.clear(); + cache.final_latent_out = nullptr; +} + +bool supertonic_vector_loop_one_graph_ggml(const supertonic_model & model, + const float * initial_noisy_latent, + int latent_len, + const float * text_emb, + int text_len, + const float * style_ttl, + const float * latent_mask, + int total_steps, + std::vector & final_latent_out, + std::string * error) { + // Public entry point — set the thread-local dispatch flag so the + // helpers' `supertonic_use_cpu_custom_ops()` reads consistently + // (false on non-CPU backends, true on CPU + accelerate/cblas). + supertonic_op_dispatch_scope dispatch(model); + try { + const int L = latent_len; + const int Cin = model.hparams.latent_channels; + const int text_C = 256; + const int D = 64; + const int kv_style = 50; + + thread_local vector_loop_one_graph_cache cache; + const bool need_rebuild = cache.model != &model || + cache.generation_id != model.generation_id || + cache.L != L || + cache.text_len != text_len || + cache.total_steps != total_steps; + if (need_rebuild) { + free_vector_loop_one_graph_cache(cache); + cache.model = &model; + cache.generation_id = model.generation_id; + cache.L = L; + cache.text_len = text_len; + cache.total_steps = total_steps; + + // ~5x the per-step node budget. Each per-step build registered ~1056 + // ggml nodes pre-Tier-2; post-Tier-2 it's ~928. Round up to 8192/step + // × total_steps = ~40k. Plus the shared inputs (a few dozen) + + // per-step temb input tensors. 
+ const int MAX_NODES = 8192 * std::max(1, total_steps) + 256; + const size_t buf_size = ggml_tensor_overhead() * (size_t) MAX_NODES + + ggml_graph_overhead_custom(MAX_NODES, false); + cache.buf.assign(buf_size, 0); + ggml_init_params p = { buf_size, cache.buf.data(), true }; + cache.ctx = ggml_init(p); + cache.gf = ggml_new_graph_custom(cache.ctx, MAX_NODES, false); + + // --- Shared inputs --- + cache.x0_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, Cin); + ggml_set_name(cache.x0_in, "loop_x0_in"); ggml_set_input(cache.x0_in); + cache.mask_in = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, L); + ggml_set_name(cache.mask_in, "loop_mask"); ggml_set_input(cache.mask_in); + cache.text_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, text_len, text_C); + ggml_set_name(cache.text_in, "loop_text_in"); ggml_set_input(cache.text_in); + cache.style_v_raw_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C); + ggml_set_name(cache.style_v_raw_in, "loop_style_v"); ggml_set_input(cache.style_v_raw_in); + cache.style_kctx_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C); + ggml_set_name(cache.style_kctx_in, "loop_style_kctx"); ggml_set_input(cache.style_kctx_in); + + cache.pos_q = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, L); + ggml_set_name(cache.pos_q, "loop_pos_q"); ggml_set_input(cache.pos_q); + cache.pos_k = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, text_len); + ggml_set_name(cache.pos_k, "loop_pos_k"); ggml_set_input(cache.pos_k); + cache.freq_factors_q = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2); + ggml_set_name(cache.freq_factors_q, "loop_ff_q"); ggml_set_input(cache.freq_factors_q); + cache.freq_factors_k = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2); + ggml_set_name(cache.freq_factors_k, "loop_ff_k"); ggml_set_input(cache.freq_factors_k); + + cache.t_emb_in.resize(total_steps, nullptr); + for (int s = 0; s < total_steps; ++s) { + cache.t_emb_in[s] = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, 64); + 
const std::string name = "loop_temb_" + std::to_string(s); + ggml_set_name(cache.t_emb_in[s], name.c_str()); + ggml_set_input(cache.t_emb_in[s]); + } + + // --- Chain N CFM steps together --- + ggml_tensor * cur_latent = cache.x0_in; + for (int s = 0; s < total_steps; ++s) { + vector_step_inputs inputs; + inputs.x_in = cur_latent; // previous step's output + inputs.mask_in = cache.mask_in; + inputs.t_emb_in = cache.t_emb_in[s]; + inputs.text_in = cache.text_in; + inputs.style_v_raw_in = cache.style_v_raw_in; + inputs.style_kctx_in = cache.style_kctx_in; + inputs.noise_in = cur_latent; // CFM: next = noise_in + v/N + inputs.pos_q = cache.pos_q; + inputs.pos_k = cache.pos_k; + inputs.freq_factors_q = cache.freq_factors_q; + inputs.freq_factors_k = cache.freq_factors_k; + + ggml_tensor * next = append_supertonic_vector_step_subgraph( + cache.ctx, cache.gf, model, inputs, L, text_len, total_steps); + const std::string step_name = "loop_next_" + std::to_string(s); + ggml_set_name(next, step_name.c_str()); + cur_latent = next; + } + ggml_set_output(cur_latent); + ggml_build_forward_expand(cache.gf, cur_latent); + cache.final_latent_out = cur_latent; + + cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector loop one-graph failed"); + if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) { + throw std::runtime_error("ggml_gallocr_reserve vector loop one-graph failed"); + } + ggml_gallocr_alloc_graph(cache.allocr, cache.gf); + } + + // --- Per-call inputs (constants across CFM steps) --- + ggml_backend_tensor_set(cache.x0_in, initial_noisy_latent, 0, + (size_t) L * Cin * sizeof(float)); + ggml_backend_tensor_set(cache.mask_in, latent_mask, 0, (size_t) L * sizeof(float)); + ggml_backend_tensor_set(cache.text_in, text_emb, 0, (size_t) text_len * 256 * sizeof(float)); + + const std::vector * style_v_raw_ptr = nullptr; + const std::vector * kctx_raw_ptr = nullptr; + 
cached_style_layouts(model, style_ttl, style_v_raw_ptr, kctx_raw_ptr); + ggml_backend_tensor_set(cache.style_v_raw_in, style_v_raw_ptr->data(), 0, + style_v_raw_ptr->size() * sizeof(float)); + ggml_backend_tensor_set(cache.style_kctx_in, kctx_raw_ptr->data(), 0, + kctx_raw_ptr->size() * sizeof(float)); + + { + std::vector pos_q_host(L); + for (int i = 0; i < L; ++i) pos_q_host[i] = i; + ggml_backend_tensor_set(cache.pos_q, pos_q_host.data(), 0, + pos_q_host.size() * sizeof(int32_t)); + std::vector pos_k_host(text_len); + for (int i = 0; i < text_len; ++i) pos_k_host[i] = i; + ggml_backend_tensor_set(cache.pos_k, pos_k_host.data(), 0, + pos_k_host.size() * sizeof(int32_t)); + + const int half = 32; + f32_tensor theta = read_f32(model, "vector_estimator:tts.ttl.vector_field.main_blocks.3.attn.theta"); + if ((int) theta.data.size() < half) { + throw std::runtime_error("theta tensor has fewer than D/2 elements"); + } + std::vector ff_q(half), ff_k(half); + for (int d = 0; d < half; ++d) { + ff_q[d] = (float) L / theta.data[d]; + ff_k[d] = (float) text_len / theta.data[d]; + } + ggml_backend_tensor_set(cache.freq_factors_q, ff_q.data(), 0, + ff_q.size() * sizeof(float)); + ggml_backend_tensor_set(cache.freq_factors_k, ff_k.data(), 0, + ff_k.size() * sizeof(float)); + } + + // --- Per-step time embeddings --- + for (int s = 0; s < total_steps; ++s) { + std::vector te = time_embedding(model, s, total_steps); + ggml_backend_tensor_set(cache.t_emb_in[s], te.data(), 0, + te.size() * sizeof(float)); + } + + // --- ONE compute call for ALL CFM steps --- + supertonic_graph_compute(model, cache.gf); + + // --- Read final output --- + final_latent_out.assign((size_t) Cin * L, 0.0f); + ggml_backend_tensor_get(cache.final_latent_out, final_latent_out.data(), 0, + (size_t) Cin * L * sizeof(float)); + if (error) error->clear(); + return true; + } catch (const std::exception & e) { + if (error) *error = e.what(); + return false; + } +} + +// Public-ish driver: dispatches to the 
unrolled-loop path on non-CPU +// backends, falls back to the per-step `supertonic_vector_step_ggml` +// loop on CPU. Gate the unrolled path off with +// SUPERTONIC_DISABLE_LOOP_GRAPH=1 to A/B against the per-step path on +// the same backend. +bool supertonic_vector_loop_ggml(const supertonic_model & model, + const float * initial_noisy_latent, + int latent_len, + const float * text_emb, + int text_len, + const float * style_ttl, + const float * latent_mask, + int total_steps, + std::vector & final_latent_out, + std::string * error) { + const bool disable_loop = + std::getenv("SUPERTONIC_DISABLE_LOOP_GRAPH") != nullptr; + if (!disable_loop && !model_prefers_cpu_kernels(model)) { + return supertonic_vector_loop_one_graph_ggml( + model, initial_noisy_latent, latent_len, text_emb, text_len, + style_ttl, latent_mask, total_steps, final_latent_out, error); + } + // CPU / disabled path: run the per-step loop in the addon's existing way. + try { + std::vector latent((size_t) model.hparams.latent_channels * latent_len); + std::memcpy(latent.data(), initial_noisy_latent, latent.size() * sizeof(float)); + std::vector next; + for (int step = 0; step < total_steps; ++step) { + if (!supertonic_vector_step_ggml(model, latent.data(), latent_len, + text_emb, text_len, + style_ttl, latent_mask, + step, total_steps, next, error)) { + return false; + } + latent.swap(next); + } + final_latent_out = std::move(latent); + if (error) error->clear(); + return true; + } catch (const std::exception & e) { + if (error) *error = e.what(); + return false; + } +} + bool supertonic_vector_step_ggml(const supertonic_model & model, const float * noisy_latent, int latent_len, @@ -3162,6 +4391,19 @@ bool supertonic_vector_step_ggml(const supertonic_model & model, std::vector & next_latent_out, std::string * error) { supertonic_op_dispatch_scope dispatch(model); + // Metal / CUDA / Vulkan / OpenCL: use the consolidated one-graph path + // (one ggml_backend_graph_compute call per CFM step instead of 
~21). + // CPU: keep the multi-cache trace_proj path — its CPU fast-paths and + // thread_local sub-graph caches stay competitive on CPU and trace mode + // relies on the per-stage outputs. Set SUPERTONIC_DISABLE_ONE_GRAPH=1 + // to fall back to the multi-cache path on GPU backends if needed. + const bool disable_one_graph = std::getenv("SUPERTONIC_DISABLE_ONE_GRAPH") != nullptr; + if (!disable_one_graph && !model_prefers_cpu_kernels(model)) { + return supertonic_vector_step_one_graph_ggml(model, noisy_latent, latent_len, + text_emb, text_len, style_ttl, + latent_mask, current_step, + total_steps, next_latent_out, error); + } try { std::vector scalar_trace; std::vector ggml_trace; diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp index fe6ffbf80d2..daf32f5ad11 100644 --- a/tts-cpp/src/supertonic_vocoder.cpp +++ b/tts-cpp/src/supertonic_vocoder.cpp @@ -88,11 +88,33 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik std::to_string(like->ne[0]) + "," + std::to_string(like->ne[1]) + "," + std::to_string(like->ne[2]) + "," + std::to_string(like->ne[3]) + "]"); } - return ggml_repeat(ctx, v, like); + // Every caller feeds the return value straight into ggml_add / ggml_mul, + // both of which broadcast natively in ggml. Skip the explicit + // ggml_repeat node so the downstream op handles the broadcast — saves a + // kernel_repeat launch per call on Metal. + static const bool force_explicit_repeat = + std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr; + if (force_explicit_repeat) { + return ggml_repeat(ctx, v, like); + } + return v; } ggml_tensor * causal_replicate_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left) { if (pad_left <= 0) return x; + // Prefer the fused supertonic_edge_pad_1d op when available (Metal + // via the overlay port + CPU via the parity backstop) — collapses + // the view + repeat_4d + concat triplet into a single dispatch. 
+ // Override with SUPERTONIC_DISABLE_FUSED_EDGE_PAD=1 to A/B against + // the stock-ops chain. + static const bool disable_fused_edge_pad = + std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr; + if (!disable_fused_edge_pad && + x->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + ggml_is_contiguous(x)) { + return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, 0); + } const int64_t C = x->ne[1]; ggml_tensor * first = ggml_view_2d(ctx, x, 1, C, x->nb[1], 0); ggml_tensor * rep = ggml_repeat_4d(ctx, first, pad_left, C, 1, 1); @@ -340,6 +362,15 @@ ggml_tensor * layer_norm_channel_ggml(ggml_context * ctx, ggml_tensor * gamma, ggml_tensor * beta, float eps = 1e-6f) { + static const bool disable_fused_layer_norm = + std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr; + if (!disable_fused_layer_norm && + x->type == GGML_TYPE_F32 && gamma->type == GGML_TYPE_F32 && beta->type == GGML_TYPE_F32 && + x->ne[2] == 1 && x->ne[3] == 1 && + gamma->ne[0] == x->ne[1] && beta->ne[0] == x->ne[1] && + ggml_is_contiguous(x) && ggml_is_contiguous(gamma) && ggml_is_contiguous(beta)) { + return ggml_supertonic_layer_norm_channel(ctx, x, gamma, beta, eps); + } ggml_tensor * y = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); y = ggml_norm(ctx, y, eps); y = ggml_mul(ctx, y, repeat_like(ctx, gamma, y)); @@ -352,29 +383,128 @@ ggml_tensor * convnext_block_ggml(ggml_context * ctx, ggml_tensor * x, int idx) { static const int dilations[10] = {1, 2, 4, 1, 2, 4, 1, 1, 1, 1}; - // Audit follow-up #6 (F7) — fused LN + pw1 + gelu + pw2 + γ + - // residual. The fused helper keeps the layer-norm output in - // `[C, T0]` (channel-major) memory and lowers both K=1 pointwise - // convs to direct `ggml_mul_mat` against that layout, eliminating - // the LN back-permute/cont and both im2col copies the previous - // chain paid (audit cost: ~16.8 MiB / vocoder pass). The - // depthwise op stays in this TU so the CBLAS custom-op fast - // path is unaffected. 
Trace + pipeline parity preserved — the - // fused helper computes the same arithmetic in the same order, - // just on a different (compatible) intermediate layout. See - // `supertonic_internal.h::convnext_block_fused_ggml` for the - // op-by-op rationale and - // `test/test_supertonic_convnext_block_fused.cpp` for the - // parity test. + const bool use_cpu_custom = supertonic_use_cpu_custom_ops(); ggml_tensor * dw = depthwise_conv1d_causal_ggml(ctx, x, w.dw_w, w.dw_b, dilations[idx]); - return convnext_block_fused_ggml( - ctx, - /*residual=*/x, - /*dw_out=*/dw, - w.norm_g, w.norm_b, - w.pw1_w, w.pw1_b, - w.pw2_w, w.pw2_b, - w.gamma); + if (use_cpu_custom) { + // Audit follow-up #6 (F7) — fused LN + pw1 + gelu + pw2 + γ + + // residual. The fused helper keeps the layer-norm output in + // `[C, T0]` (channel-major) memory and lowers both K=1 pointwise + // convs to direct `ggml_mul_mat` against that layout, eliminating + // the LN back-permute/cont and both im2col copies the previous + // chain paid (audit cost: ~16.8 MiB / vocoder pass). The + // depthwise op stays in this TU so the CBLAS custom-op fast + // path is unaffected. Trace + pipeline parity preserved — the + // fused helper computes the same arithmetic in the same order, + // just on a different (compatible) intermediate layout. See + // `supertonic_internal.h::convnext_block_fused_ggml` for the + // op-by-op rationale and + // `test/test_supertonic_convnext_block_fused.cpp` for the + // parity test. + return convnext_block_fused_ggml( + ctx, + /*residual=*/x, + /*dw_out=*/dw, + w.norm_g, w.norm_b, + w.pw1_w, w.pw1_b, + w.pw2_w, w.pw2_b, + w.gamma); + } + // Metal / non-CPU backend path: keep the granular chain so the + // per-op Metal fused-kernel fast paths inside the helpers (layer + // norm, bias+gelu, ...) get a chance to fire. GGML_OP_CUSTOM is + // rejected on GPU backends so the F7 fused helper above isn't + // usable here regardless. 
+ ggml_tensor * residual = x; + ggml_tensor * y = dw; + y = layer_norm_channel_ggml(ctx, y, w.norm_g, w.norm_b); + // pw1 + bias + GELU. On Metal we drop the bias from conv1d_causal_ggml + // and feed the pre-bias matmul output to the fused bias_gelu op (one + // dispatch instead of two: ggml_add + gelu_erf). CPU keeps its existing + // cblas+bias_inside path — the standard library erff in the unfused + // chain is already the cheapest there. + static const bool disable_fused_bias_gelu = + std::getenv("SUPERTONIC_DISABLE_FUSED_BIAS_GELU") != nullptr; + if (!disable_fused_bias_gelu && + y->type == GGML_TYPE_F32 && w.pw1_w->type == GGML_TYPE_F32 && + w.pw1_b->type == GGML_TYPE_F32) { + y = conv1d_causal_ggml(ctx, y, w.pw1_w, /*b=*/nullptr); + if (y->ne[2] == 1 && y->ne[3] == 1 && + w.pw1_b->ne[0] == y->ne[1] && + ggml_is_contiguous(y) && ggml_is_contiguous(w.pw1_b)) { + y = ggml_supertonic_bias_gelu(ctx, y, w.pw1_b); + } else { + y = ggml_add(ctx, y, repeat_like(ctx, w.pw1_b, y)); + y = ggml_gelu_erf(ctx, y); + } + } else { + y = conv1d_causal_ggml(ctx, y, w.pw1_w, w.pw1_b); + y = ggml_gelu_erf(ctx, y); + } + // NOTE: the vector_estimator's `ggml_supertonic_pw2_residual` op + // expects `gamma` to be `[C]` (per-channel scale); the vocoder + // however stores `gamma` as a `[1]` scalar (single learnable + // scale per ConvNeXt block). The shapes are incompatible, so we + // keep the unfused chain here. A vocoder-specific fused op with + // scalar gamma is possible but the win would be tiny (~10 + // dispatches × ~40μs = 0.4 ms). 
+ y = conv1d_causal_ggml(ctx, y, w.pw2_w, w.pw2_b); + y = ggml_mul(ctx, y, repeat_like(ctx, w.gamma, y)); + return ggml_add(ctx, residual, y); +} + +ggml_tensor * pointwise_matmul_ct_voc(ggml_context * ctx, + ggml_tensor * x_ct, + ggml_tensor * w, + ggml_tensor * b) { + GGML_ASSERT(w->ne[0] == 1); + GGML_ASSERT(w->ne[1] == x_ct->ne[0]); + GGML_ASSERT(ggml_is_contiguous(w)); + ggml_tensor * w_2d = ggml_reshape_2d(ctx, w, w->ne[1], w->ne[2]); + ggml_tensor * x_2d = ggml_reshape_2d(ctx, x_ct, x_ct->ne[0], x_ct->ne[1]); + ggml_tensor * y = ggml_mul_mat(ctx, w_2d, x_2d); + if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y)); + return y; +} + +// Phase B2 follow-up: vocoder ConvNeXt block on `[C, T]` activations +// end-to-end. Takes `[C, T]` input and returns `[C, T]` — the caller +// wraps the 10-block chain in a single `[T, C] -> [C, T]` permute at +// entry and a single `[C, T] -> [T, C]` permute at exit, so this +// block has zero intra-block permutes. +// +// Vocoder ConvNeXt differs from vector_estimator's: (1) depthwise is +// **causal** (left-only pad) rather than symmetric edge-clamp — handled +// by the `_causal_ct` variant of the fused depthwise kernel (port-v14). +// (2) `gamma` is a scalar `[1]`, not per-channel, so the `pw2_residual_ct` +// fused op doesn't fit — unfused scalar `mul + add` tail. (3) `norm_g` / +// `norm_b` ship as `[1, C]` (same flatten-needed quirk as vector_estimator's +// `.gamma`). +// +// Caller: `SUPERTONIC_DISABLE_CT_VOCODER=1` reverts to legacy +// `convnext_block_ggml`. 
+ggml_tensor * convnext_block_ggml_ct(ggml_context * ctx, + const supertonic_vocoder_convnext_weights & w, + ggml_tensor * x_ct, + int idx) { + static const int dilations[10] = {1, 2, 4, 1, 2, 4, 1, 1, 1, 1}; + ggml_tensor * residual = x_ct; + + auto flatten_1d = [&](ggml_tensor * t) -> ggml_tensor * { + const int64_t n = ggml_nelements(t); + if (t->ne[0] == n && t->ne[1] == 1 && t->ne[2] == 1 && t->ne[3] == 1) return t; + return ggml_reshape_1d(ctx, t, n); + }; + + ggml_tensor * y_ct = ggml_supertonic_depthwise_1d_causal_ct(ctx, x_ct, + w.dw_w, flatten_1d(w.dw_b), dilations[idx]); + y_ct = ggml_supertonic_layer_norm_channel_ct(ctx, y_ct, + flatten_1d(w.norm_g), flatten_1d(w.norm_b), 1e-6f); + y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw1_w, /*bias=*/nullptr); + y_ct = ggml_supertonic_bias_gelu_ct(ctx, y_ct, flatten_1d(w.pw1_b)); + y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw2_w, flatten_1d(w.pw2_b)); + // Scalar gamma multiply (broadcasts in any layout). + y_ct = ggml_mul(ctx, y_ct, repeat_like(ctx, w.gamma, y_ct)); + return ggml_add(ctx, residual, y_ct); } struct vocoder_graph_cache { @@ -415,6 +545,10 @@ void free_vocoder_cache(vocoder_graph_cache & cache) { void build_supertonic_vocoder_cache(vocoder_graph_cache & cache, const supertonic_model & model, int latent_len) { + // `supertonic_op_dispatch_scope` is set by the outer + // `supertonic_vocoder_forward_ggml` entry point; inside graph builders + // we read the thread-local flag directly. 
+ const bool use_cpu_custom = supertonic_use_cpu_custom_ops(); free_vocoder_cache(cache); cache.model = &model; cache.generation_id = model.generation_id; @@ -470,9 +604,28 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache, x = conv1d_causal_ggml(cache.ctx, x, model.vocoder.embed_w, model.vocoder.embed_b); ggml_set_name(x, "vocoder_embed"); - for (int i = 0; i < 10; ++i) { - x = convnext_block_ggml(cache.ctx, model.vocoder.convnext[(size_t) i], x, i); - ggml_set_name(x, ("vocoder_convnext_" + std::to_string(i)).c_str()); + // Phase B2 follow-up: route the 10-block ConvNeXt chain through the + // `[C, T]` variant on Metal. Each block runs depthwise (causal_ct) + + // layer_norm + pw1 + bias_gelu + pw2 + scalar gamma + residual add + // entirely on `[C, T]` — no intra-block permutes. The single + // `[T, C] -> [C, T]` permute happens once before the chain and the + // single reverse permute once after. Override: + // SUPERTONIC_DISABLE_CT_VOCODER=1. + static const bool disable_ct_vocoder = + std::getenv("SUPERTONIC_DISABLE_CT_VOCODER") != nullptr; + const bool use_ct_vocoder = !disable_ct_vocoder && !use_cpu_custom; + if (use_ct_vocoder) { + ggml_tensor * x_ct = ggml_cont(cache.ctx, ggml_permute(cache.ctx, x, 1, 0, 2, 3)); + for (int i = 0; i < 10; ++i) { + x_ct = convnext_block_ggml_ct(cache.ctx, model.vocoder.convnext[(size_t) i], x_ct, i); + ggml_set_name(x_ct, ("vocoder_convnext_" + std::to_string(i)).c_str()); + } + x = ggml_cont(cache.ctx, ggml_permute(cache.ctx, x_ct, 1, 0, 2, 3)); + } else { + for (int i = 0; i < 10; ++i) { + x = convnext_block_ggml(cache.ctx, model.vocoder.convnext[(size_t) i], x, i); + ggml_set_name(x, ("vocoder_convnext_" + std::to_string(i)).c_str()); + } } // F2: reference the pre-baked weight tensors directly instead