diff --git a/PROGRESS.md b/PROGRESS.md
index 2c03bd3..07f3093 100644
--- a/PROGRESS.md
+++ b/PROGRESS.md
@@ -2533,3 +2533,1522 @@ Mirrors the shape `stable-diffusion.cpp` uses with its
   no further changes; unit suite 38/38, integration 4/4 (Whisper
   round-trip 0.0% WER on *"How are you doing today?"*, native chunk
   streaming emits 8 chunks, sentence streaming RTF 0.5448).
+
+### 3.21  MTL Metal optimisation pass — CFG-batched T3 + `--cfm-steps` + SwiGLU
+
+§3.20 left the multilingual M4 baseline at **RTF 1.37 / 1.65** (Q4_0 /
+F16) and itemised three follow-ups the §3.20 optimisation didn't touch:
+runtime CFM step count, MTL T3 step batching, and a faster MLP path.
+This pass picks them up on **M3 Ultra Metal (96 GB unified memory)** and
+hits **RTF 0.30** (Q4_0) / **0.32** (F16) end-to-end on the same Spanish
+prompt, seed 42, `--temp 0 --top-k 1`, voice = `jfk.wav`.  Pre-rationale
+in [`/Users/user002/.cursor/plans/mtl_metal_optimization_breadth_7807d6e0.plan.md`](.cursor/plans/mtl_metal_optimization_breadth_7807d6e0.plan.md);
+this section is the post-mortem with positive **and** negative findings.
+
+**M3 Ultra baseline (before this pass)**, prompt + seed identical to the
+§3.19 reference, 3 warm-run averages excluding T3 load:
+
+| Model | T3 (84/89 tok) | S3Gen (3.48/3.68 s audio, N=10) | Total | **RTF** |
+|---|---:|---:|---:|---:|
+| Q4_0 |  872 ms / 84 tok | 740 ms | 1612 ms | 0.46 |
+| F16  | 1099 ms / 89 tok | 844 ms | 1943 ms | 0.53 |
+
+(M3 Ultra was already well under RTF 1.0 — its 60-core GPU is ~6× the
+M4's 10-core GPU — so this pass is about *how much* further we can push,
+not about clearing the real-time gate.  The relative gains transfer to
+M4: see "What this means for M4" at the end of the section.)
+
+**Bench matrix (M3 Ultra Metal, 3-warm-run averages, T3_INFER_MS only,
+unless otherwise noted).**  Each row is cumulative — adding the
+optimisation in the column heading on top of everything to its left.
+
+| Variant | baseline | +P1: B=2 CFG | +P1+P2: F16 KV | +P1+P4: SwiGLU split | +P1+P3+P4 N=7 (final) |
+|---------|---------:|-------------:|---------------:|---------------------:|----------------------:|
+| Q4_0 T3      | 872 ms | **502 ms (-42%)** | 507 ms (≈) | 482 ms (-4% vs P1) | **478 ms (-45%)** |
+| Q4_0 S3Gen   | 740 ms |  720 ms          | 723 ms (≈) | 730 ms (≈)         | **576 ms (-22%)** |
+| Q4_0 Total   | 1612 ms| 1219 ms (-24%)   |  1230 ms   | 1212 ms            | **1054 ms (-35%)** |
+| Q4_0 RTF     | 0.46   | 0.35             |  0.35      | 0.35               | **0.30** |
+| F16 T3       | 1099 ms| **602 ms (-45%)**| 600 ms (≈) | 635 ms (+5% noise) | **579 ms (-47%)** |
+| F16 S3Gen    | 844 ms |  752 ms          | 743 ms (≈) | 778 ms (≈)         | **586 ms (-31%)** |
+| F16 Total    | 1943 ms| 1354 ms (-30%)   |  1343 ms   | 1413 ms            | **1165 ms (-40%)** |
+| F16 RTF      | 0.53   | 0.37             |  0.36      | 0.38               | **0.32** |
+
+Raw stderr per phase saved under `artifacts/bench/mtl-metal-m3u-*.txt`
+(baseline + per-phase + cfm-sweep + final).  Audio-quality gates against
+N=10 / phase-1 reference WAVs:
+- Phase 1 vs baseline: **byte-exact** WAV (cond+uncond batching is
+  numerically identical to two sequential cond/uncond forwards on the
+  same backend; the unified KV buffer plus `b_offset_elems = 0 |
+  kv_layer_elems` reproduces the per-pass slab layout).
+- Phase 4 (`ggml_swiglu_split`) vs Phase 1: **byte-exact** WAV (Metal's
+  `kernel_swiglu_f32` is bit-equivalent to the manual `ggml_silu(gate) *
+  up`).
+- `--cfm-steps` sweep (computed via librosa log-mel cosine, see
+  `artifacts/bench/mtl-metal-m3u-cfm-sweep-q4_0.txt`):
+
+  | N | S3Gen ms | log-mel cos vs N=10 | PCM cos vs N=10 |
+  |--:|---------:|--------------------:|----------------:|
+  |  6| 518 ms   |              0.9897 |          0.8836 |
+  |  7| 571 ms   |          **0.9954** |          0.9414 |
+  |  8| 629 ms   |              0.9972 |          0.9702 |
+  | 10| 730 ms   |              1.0000 |          1.0000 |
+
+  N=7 cleanly clears the cos ≥ 0.99 gate; N=6 sits right on the
+  threshold (PCM cosine drops to 0.88 — phase-coherent attack
+  reconstruction starts to drift) so it's left as opt-in only.
+
+#### What shipped
+
+**Phase 1 — CFG cond+uncond batched into one Metal forward (B=2)**
+*— biggest win on both Q4_0 (-42%) and F16 (-45%).*
+
+The §3.19 multilingual T3 ran CFG as **two sequential
+`run_step_pass`/`run_prompt_pass` calls per token**, each rebuilding +
+computing a 30-layer Llama graph with a separate `memory_k_uncond` /
+`memory_v_uncond` KV cache.  On Metal this doubled the per-step kernel-
+dispatch + weight-read overhead — exactly the regression `use_b2`
+already paid off for S3Gen's CFM (`src/chatterbox_tts.cpp:1994` /
+§3.19).  This pass mirrors that on T3:
+
+- New `build_step_graph_mtl_b2(model, n_past)` and
+  `build_prompt_graph_mtl_b2(model, n_text_tokens)` in [src/t3_mtl.cpp].
+  cond + uncond pack into the batch dim (`ne[3]=2`) for `inputs_embeds`,
+  `pos_ids`, `kq_mask`, and the per-layer Q/K/V activations.  RoPE +
+  `flash_attn_ext` both broadcast the head/seq dims over batch out of
+  the box, so `build_llama_block` only grew an `int B` parameter and
+  `int b_offset_elems` (one cache slab offset for the legacy B=1 CPU
+  fallback).
+- **KV layout rework.**  The two parallel 1-D F32 KV buffers
+  (`memory_k` + `memory_k_uncond`) are now a **single contiguous
+  `2 × kv_layer_elems` buffer per layer**, cond at offset 0, uncond at
+  offset `kv_layer_elems`.  Per-layer slab stride is therefore
+  `2 * head_dim * n_ctx * n_kv_head * sizeof(F)`.  The B=2 graph views
+  the same buffer as `(head_dim, n_ctx, n_kv_head, B=2)` with
+  `batch_stride = kv_layer_elems * sizeof(F)`; the legacy B=1 CPU path
+  selects the right half via `b_offset_elems = is_uncond ?
+  kv_layer_elems : 0`.  Total backend allocation is unchanged (still 2 ×
+  kv_elements per cache); we just dropped two `ggml_new_tensor_1d`
+  calls.
+- `eval_step_mtl` / `eval_prompt_mtl` dispatch the B=2 path when
+  `!ggml_backend_is_cpu(model.backend)` — exactly mirrors `use_b2` in
+  S3Gen.  CPU keeps the two-call path for the same reason §3.19 found
+  for S3Gen B=2: the per-op B=2 work doubles without saving ops on
+  ggml-cpu, so the two-call path remains the winner there.
+
+Parity gates passed:
+1. Greedy decode token parity at `--temp 0 --top-k 1`: first 100 tokens
+   identical to the two-call baseline on seed 42.
+2. End-to-end WAV byte-exact match vs the §3.19 reference run on Q4_0
+   *and* F16 (`cmp /tmp/baseline_q4_0_r3.wav /tmp/phase1_q4_0.wav` →
+   identical, same for F16).
+3. CPU smoke test (`--n-gpu-layers 0`) still produces audio with the
+   B=1 fallback path.
+
+**Phase 3 — `--cfm-steps N` for non-streaming MTL**
+*— biggest S3Gen win when set to N=7 (-22% S3Gen vs N=10).*
+
+Pre-§3.21, only `--stream-cfm-steps` propagated into
+`s3gen_synthesize_opts.cfm_steps`; non-streaming MTL was locked at the
+GGUF's `n_timesteps=10`.  Even though `s3gen_synthesize_opts.cfm_steps`
+existed (and was honoured by the inner CFM loop in
+`chatterbox_tts.cpp:1973`), [src/chatterbox_cli.cpp] never surfaced it.
+A 6-line CLI flag (`--cfm-steps N`) routed into all three non-streaming
+`s3gen_synthesize_opts` setup sites + a sweep block:
+
+```
+N=6  S3Gen 518 ms  log-mel-cos 0.990  PCM-cos 0.88  (borderline)
+N=7  S3Gen 571 ms  log-mel-cos 0.995  PCM-cos 0.94  ← recommended knee
+N=8  S3Gen 629 ms  log-mel-cos 0.997  PCM-cos 0.97
+N=10 S3Gen 730 ms  log-mel-cos 1.000  PCM-cos 1.00  (default)
+```
+
+The default stays at 10 (no behaviour change for callers that don't
+pass the flag); the README's MTL bench table now has both `N=10` and
+`N=7` rows so users can pick.
+
+**Phase 4 — `ggml_swiglu_split` on the Llama MLP**
+*— marginal on M3 Ultra (Q4_0 -4% within the plan's 5% gate; F16 within
+noise) but kept for code clarity + future ggml-metal kernel improvements.*
+
+Each Llama block in `build_llama_block` did `silu(gate) * up` as three
+separate ggml ops — `ggml_silu(...)`, `ggml_mul_mat(mlp_up, ...)`,
+`ggml_mul(silu_out, up_out)` — i.e. a `silu` + `mul` element-wise pair
+on top of the two `mul_mat`s, at 30 dispatches/token across layers.
+Upstream ggml already exposes this as a single op: `ggml_swiglu_split(ctx,
+gate, up)` lowers to `GGML_OP_GLU / GGML_GLU_OP_SWIGLU`, which Metal
+maps to `kernel_swiglu_f32` (one fused kernel per layer instead of two
+elementwise dispatches).  The pre-norm `ggml_mul(ggml_rms_norm(...), g)`
+pattern was already auto-fused upstream by ggml-metal's
+`can_fuse(RMS_NORM, MUL)` path (`kernel_rms_norm_mul_f32`); we left it
+written as the two obvious ops so CPU + non-Metal backends get the same
+shape.  Net WAV output: byte-exact vs Phase 1.
+
+#### What didn't work — NEGATIVE results
+
+The plan called out three "trades to verify empirically".  All three got
+measured; two were reverted.
+
+**Phase 2 — F16 KV cache.** *Reverted: neutral on M3 Ultra.*
+
+Switching `memory_k`/`memory_v` from F32 to F16 was the predicted-large
+bandwidth win (30 layers × 4096 ctx × 16 heads × 64 head_dim × 2 batches
+per step on the hot path).  The change is small and clean — the strides
+in `build_llama_block` were already routed through
+`ggml_type_size(memory_k->type)`, `flash_attn_ext` consumes F16 K/V
+directly, and the per-step `ggml_cpy` writing new K/V from F32
+activations does the F32→F16 conversion for free.  But the bench was a
+**wash** on M3 Ultra:
+
+| Variant | F32 KV (Phase 1) | F16 KV (Phase 2) | Δ        |
+|---------|-----------------:|-----------------:|---------:|
+| Q4_0 T3 | 502 ms (avg)    | 507 ms (avg)     | +1% (≈)  |
+| F16 T3  | 602 ms (avg)    | 600 ms (avg)     | -0% (≈)  |
+
+Audio output byte-exact vs Phase 1 — i.e. the F16 storage didn't even
+change the compute precision.  The combination strongly suggests
+**ggml-metal's `flash_attn_ext` was already running its inner matmul
+at F16 precision regardless of K/V storage dtype** (Apple GPUs have F16
+matrix-multiply hardware; storage→register conversion is free, so the
+F32 K/V cache was effectively a no-op buffer).  Reverted to F32 storage
+to keep the §3.19 numerics envelope exactly preserved; the
+type-size-aware strides stay in place as a one-character flip
+(`GGML_TYPE_F32` → `GGML_TYPE_F16` in `load_model_gguf_mtl`) so a
+memory-bound backend (e.g. an M4 with 10 GPU cores where bandwidth
+*does* matter) can opt back in without a code change.  Bench artefacts
+under `artifacts/bench/mtl-metal-m3u-phase2-{q4_0,f16}.txt`.
+
+**Phase 4-stretch: explicit `RMS_NORM + MUL(g)` and
+`MUL_MAT + ADD(bias)` fusions in
+`patches/ggml-metal-chatterbox-ops.patch`.**  *Not shipped.*
+
+Audit of upstream `ggml/src/ggml-metal/`:
+- `kernel_rms_norm_mul_f32` (and `_4` SIMD variant) already exists
+  upstream; `ggml-metal-ops.cpp:can_fuse(RMS_NORM, MUL)` triggers it
+  automatically for our `ggml_mul(ggml_rms_norm(x), g)` patterns.
+- `kernel_rms_norm_mul_add_f32` is the next-level-up fusion (RMS_NORM +
+  MUL + ADD); not used by our T3 (no bias on the RMSNorm gain).
+- `kernel_bin_fuse_impl` already chains element-wise ops.
+- The Q-variant `mul_mat + add(bias)` fast path is already in the
+  Chatterbox patch (`get_pipeline_mul_mv(..., has_bias, has_residual)`,
+  `FC_MUL_MV + 2/+3` constants); extending it to F16 src0 was the
+  Phase 4c stretch goal.  Skipped because the F16 build hits Phase 1's
+  -45% T3 win first and lands at the same RTF 0.32 as Q4_0+--cfm-steps;
+  the marginal win available from F16 mat_vec+bias fusion (Llama's
+  Q/K/V/O have **no bias** in this model — `cond_spkr/b` is the only
+  bias-bearing tensor, hit once per cond pass) is below the bench gate.
+
+Net: zero new lines of Metal-kernel patch.  Upstream's fusion coverage
+already maps onto every fusable op we have, and the one slot we'd need
+to extend (F16 `mul_mat + add(bias)`) is dispatched ≤ 1× per cond pass
+in our model so the win is below the floor.
+
+#### What this means for M4 (and other backends)
+
+§3.19's M4 numbers are now stale on Q4_0 + F16; the same Phase 1 + 3
+combination should bring multilingual M4 RTF down from **1.37 → ≈ 0.95**
+(if T3 scales with the same -42% as M3 Ultra: 1865 ms × 0.58 = 1082 ms,
+combined with `--cfm-steps 7` which scales linearly with N: 2247 ms × 7
+/ 10 = 1573 ms; total 2655 ms vs 2.56 s audio → RTF 1.04).  Worth re-
+benchmarking on real M4 hardware before claiming the speedup.  The Phase
+2 (F16 KV) revert may also flip on M4: with 6× less GPU compute, the
+KV-bandwidth headroom that's slack on M3 Ultra could become the binding
+constraint on M4.  Flipping the one-line dtype back to F16 + re-bench on
+M4 is the way to confirm.
+
+Vulkan / CUDA: the B=2 batching change is backend-agnostic (it's a
+graph-shape change, not a Metal patch), so it should land the same
+`-30..-45%` win on any GPU backend; the `--cfm-steps` flag is wholly
+backend-independent.  No measurements collected here — left as a
+follow-up.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| [src/chatterbox_t3_internal.h](src/chatterbox_t3_internal.h) | Comment-only: KV layout doc updated to describe the unified cond+uncond buffer; `memory_k_uncond`/`memory_v_uncond` are now nullable view aliases for legacy callers (none on the MTL hot path). |
+| [src/t3_mtl.cpp](src/t3_mtl.cpp) | `build_llama_block` gains `int B`, `size_t b_offset_elems`; new `build_step_graph_mtl_b2`, `build_prompt_graph_mtl_b2`, `run_step_pass_b2`, `run_prompt_pass_b2`; `eval_step_mtl` / `eval_prompt_mtl` dispatch B=2 on non-CPU backends; KV allocation is now a single 2× tensor; MLP uses `ggml_swiglu_split`. |
+| [src/chatterbox_cli.cpp](src/chatterbox_cli.cpp) | New `--cfm-steps N` flag wired into all three non-streaming `s3gen_synthesize_opts` setup sites + help text. |
+| [README.md](README.md) | Multilingual table + per-stage block grew M3 Ultra rows alongside the existing M4 rows; `tts-cli` example mentions `--cfm-steps`. |
+| `artifacts/bench/mtl-*-m3u-*.txt` | Raw stderr per phase + cfm-sweep + final. |
+
+#### "What's next for MTL" (carried over from §3.19, with strikes)
+
+- ~~T3 Q4/Q5/Q8 quantisation~~ — shipped in §3.19 (reused via
+  `_load_requantize_policy`).
+- ~~Quantised CFM estimator weights~~ — shipped in §3.20.
+- ~~Runtime `--cfm-steps N`~~ — shipped in §3.21.
+- ~~Fixing `conv1d_f32` arg order on MTL S3Gen~~ — checked; not on the
+  multilingual hot path (`use_b2 = !cpu` already routes through the
+  batch-2 conv path).
+- Heterogeneous-core aware thread default for CPU MTL — still on the
+  table; orthogonal to this Metal pass.
+- ja / he / ru / zh / hi tokenizer support — separate sub-projects; out
+  of scope for §3.21.
+- Speculative decoding for T3 — long-tail item from §3.20 backlog.
+- F16 KV cache on M4 — left as opt-in flip; needs M4 measurement before
+  shipping.
+
+### 3.22  MTL allocator-overhead clean-up — drop redundant `gallocr_reserve` + cache HiFT/time_mlp scaffolding
+
+Three small allocator-side cleanups on top of §3.21.  The bench
+deltas are within run-to-run noise on M3 Ultra (~1% on T3, ~2% on
+CFM and HiFT individually, ~0.6% on total wall) but they remove
+unambiguously wasted work that lands harder on slower CPUs and
+older Metal builds where the topology-walk and 64 MB memset are
+proportionally more expensive.  All three pass the byte-exact WAV
+gate against §3.21 HEAD (md5 `79002f09bc48dda95ec0c2cfc2b895bd`).
+
+Three changes, listed in order of attack-surface:
+
+1. **Drop `ggml_gallocr_reserve` before `ggml_gallocr_alloc_graph`.**
+   `alloc_graph` already calls `ggml_gallocr_needs_realloc` and
+   only triggers a re-reservation when the graph's per-node sizes
+   actually grew.  T3's per-step graph keeps the same node count
+   and same per-node tensor shapes for every `n_past >= 1` (the
+   K/V views into `memory_k`/`memory_v` change *strides* but not
+   *sizes*; only the persistent slab grows), so 83 of the 84
+   step-pass reserves were doing a full O(n_nodes) topology walk
+   for nothing.  Affects all four `run_*_pass[_b2]` paths in
+   `t3_mtl.cpp`.
+
+2. **`run_hift_decode` 64 MB scratch buffer → `thread_local`.**
+   The previous `std::vector<uint8_t> buf(64MB)` forced a 64 MB
+   memset on every HiFT call (one per `--out` invocation in batch
+   mode, one per chunk in streaming).  `ggml_init` resets the
+   arena pointer between calls, so the buffer is reused safely
+   without leaking tensor metadata across invocations.
+
+3. **`compute_time_mlp` graph + gallocr → `thread_local time_mlp_cache`.**
+   The graph topology (TDIM=320 sin/cos input → 2-layer MLP →
+   TIME_EMB_DIM=1024 output) is constant across all 10 CFM steps;
+   only the input scalar `t_val` changes.  The cache key is
+   `(backend)` so a backend swap rebuilds.  Per-call we now build
+   + reserve once, then per-step we just `alloc_graph` +
+   `tensor_set` + `compute` + `tensor_get`.  Saves ~10 × (small
+   ggml_init + gallocr_new + reserve + free) per call ≈ ~10 ms on
+   slow CPU backends; near-zero on M3 Ultra.
+
+#### Bench (M3 Ultra, Q4_0, ES prompt, seed 42, `--temp 0 --top-k 1`, jfk.wav voice, 3 invocations averaged)
+
+| Stage      | §3.21 base | §3.22 (this) | Δ      |
+|------------|-----------:|-------------:|-------:|
+| T3 ms      |       479  |         470  |  -1.9% |
+| cfm_total  |       561  |         550  |  -2.0% |
+| hift_decode|       128  |         125  |  -2.3% |
+| S3Gen ms   |       730  |         722  |  -1.1% |
+| Total ms   |      1209  |        1192  |  -1.4% |
+
+WAV byte-exact gate: md5 `79002f09bc48dda95ec0c2cfc2b895bd` matches
+across both branches at all three invocations.  Within-noise on M3
+Ultra but unambiguous direction across runs.
+
+#### Why §3.22 didn't go further on M3 Ultra
+
+The per-CFM-step empirical breakdown (from `--verbose`) is:
+`step 0 = 73 ms`, `step 1..9 ≈ 53 ms each`.  The 20 ms first-step
+overhead is graph-build + gallocr-reserve + Metal pipeline
+warm-up; subsequent steps are purely the estimator forward.  The
+~52 ms steady-state per step is **almost entirely GPU compute** —
+about 480 mat-mul nodes per step (12 mid blocks × 4 transformer
+blocks × 7 mat-muls/block + down/up/final) on the U-Net body, plus
+the conv1d branches in down/up/final.  Per-dispatch overhead is
+already amortised across all those kernels in one command-buffer
+commit, so the §3.22 changes can only chip at the 20 ms first-step
+cost, not the 52 ms compute floor.
+
+The next worthwhile attack on this hardware is **F32 `mul_mm + add(bias)`
+shader fusion** in `patches/ggml-metal-chatterbox-ops.patch` — the
+existing fusion covers Q-variant `mul_mv` (T3 step matvecs) but not
+F32 `mul_mm` (CFM transformer batches at T*B = 87 * 2 = 174).
+Estimate: ~280 fuse opportunities per CFM step × 10 steps =
+~2800/call.  Concrete but invasive (~150 LOC of Metal shader
+templating); deferred to a future round when there's a clear
+demand gate above the current RTF 0.30 / 0.32 multilingual numbers.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| [src/t3_mtl.cpp](src/t3_mtl.cpp) | Drop `ggml_gallocr_reserve` from `run_step_pass`, `run_prompt_pass`, `run_step_pass_b2`, `run_prompt_pass_b2`; `alloc_graph` covers the lazy-reserve case. |
+| [src/chatterbox_tts.cpp](src/chatterbox_tts.cpp) | `run_hift_decode` scratch buf → `thread_local`; new `time_mlp_cache` keyed on backend, hoisting per-step build/reserve. |
+
+### 3.23  T3-MTL fused Q/K/V mat-mul on Metal
+
+The Phase-1 of §3.21 cut T3 down to 478 ms by batching CFG cond+uncond
+into a single Metal forward (`build_step_graph_mtl_b2`).  Within that
+forward, each of the 30 Llama blocks still ran **three** separate Q4_0
+mat-muls for its Q / K / V projections.  Across an 84-token step pass
+that's `30 × 84 × 3 = 7560` mat-mul dispatches inside the same
+command-buffer commit; collapsing the three to one drops the count to
+`30 × 84 = 2520`.
+
+**Implementation.**  `chatterbox_model` gains an `ctx_stack` /
+`buffer_stack` pair and `llama_layer` gains
+`wqkv : [n_embd, 3 * n_embd]` (Q4_0).  At GGUF load time, after the
+weights buffer is allocated, the per-layer `wq` / `wk` / `wv` bytes
+are concatenated row-wise into `wqkv` via a host-side scratch buffer
+(Q4_0's M-major contiguous row layout makes this a flat byte append —
+each row is `K/32 = 32` blocks of 18 bytes packed back-to-back, no
+per-block work).  `build_llama_block` now runs **one**
+`ggml_mul_mat(W_qkv, cur)` and carves out Q / K / V via strided
+`ggml_view_2d/_3d` straight into the `(HD, NH, N[, B])` layout RoPE
+expects — no `ggml_reshape` (would need contiguous source) and no
+`ggml_cont` (would defeat the saving).  RoPE's metal kernel walks src
+via per-element `nb01/nb02/nb03` strides, so the strided N dim is
+transparent.
+
+CPU backend keeps the per-projection path: ggml-cpu's per-kernel
+overhead is already negligible and the +30 MB weight footprint trades
+unfavourably with thread-cache locality there.  Process-wide
+`t3_stack_registry` + atexit hook frees `buffer_stack` before Metal's
+static device destructors run; mirrors the existing
+`s3gen_model_cache_release` pattern in `chatterbox_tts.cpp`.
+
+**Why gate / up isn't stacked.**  The multilingual T3 GGUF ships
+`mlp_gate` as F16 and `mlp_up` as Q4_0 (verified via
+`gguf.GGUFReader('models/chatterbox-t3-mtl-q4_0.gguf')`).  A single
+`ggml_tensor` can't hold mixed element widths, so the stack is gated
+on `wq->type == wk->type == wv->type` and skipped for any layer that
+doesn't satisfy it.  A future converter pass that lands gate at Q4_0
+would unlock the same fusion for the SwiGLU MLP (saves another 30 × 84
+= 2520 dispatches).
+
+**Why CFM transformer Q/K/V isn't stacked.**  Tried it
+(56 transformer blocks × 10 CFM steps = ~1100 saved dispatches per
+call, predicted real-time gain).  CFM regresses by ~15 % on
+`cfm_total` (549 → 632 ms).  The CFM transformer matmul has
+`M = INNER = 512`, `K = 256`, `T·B = 87 × 2 = 174`; with
+ggml-metal's `mul_mm` tile size `NR0 = 64`, separate Q matmul yields
+`512 / 64 = 8` row tiles × `174 / 32 = 6` col tiles = 48 chunks,
+which fits ~comfortably on M3 Ultra's 60 GPU cores in one wave.
+Stacked `M = 3 × 512 = 1536` → `24 × 6 = 144` chunks, three GPU waves
+where the un-stacked path used one.  The wider-M tile loop is supposed
+to amortise dispatch over more work, but on a 60-core GPU at this
+problem size the un-stacked path is already saturated — adding waves
+just adds overhead.  Reverted.  (The same calculus is why T3 _wins_:
+T3's step graph has `N = 1`, `B = 2`, `M = 1024`; separate Q matmul
+is `16 × 1 = 16` chunks (way under 60 cores → only ~25 % occupancy),
+stacked is `48 × 1 = 48` chunks (80 %).  So the lever is exactly
+"how undersaturated is the un-stacked GPU mat-mul".)
+
+#### Bench (M3 Ultra, Metal, ES prompt + jfk.wav voice, seed 42, mean of 5 invocations)
+
+| Variant | T3 §3.22 base | T3 +Phase 15 | Δ T3       | Total §3.22 base | Total +P15 | Δ Total    |
+|---------|--------------:|-------------:|-----------:|-----------------:|-----------:|-----------:|
+| Q4_0    |        474 ms |   **433 ms** | **-8.7%**  |          1192 ms | **1153 ms**| **-3.3%**  |
+| F16     |        522 ms |   **493 ms** | **-5.5%**  |              ~   |          ~ |          ~ |
+
+Cumulative on the §3.21 baseline (pre-§3.21):
+- Q4_0 T3: 872 ms → **433 ms** (**−50 %** since §3.20)
+- Q4_0 RTF: 0.46 → **0.29**
+- F16 T3: 1099 ms → **493 ms** (**−55 %** since §3.20)
+
+WAV byte-exact gate: md5 `79002f09bc48dda95ec0c2cfc2b895bd` matches
+across §3.22 base and post-§3.23 at five separate invocations
+(`--temp 0 --top-k 1`, deterministic).
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| [src/chatterbox_t3_internal.h](src/chatterbox_t3_internal.h) | `llama_layer` gains `wqkv`; `chatterbox_model` gains `ctx_stack` + `buffer_stack`. |
+| [src/t3_mtl.cpp](src/t3_mtl.cpp) | Post-load: allocate the Phase-15 stacked buffer + register with `t3_stack_registry` for atexit; per-layer copy of `wq`+`wk`+`wv` rows into `wqkv` via host scratch. `build_llama_block`: when `l.wqkv` is set, single mat-mul + view-split into Q/K/V; otherwise legacy three-mul path. New `t3_stack_unregister()` for `free_t3()` to call on error returns. |
+| [src/t3_mtl.h](src/t3_mtl.h) | Export `t3_stack_unregister()`. |
+| [src/chatterbox_cli.cpp](src/chatterbox_cli.cpp) | `free_t3()` calls `t3_stack_unregister()` then frees `buffer_stack` / `ctx_stack`. |
+
+### 3.24  HiFT conv-kernel F16 quantisation (multilingual S3Gen)
+
+The §3.20 quantisation pass left HiFT entirely at F32 (246 tensors,
+~80 MB) because both the converter and `requantize-gguf.py`
+wholesale-rejected 3-D shapes — `len(shape) != 2` always returned
+`False` in `should_quantize()`.  The remaining HiFT decode time
+(~125 ms, ~17 % of S3Gen wall) is mostly conv kernels whose
+weight bandwidth could plausibly come down with a smaller storage
+dtype.
+
+#### Q4_0 attempt: structurally blocked by K-dim alignment
+
+The plan's first prediction was that
+`should_quantize()` could allow 3-D when `K * IC % 32 == 0`
+(numpy `shape[-1] * shape[-2]` divisible by the Q4_0 block).  Tested
+empirically; the patch is structurally correct, **but the
+HiFT-specific gain is zero**:
+
+  - Q4_0's on-disk block layout assumes blocks span 32 consecutive
+    `ne[0]` values within a fixed `(ne[1], ne[2])` row.  For ggml
+    conv kernel shape `(K, IC, OC)` that means K must be 32-aligned.
+  - HiFT conv kernels have K ∈ {3, 7, 11, 16}.  None of these are
+    32-aligned, so Q4_0 along K is structurally impossible.
+  - Re-quantising with a flattened (K \* IC) reduction dim *would*
+    unblock the alignment gate, but the resulting on-disk shape is
+    `(K*IC, OC)` — i.e. 2-D — which then breaks
+    `ggml_im2col(kernel, ...)` on the C++ side (it derives the
+    kernel size from `kernel->ne[0]`).  That's a structural change
+    to `conv1d_f32` and gated on a future commit.
+
+The script patch is shipped as a forward-compatible no-op for
+HiFT: any future converter that ships K-aligned conv kernels gets
+the win for free.  Tested by re-quantising
+`chatterbox-s3gen-mtl-f16.gguf` to `q4_0` post-patch — output is
+structurally identical to the baseline `chatterbox-s3gen-mtl-q4_0.gguf`
+GGUF for HiFT (still 246 F32, no Q4_0).
+
+#### F16 alternate path: ships, modest win, audio quality preserved
+
+F16 has `block_size = 1` in `GGML_QUANT_SIZES`, so the alignment
+gate is a no-op for any shape.  Adding `f16` as a target dtype +
+a `--name-filter SUBSTRING` arg (constrains the rewrite to a
+tensor-name substring) lets us downcast HiFT conv kernels
+F32 → F16 without disturbing the existing Q4_0 CFM linears.
+
+Two-pass recipe:
+
+```bash
+python scripts/requantize-gguf.py \
+    models/chatterbox-s3gen-mtl-f16.gguf \
+    /tmp/intermediate.gguf f16 --name-filter hift/
+python scripts/requantize-gguf.py \
+    /tmp/intermediate.gguf \
+    models/chatterbox-s3gen-mtl-q4_0_hift_f16.gguf q4_0
+```
+
+Of the 246 HiFT tensors:
+  - 159 are 1-D biases / scalars — kept F32 by the `n_elements >= 1024`
+    + `len(shape) == {2,3}` shape gates.
+  - 64 are 2-D / 3-D conv weights — converted to F16.
+  - 21 are `source_downs/*` + `source_resblocks/*` 3-D conv
+    kernels — kept F32 because the existing `/s` deny-list
+    matches them as a substring.  Refining the deny-list to
+    endswith-only unblocks them, but `kernel_mul_mv_f32_f16_short`
+    isn't compiled in the pinned ggml-metal build, so HiFT
+    decode segfaults at runtime; left F32 with an inline note in
+    `requantize-gguf.py` for the next round.
+  - 2 small 2-D weights — kept F32 by `n_elements < 1024`.
+
+Bench on M3 Ultra Metal (3 invocations, ES prompt
+`"Hola mundo, esta es una prueba multilingue."`, `--seed 42
+--temp 0 --top-k 1`, jfk.wav voice):
+
+| Metric             | baseline q4_0 GGUF | q4_0 + HiFT F16 GGUF | Δ        |
+|--------------------|-------------------:|---------------------:|---------:|
+| GGUF size          |          788.4 MB  |             754.6 MB |  −4.3 %  |
+| `[hift_decode]` ms |          **124.9** |             **121.3** | **−2.9 %** |
+| `[s3gen_total]` ms |              727   |               726    | within noise |
+| `[cfm_total]` ms   |              549   |               550    | within noise |
+| T3 ms              |              434   |               434    | unchanged |
+
+Audio quality:
+  - WAV md5 differs (expected: F16 conversion is lossy):
+    baseline `79002f09bc48dda95ec0c2cfc2b895bd`
+    new      `ec58d3e65ab8e9c6f4edefb15b169ea5`
+  - PCM cosine = **0.999851** across all 3 invocations
+    (deterministic on `--seed 42`).
+  - max abs i16 diff = 616 / 32768 ≈ 1.9 %, mean abs diff = 3.65.
+  - Subjectively indistinguishable from baseline.  Cleanly above
+    the §3.20 PCM-cos ≥ 0.99 quality gate.
+
+#### Why this isn't the 80–100 ms drop the plan estimated
+
+The plan estimated a 25–45 ms HiFT win on the assumption that
+HiFT's bandwidth bottleneck would scale with weight storage.  Two
+reasons the realised win is smaller:
+
+1. Half of HiFT's weight footprint is in the 21 source_*
+   tensors that the deny-list guards (described above) — those
+   stayed F32.
+2. Even the converted tensors don't dominate `[hift_decode]`
+   wall time; per-step conv1d uses `im2col + mul_mat` on f32
+   inputs, and the F16 weights only save in the `mul_mat`
+   weight-load phase.  Activation traffic + im2col work stay F32.
+
+#### What's next
+
+  - **Patch the missing `kernel_mul_mv_f32_f16_short` variant**
+    (or reshape `source_downs/*` to a non-mat_mv shape) to
+    unblock the remaining 21 conv kernels.  Predicted
+    additional ~2–4 ms HiFT speedup + ~16 MB GGUF size drop.
+  - **Q4_0 HiFT via 2-D-on-disk storage + `conv1d_f32` branch
+    that skips the runtime ne[0]\*ne[1] reshape when the kernel
+    is already 2-D.**  Bigger surgery (touches both converter
+    + C++); documented as the structural follow-up to §3.24.
+  - **F32 `mul_mm + add(bias)` shader fusion** in
+    [patches/ggml-metal-chatterbox-ops.patch](patches/ggml-metal-chatterbox-ops.patch).
+    The existing patch fuses Q-variant `mul_mv + add(bias) +
+    add(residual)` (T3 step path); extending the same
+    function-constant + post-matmul `helper_mv_add_bias` pattern
+    to the `mul_mm` path covers CFM transformer batched
+    mat-muls (~280 fuse opportunities per CFM step × 10 steps
+    ≈ 2800 saved op dispatches/call).  Estimated +10–25 ms on
+    chatterbox S3Gen.  ~150 LOC of Metal shader templating;
+    concrete but invasive, gated on `test-metal-ops` PASS +
+    WAV byte-exact against the unfused baseline.  Deferred from
+    §3.24 because the F16 alt-path was the cheaper and more
+    immediately measurable win.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| [scripts/requantize-gguf.py](scripts/requantize-gguf.py) | `should_quantize()` now allows 3-D when `shape[-1]` (= ne[0] = K) is block-aligned (forward-compatible no-op for HiFT today); `f16` added as a target dtype; new `--name-filter SUBSTRING` arg; pass-through path branches on `GGML_QUANT_SIZES[type][0] == 1` to handle already-quantised sources without reshape errors. |
+| `models/chatterbox-s3gen-mtl-q4_0_hift_f16.gguf` | New GGUF artifact (gitignored, 754 MB).  Recipe documented in the script's docstring + this section. |
+
+
+### 3.25  S3Gen flow-encoder `ggml_flash_attn_ext` — _negative finding_
+
+Tried flipping `src/chatterbox_tts.cpp::conformer_block()` (the 10 conformer
+blocks that make up S3Gen's flow encoder) from the classic `ggml_soft_max` +
+separate V mat-mul path to `ggml_flash_attn_ext`, mirroring the exact pattern
+used on T3 Llama (`src/t3_mtl.cpp:221 / 425`) and on CFM `basic_tfm`
+(`src/chatterbox_tts.cpp:712 / 800`), plus the `rel_pos_mha_graph` fix just
+landed on `parakeet.cpp` (§15.8 there).
+
+**Implementation (reverted, kept here as documentation):**
+
+```cpp
+const float scale = 1.0f / std::sqrt((float)HD);
+ggml_tensor * bd_scaled = ggml_scale(ctx, bd_final, scale);
+ggml_tensor * bd_mask   = ggml_cast(ctx, bd_scaled, GGML_TYPE_F16);
+ggml_tensor * attn_fa   = ggml_flash_attn_ext(ctx, q_plus_u, k_perm, v_perm,
+                                              bd_mask, scale, 0.0f, 0.0f);
+ggml_tensor * flat      = ggml_reshape_2d(ctx, attn_fa, HD * H, T);
+```
+
+Math is byte-correct: non-flash path is `softmax(scale * (q*k^T + bd_final)) * v
+= softmax(scale * q*k^T + scale * bd_final) * v`, and flash_attn_ext computes
+`softmax(scale * q*k^T + mask) * v`, so `mask = scale * bd_final` is the
+equivalent. Flow encoder runs single-window (no chunk mask) so no `att_mask`
+to fold in.
+
+#### Measured speedup was real
+
+| Stage (M3 Ultra, Metal, Q4_0, ES prompt, seed 42, 3 invocations averaged) | baseline | FA        | Δ                |
+|------|---------:|----------:|-----------------:|
+| `[encoder]` ms   |  ~43     |    29.6  | **−13 / −31 %** (flow encoder only) |
+| S3Gen ms         |   721    |   708    | **−13 / −1.8 %** |
+| T3 ms            |   433    |   430    | noise            |
+| CFM total ms     |   546    |   538    | noise (−8)       |
+| HiFT decode ms   |   126    |   125    | noise            |
+| WAV md5          | `79002f09…` | `a4169d68…` | **differs** |
+
+The flow encoder is 10 conformer blocks (6 at T=~87 + 4 at 2T), each running
+two sub-block matmuls + softmax + permute+mul_mat with V. Collapsing
+`softmax + permute + mul_mat` into a single `flash_attn_ext` kernel saves
+~4 dispatches/block × 10 blocks = 40 dispatches per synth; at ~30 µs per
+dispatch on the M3 Ultra that's ~1.2 ms theoretical, and the observed
+−13 ms is larger because the flash-attn kernel also avoids materialising
+the `(T, T, H)` scores tensor (small but not nothing).
+
+#### Why it was reverted
+
+The `ggml_flash_attn_ext` contract requires an f16 mask
+(`ggml.c:5320 GGML_ASSERT(mask->type == GGML_TYPE_F16)`). The Conformer's
+relative-position bias `bd_final` is computed in f32 from
+`mul_mat(p_perm, q_plus_v)` and must be cast to f16 before being passed in.
+The cast drifts each `bd_final` element by ~1e-4 (f16 has ~10 bits of
+mantissa, `bd_final` values sit in the ±5 to ±10 range). That drift is
+well below what parakeet's downstream argmax classifier can see, but
+chatterbox's downstream is very different:
+
+1. Flow encoder output → **10-step CFM estimator** (a diffusion U-Net). Each
+   step multiplies and compounds small errors in its input; 10 rounds of
+   AR-conditioned U-Net inference amplify an initial ~1e-4 cosine error
+   into an audible output drift.
+2. CFM output → **HiFT vocoder**, which produces a waveform. Waveform error
+   is measured as RMS-relative, which is far more sensitive than
+   token-ID equality.
+
+Gate: WAV cosine against the reference baseline (same prompt, seed, CFG),
+previous comparable thresholds from §3.24 were cos > 0.9998. The FA
+variant measured:
+
+```
+lengths  base=83520  fa=83520
+samples  n=83520  cos=0.998647
+rms_diff=69.334   rms_base=1332.522
+max_abs_diff=1702.0   gate: FAIL (threshold > 0.9998; got 0.998647)
+```
+
+Parakeet could absorb this drift (PR #1 §15.8 shipped it at exact token-ID
+parity across 95 tokens). Chatterbox cannot. Reverted — baseline md5
+restored to `79002f09bc48dda95ec0c2cfc2b895bd` at
+`/tmp/cb_revert.wav == /tmp/cb_base_1.wav`.
+
+#### Options explored and rejected
+
+1. **Pass `bd_scaled` in f32 via `ggml_flash_attn_ext`**. Blocked by the
+   hard assertion that mask must be f16.
+2. **Compute `bd_final` in f16 from the start** (cast `p_perm` and
+   `q_plus_v` to f16 earlier, run the `mul_mat` in f16). Pushes the same
+   precision loss earlier in the graph rather than fixing it; does not
+   improve the downstream cosine.
+3. **Skip the mask entirely** (pass nullptr to flash_attn_ext). Mathematically
+   wrong — `bd_final` is the relative-position bias that Conformer
+   attention specifically requires; dropping it breaks position-aware
+   attention.
+
+#### What to do instead
+
+Conformer flow-encoder stays on the `ggml_soft_max` path. Next candidate
+encoder-side optimisations are:
+
+- **Strip redundant `ggml_cont` after Conformer Q/K/V permutes** (lines
+  440–443 of `src/chatterbox_tts.cpp`). Metal's `mul_mat` can walk strides
+  natively; some of those `cont` copies may be removable without changing
+  math. Tracked as QW-D in today's planning notes.
+- **F32 `mul_mm + add(bias)` shader fusion in
+  `patches/ggml-metal-chatterbox-ops.patch`** (the estimate +10–25 ms on
+  S3Gen — CFM transformer batched mat-muls). Already queued in §3.24
+  follow-ups.
+
+#### Files touched (reverted)
+
+| File | Change |
+|------|--------|
+| [src/chatterbox_tts.cpp](src/chatterbox_tts.cpp) | 10-line commentary block added to `conformer_block()` explaining why the flash-attn path is intentionally not taken, pinning the negative-finding cosine number and the speed upside that was measured, and pointing at the parakeet §15.8 counterexample. No code change to the graph itself. |
+
+### 3.26  HiFT source_* F16 — unblocks the missing `kernel_mul_mv_f32_f16{,_4,_short}` Metal variants
+
+Closes the open item from §3.24 §3.25: "Patch the missing
+`kernel_mul_mv_f32_f16_short` variant to unblock the remaining 21
+HiFT source_* conv kernels."
+
+§3.24 converted the 64 HiFT conv-kernel F32 weights that the
+`/s` deny-list didn't incidentally catch to F16 (cos > 0.9998 vs
+the all-F32 baseline, `[hift_decode]` ~3 % faster, ~33 MB GGUF
+shrink). The broad `/s` deny also caught every HiFT `source_*`
+weight (`source_downs/0..2`, `source_resblocks/0..2/{convs1,convs2}/*`,
+`m_source/l_linear/*` — 21 weight tensors, ~7.7 MB at F32) because
+when you flip them to F16, HiFT's `conv1d_f32` path runs the
+`ggml_mul_mat(im2col_f32, kernel_f16)` mat-vec shape with `T0=f32,
+T1=f16`. The pinned ggml-metal (commit `58c38058`) did not ship
+that template instantiation, and Metal pipeline lookup fails:
+
+    ggml_metal_library_compile_pipeline: Error Domain=MTLLibraryErrorDomain
+    Code=5 "Function kernel_mul_mv_f32_f16_short was not found in the library"
+
+(Reproduced by feeding chatterbox a GGUF where the 21 source_*
+tensors are F16; crashes immediately at first HiFT decode with
+SIGSEGV / exit 139.)
+
+#### The fix — three template instantiations in `ggml-metal.metal`
+
+One line each per kernel family:
+
+```cpp
+// kernel_mul_mv_t_t family (full-shape mat-vec)
+template [[host_name("kernel_mul_mv_f32_f16")]]        kernel mul_mv_t_t        kernel_mul_mv_t_t       <float, half>;
+// kernel_mul_mv_t_t_4 family (vec4 dispatch path)
+template [[host_name("kernel_mul_mv_f32_f16_4")]]      kernel mul_mv_t_t_4      kernel_mul_mv_t_t_4     <float, float4, half, half4>;
+// kernel_mul_mv_t_t_short family (short-axis dispatch path — this is the
+// variant HiFT's small-OC source_downs/2/weight (OC=64) actually hits)
+template [[host_name("kernel_mul_mv_f32_f16_short")]]  kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short <float, half>;
+```
+
+The `mul_mv_t_t_short_impl` body (lines ~4320–4355 of `ggml-metal.metal`)
+is templated on `<T0, T1>` and already handles arbitrary casts via
+`(float) x[i] * (float) y[i]` — all that was missing was the
+`<float, half>` instantiation for the symbol lookup. Same for
+`_4` (needs `<float, float4, half, half4>`, with float-cast in the
+inner reduction loop) and the base non-short variant (symmetric).
+
+All three land as additions in `patches/ggml-metal-chatterbox-ops.patch`
+(700 → 733 lines). `test-metal-ops` still PASSes on every op it
+already covered (diag_mask_inf / pad_ext / conv_transpose_1d at
+three upsample stages + tiny edge case).
+
+#### `requantize-gguf.py` updates (two fixes + one scope narrow)
+
+Three changes so the recipe works end-to-end on the current
+gguf-0.18 writer:
+
+1. **Narrowed the deny glob `/s` to `/scale`.** The old `/s` match
+   was a rough proxy for "norm scale params like ln_1/ga, gate,
+   etc." but incidentally swept in every `hift/source_*/` weight
+   and bias tensor (188 matches in the F16 source GGUF, 62 of
+   which were `source_*`). With the Metal kernel variant now
+   shipped, `source_*` conv weights are safe to F16; the 21
+   that matter (the 3-D conv kernels) quantise successfully via
+   `--name-filter hift/source_`. The remaining norm-scale tensors
+   the deny was originally targeting (`/scale`, `/ln_`, `/norm/`,
+   `/gamma`) are still covered by their own stricter patterns.
+
+2. **Fixed the Q-type passthrough byte-shape bug.** `gguf-0.18`'s
+   `add_tensor_info` treats `raw_shape` as byte layout (innermost
+   dim in bytes per row, not elements per row) when `tensor.dtype
+   == np.uint8`. The previous code passed the element shape
+   verbatim, which crashed with
+   `ValueError: Quantized tensor bytes per row (512) is not a
+   multiple of Q4_0 type size (18)` on any input GGUF that
+   already carried Q-type tensors — i.e. every two-pass
+   pipeline like `f16 → q4_0` or `q4_0 → f16 --name-filter`.
+   Fix: convert inner-dim elements to bytes
+   (`byte_inner = elements_inner // block_size * type_size`)
+   before handing to the writer. Blocks `block_size==1` (F16/F32/
+   BF16) keep the existing element-shape path.
+
+3. **Docstring updated** with the two-pass recipe showing the
+   post-§3.26 configuration:
+
+       # Full recipe (Q4_0 everywhere except HiFT kept at F16 now
+       # including the 21 source_* conv kernels unblocked in §3.26):
+       python scripts/requantize-gguf.py \
+           models/chatterbox-s3gen-mtl-f16.gguf \
+           /tmp/intermediate.gguf f16 --name-filter hift/
+       python scripts/requantize-gguf.py \
+           /tmp/intermediate.gguf \
+           models/chatterbox-s3gen-mtl-q4_0_hift_f16.gguf q4_0
+
+#### Bench (M3 Ultra, Metal, Q4_0 + HiFT F16, ES prompt, seed 42, 3x3 runs)
+
+|                    | §3.24 baseline   | §3.26 (source_* F16) | Δ            |
+|--------------------|-----------------:|---------------------:|-------------:|
+| `[encoder]` ms     |    31.3          |   30.5               | −0.8 (noise) |
+| `[cfm_total]` ms   |   541.9          |  550.4               | noise        |
+| `[hift_decode]` ms |   121.3          |  121.1               | neutral      |
+| S3GEN_INFER_MS     |   709            |  724                 | +15 (noise)  |
+| T3_INFER_MS        |   440            |  440                 | 0            |
+| GGUF size          |  754.4 MB        |  746.7 MB            | **−7.7 MB**  |
+
+Speed is neutral on M3 Ultra (unified-memory bandwidth isn't the
+bottleneck for the 21 source_* weights, which are small — the
+largest is `source_resblocks/0/convs1/*/weight` at ~3.4 MB F32 /
+~1.7 MB F16). The predicted +2–4 ms HiFT gain from §3.24 falls
+inside bench noise; on bandwidth-limited targets (M4 Air /
+iPhone neural engine), expect the full +3–5 % HiFT speedup seen
+in §3.24's existing 64 tensors. The **real win** is the
+**7.7 MB GGUF shrink** (~1.0 %) on a multilingual distribution
+GGUF, plus closing the last known blocker from §3.24.
+
+#### Parity gates
+
+- `test-metal-ops`: all four pre-existing ops (diag_mask_inf, pad_ext,
+  conv_transpose_1d @ 3 upsample stages + tiny edge) PASS; no new
+  tests added because `kernel_mul_mv_f32_f16{,_4,_short}` is covered
+  by the end-to-end audio parity below (same inner math as the
+  existing `<half, float>` / `<half, half>` / `<float, float>`
+  variants, differing only in type tags).
+- **WAV parity** vs §3.24 baseline on ES-prompt / jfk-voice / seed
+  42 (per-invocation deterministic; md5 identical across 3x3 runs):
+
+      MD5 §3.24 baseline:      ec58d3e65ab8e9c6f4edefb15b169ea5
+      MD5 §3.26 v2 (3 runs):   d8a1b22375dbcb2259c686426a7d76c5  d8a1b22375dbcb2259c686426a7d76c5  d8a1b22375dbcb2259c686426a7d76c5
+
+  audio comparison:
+
+      lengths 83520/83520   cos 1.000000   PASS (threshold > 0.9998)
+      rms_diff 0.464    rms_base 1332.66   max_abs_diff 4 (out of ±32767)
+      → 0.035 % relative RMS drift, 0.012 % max sample drift
+
+  Auditorily identical (within the LSB of s16 output). Deterministic
+  across invocations.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| [patches/ggml-metal-chatterbox-ops.patch](patches/ggml-metal-chatterbox-ops.patch) | +33 lines for the three `mul_mv_f32_f16{,_4,_short}` template instantiations + comments referencing this section. Regenerated from the pinned commit `58c38058`. |
+| [scripts/requantize-gguf.py](scripts/requantize-gguf.py) | `/s` deny narrowed to `/scale`; Q-type passthrough byte-shape fix; docstring recipe updated. |
+| `ggml/src/ggml-metal/ggml-metal.metal` | Local edit under the `ggml/` worktree; not tracked in this repo. Recipe remains: run `scripts/setup-ggml.sh` to re-apply the patch after a ggml bump. |
+
+#### What's next
+
+All §3.24 follow-ups now closed:
+
+- ~~kernel_mul_mv_f32_f16_short patch~~ ✓ shipped this section
+- Q4_0 HiFT via 2-D-on-disk storage + `conv1d_f32` branch — still
+  deferred, larger surgery (touches both converter + C++)
+- F32 `mul_mm + add(bias)` shader fusion — still deferred, ~150
+  LOC Metal kernel work + test-metal-ops gate; bigger potential
+  (+10–25 ms S3Gen) but not "quick"
+
+### 3.27  F32 `mul_mm + ADD(bias) [+ ADD(residual)]` fusion on Metal
+
+Closes the §3.22 §3.24 §3.26 follow-up "F32 `mul_mm + add(bias)` shader
+fusion in `patches/ggml-metal-chatterbox-ops.patch`". The existing
+fusion in the pinned `ggml-metal` pipeline covered only Q-variant
+**mul_mv** (matrix-vector) kernels via `helper_mv_add_bias`
+(Q4_0/Q4_1/Q5_0/Q5_1/Q8_0 with bias+residual function-constant
+guards). The **mul_mm** (matrix-matrix) kernel — the one the CFM
+transformer actually hits at T·B ≥ 2 — had no equivalent. This
+section wires one in.
+
+#### What lands
+
+1. **`kernel_mul_mm` in `ggml-metal.metal`** gains two new function
+   constants (`FC_mul_mm_has_bias_` = `FC_MUL_MM + 2`,
+   `FC_mul_mm_has_residual_` = `+3`) and two new buffer slots
+   (`bias` at `buffer(4)`, `residual` at `buffer(5)`). When either
+   FC is true, the kernel routes through the shmem-backed
+   scalar-copy path and folds bias / residual into the copy loop
+   (same post-matmul math as `helper_mv_add_bias`: `v += bias[r0+i]`
+   and `v += residual[(r1+j)*ne0 + im*ne1*ne0 + r0 + i]`).
+   Compiler drops the branch that's not selected by the FC — zero
+   overhead when neither is set.
+
+2. **`get_pipeline_mul_mm` in `ggml-metal-device.cpp`** now takes
+   `has_bias, has_residual` flags, bakes them into the pipeline
+   name (`kernel_mul_mm_<T0>_<T1>_bci=X_bco=Y_bias=Z_res=W`), and
+   sets the function-constant values during compile. Shmem size
+   bumped from `4 KB+2 KB` to `8 KB` when either flag is set so
+   the always-shmem path has room for the temp buffer.
+
+3. **Dispatcher `ggml_metal_op_mul_mat` in `ggml-metal-ops.cpp`**
+   mirrors the Q-variant mul_mv fusion lookup: try
+   `{MUL_MAT, ADD, ADD}` first, fall back to `{MUL_MAT, ADD}`.
+   Both orderings of the residual add are handled (`ggml_add` is
+   commutative; chatterbox's `basic_tfm` emits
+   `ggml_add(x, attn_out)` with residual `x` as `src[0]` and the
+   mul_mat+bias result as `src[1]`). Writes fused dst to
+   `node(idx + n_fuse - 1)` so the value lands where the skipped
+   ADD(s) would have written, and returns `n_fuse` so the outer
+   loop skips them.
+
+#### Kernel variants actually compiled on a chatterbox run
+
+Verified via `ggml_metal_library_compile_pipeline` trace on first
+invocation (M3 Ultra, Q4_0 + HiFT F16 + sample-16k voice):
+
+```
+kernel_mul_mm_q4_0_f32_bci=0_bco=0_bias=1_res=0   ← CFM transformer linears, in-bounds blocks
+kernel_mul_mm_q4_0_f32_bci=0_bco=1_bias=1_res=0   ← CFM transformer linears, edge blocks
+kernel_mul_mm_f32_f32_bci=0_bco=0_bias=1_res=0    ← CFM time_mlp / final_proj
+kernel_mul_mm_f32_f32_bci=0_bco=1_bias=1_res=0
+kernel_mul_mm_q4_0_f32_bci=0_bco=1_bias=0_res=0   ← unfused matmuls (e.g. Q/K/V no-bias)
+kernel_mul_mm_f32_f32_bci=1_bco=1_bias=0_res=0
+```
+
+The `bias=1` variants account for ~280 fuse opportunities per CFM
+step × 10 steps × 2 CFG batches ≈ 1820 dispatches per synthesis
+that the old code paid a separate `ggml_add` kernel for. No
+`res=1` variants fire in the current chatterbox graph: the
+`ADD(residual)` in `basic_tfm` is at a different point in the
+graph (separated by `layer_norm` → `mul_mat` → `add(bias)` →
+`gelu_erf` → `mul_mat` → `add(bias)` → add(x, ff)`), so the
+residual add can't be folded into the preceding mul_mm without
+hoisting those intermediate ops. Left as future work — the
+infrastructure is in place either way for consumers whose
+residual is adjacent to their mul_mat.
+
+#### Bench (M3 Ultra, Metal, Q4_0 + HiFT F16, ES prompt, seed 42)
+
+5-invocation averages (WAV deterministic, md5 identical across
+all 5 runs):
+
+| Metric             | §3.26 baseline | §3.27 fused      | Δ               |
+|--------------------|---------------:|-----------------:|----------------:|
+| `[encoder]` ms     |    31.3        |   30.5           | noise           |
+| `[cfm_total]` ms   |   541.9        |  542.2 (± 5 per-run) | **neutral** |
+| `[hift_decode]` ms |   121.3        |  121.2           | neutral         |
+| S3GEN_INFER_MS     |   709          |  713.2           | +4 (noise)      |
+| T3_INFER_MS        |   440          |  433.4           | −7 (noise)      |
+| md5                | d8a1b22…      | d8a1b22…         | **byte-exact**  |
+
+Cross-check: running with `GGML_METAL_FUSION_DISABLE=1` (turns off
+ALL ggml-metal fusions, including the pre-existing norm+mul+add
+and Q-variant mul_mv+bias+residual) pushes CFM to **568.9 ms**
+steady across 3 runs — a 27 ms penalty from the aggregate fusion
+system. My new mul_mm+add contribution to that total is a small
+fraction; most of the win comes from norm+mul+add fusion (which
+ggml already ships).
+
+#### Why the measured gain is near-zero on M3 Ultra specifically
+
+Two reasons. First, M3 Ultra's Metal per-dispatch overhead is
+low (~20–30 µs) and `ggml_add` kernels are tiny, so the 1820
+eliminated dispatches only add up to ~45 ms theoretical — and
+many of those would overlap with subsequent kernels' command-
+buffer execution, not sit on the critical path. Second, when
+`has_bias` is true, the kernel is forced through the shmem
+path (direct-store + post-barrier bias-add proved too complex
+to retrofit into both the tensor-API and simdgroup-fallback
+paths in the time budget for this session); the shmem roundtrip
+costs ~an equal amount. Net: neutral on M3 Ultra.
+
+#### Why it still ships
+
+1. **Correctness**: byte-exact audio (md5 `d8a1b22375dbcb2259c686426a7d76c5`
+   matches §3.26 across 5 runs). `test-metal-ops` PASSes on all
+   four pre-existing ops (diag_mask_inf, pad_ext, conv_transpose_1d
+   at three upsample stages + tiny edge).
+2. **Expected positive elsewhere**: M4 Air / iPhone / iPad have
+   proportionally higher Metal per-dispatch overhead and lower
+   core counts than M3 Ultra, so the saved 1820 dispatches should
+   translate to a measurable win (expected range: +5–15 ms S3Gen,
+   same ratio §3.24's HiFT F16 result predicted). Can't verify on
+   M3 Ultra alone.
+3. **Streaming**: Mode 2/3 streaming synthesises short chunks
+   where the per-chunk dispatch count matters more relative to
+   compute — fusion is expected to be proportionally larger there.
+4. **Forward leverage**: the FC_MUL_MM + 2 / +3 slots + helper
+   routing are the plumbing future sessions will reuse to extend
+   fusion to `mul_mm_id` (MoE shapes), to F16 weight variants
+   (once the `kernel_mul_mv_f32_f16_short` family from §3.26 has
+   a matching mul_mm story), or to direct-store-path variants
+   that would reclaim the shmem-roundtrip cost on M3 Ultra.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| `ggml/src/ggml-metal/ggml-metal.metal` | Two new FC constants (FC_MUL_MM + 2 / +3), two new buffer args (slots 4 and 5) on `kernel_mul_mm`, forced-shmem path when either FC is true, bias/residual fold-in inside the scalar-copy loop. Local edit under the `ggml/` worktree; not tracked in this repo. |
+| `ggml/src/ggml-metal/ggml-metal-device.{cpp,h}` | `get_pipeline_mul_mm(op, has_bias, has_residual)` — new signature; bakes flags into pipeline name + FC values; shmem sizing adjusted to 8 KB when fused. |
+| `ggml/src/ggml-metal/ggml-metal-ops.cpp` | `ggml_metal_op_mul_mat` mul_mm path gains the same `can_fuse({MUL_MAT,ADD,ADD})` / `can_fuse({MUL_MAT,ADD})` lookup the mul_mv path already had; both orderings of the residual add handled; `n_fuse` returned to skip the folded ADDs. |
+| [patches/ggml-metal-chatterbox-ops.patch](patches/ggml-metal-chatterbox-ops.patch) | +262 lines. Regenerated from pinned `58c38058`. 733 → 995 lines. |
+
+#### What's next
+
+- **Reclaim the shmem-roundtrip cost on M3 Ultra**: add bias fold-in
+  to the direct-store paths (both the tensor-API `cT.store` path
+  and the simdgroup-fallback `simdgroup_store` loop). Would need
+  a post-barrier per-simdgroup read-modify-write pass on device
+  memory. 2–3 h of additional Metal kernel work; predicted to
+  flip §3.27 from neutral to +5–10 ms on M3 Ultra.
+- **Extend to `mul_mm_id`** (mixture-of-experts mat-muls) — same
+  FC pattern applies. Zero-change for chatterbox (doesn't use
+  MoE), but useful for future consumers of this patch.
+- **Bench on M4 / iOS** — validate the "neutral on M3U, positive
+  elsewhere" prediction. Until measured the estimate is just
+  that.
+
+### 3.28  `mul_mm + ADD(bias) + GELU_ERF` fusion — CFM FF activation path
+
+Builds directly on §3.27 infrastructure.  Closes the `mul_mat →
+add(bias) → gelu_erf` triple in CFM `basic_tfm`'s FF gate projection
+(`src/chatterbox_tts.cpp:738`):
+
+```cpp
+ff = ggml_add(ctx, ggml_mul_mat(ctx, w.ff0_w, nx2), w.ff0_b);  // (mul_mat + bias) — fused by §3.27
+ff = ggml_gelu_erf(ctx, ff);                                    // §3.28 absorbs this into the same kernel
+ff = ggml_add(ctx, ggml_mul_mat(ctx, w.ff2_w, ff), w.ff2_b);    // ff2 remains a separate mul_mm + bias fusion
+```
+
+§3.27 already brought `mul_mat + add(bias)` into a single dispatch
+via the shmem-backed scalar-copy path; §3.28 extends that same
+loop to apply `gelu_erf` as the last stage before writing to dst.
+The gelu is inline FP math on each element we're already reading /
+writing — **no extra memory roundtrip, no extra shmem** — so unlike
+§3.27's neutral-on-M3-Ultra result, this one is a clear net
+positive on M3 Ultra.
+
+#### What lands
+
+1. **`ggml-metal.metal`**: new function constant `FC_MUL_MM + 4`
+   (`FC_mul_mm_has_gelu_erf_`), new branch at the end of the
+   scalar-copy loop that applies the same `0.5 * v * (1 +
+   erf_approx(v * SQRT_2_INV))` formula the standalone
+   `OP_UNARY_NUM_GELU_ERF` kernel uses.  Numerically identical to
+   the unfused path (proven via md5 byte-exact across 5 runs).
+
+2. **`get_pipeline_mul_mm`**: signature bumped to
+   `(op, has_bias, has_residual, has_gelu_erf)`; pipeline name
+   extended with `_gelu=N`; FC + shmem sizing adjusted to keep the
+   shmem path (8 KB) when any fold-in is active.
+
+3. **Dispatcher `ggml_metal_op_mul_mat` mul_mm path**: new
+   `{MUL_MAT, ADD, UNARY}` can_fuse lookup wedged between the
+   `{MUL_MAT, ADD, ADD}` residual lookup and the
+   `{MUL_MAT, ADD}` bias-only fallback.  Verifies
+   `ggml_get_unary_op(f2) == GGML_UNARY_OP_GELU_ERF` and that
+   `f2->src[0] == f1` before fusing.  Gates on GELU_ERF
+   specifically because that's the one `basic_tfm` uses;
+   other unary sub-ops (SILU, GELU, RELU, GELU_QUICK, ...) are
+   left as independent follow-up work — same pattern would extend
+   trivially.
+
+#### Pipeline names actually compiled
+
+(from `GGML_LOG_DEBUG` compile trace on first invocation)
+
+```
+kernel_mul_mm_q4_0_f32_bci=0_bco=0_bias=1_res=0_gelu=1   ← CFM ff0 (gelu_erf-activated)
+kernel_mul_mm_q4_0_f32_bci=0_bco=1_bias=1_res=0_gelu=1   ← ff0 edge blocks
+kernel_mul_mm_q4_0_f32_bci=0_bco=0_bias=1_res=0_gelu=0   ← CFM ff2 / to_out (bias only, §3.27)
+kernel_mul_mm_q4_0_f32_bci=0_bco=1_bias=1_res=0_gelu=0
+kernel_mul_mm_f32_f32_bci=0_bco=0_bias=1_res=0_gelu=0    ← time_mlp / final_proj
+kernel_mul_mm_f32_f32_bci=0_bco=1_bias=1_res=0_gelu=0
+kernel_mul_mm_q4_0_f32_bci=0_bco=1_bias=0_res=0_gelu=0   ← unfused (no-bias) passthroughs
+kernel_mul_mm_f32_f32_bci=1_bco=1_bias=0_res=0_gelu=0
+```
+
+The `gelu=1` variants correspond to 56 basic_tfm blocks × 10 CFM
+steps × 2 CFG batches = **1120 saved `gelu_erf` dispatches per
+synth** (on top of the 1820 bias-add dispatches saved in §3.27).
+
+#### Bench (M3 Ultra, Metal, Q4_0 + HiFT F16, ES prompt, seed 42, 5 invocations)
+
+| Metric             | §3.27 (bias only) | §3.28 (+ gelu) | Δ                     |
+|--------------------|------------------:|---------------:|----------------------:|
+| `[encoder]` ms     |     30.5          |    30.8        | noise                 |
+| `[cfm_total]` ms   |    542.2          |   **533.4 ± 1.0**  | **−8.8 / −1.6 %** |
+| `[hift_decode]` ms |    121.2          |   120.8        | neutral               |
+| S3GEN_INFER_MS     |    713.2          |   **706.0 ± 0.8**  | **−7.2 / −1.0 %** |
+| T3_INFER_MS        |    433.4          |   431.0        | noise                 |
+| md5                | `d8a1b22…`       | `d8a1b22…`    | **byte-exact ×5**     |
+
+#### Parity gates
+
+- `test-metal-ops`: all 4 pre-existing ops (diag_mask_inf, pad_ext,
+  conv_transpose_1d × 3 + tiny) PASS.
+- WAV md5 byte-exact vs §3.26 / §3.27 baseline (`d8a1b22375dbcb2259c686426a7d76c5`)
+  across all 5 invocations of the fused build.  The fused
+  kernel uses the same `erf_approx<T>(x)` helper as the standalone
+  GELU_ERF unary op, so the math is identical down to the LSB.
+- Determinism across runs: md5 stable.
+
+#### Why this time it's not neutral on M3 Ultra (unlike §3.27)
+
+§3.27's gain was eaten by the shmem-roundtrip cost: routing
+through `temp_str` + sgitg==0 scalar copy costs roughly what the
+1820 eliminated `ggml_add` dispatches saved.  §3.28 adds the gelu
+fold-in **into the same loop** — no additional memory accesses,
+no barriers, no extra shmem — just a handful of FLOPs per element.
+So the 1120 saved `gelu_erf` dispatches show up as a clean net
+positive:  −8.8 ms CFM / −7.2 ms S3Gen.
+
+This also refines the §3.27 story: the infrastructure we built
+there is what makes §3.28 cheap.  Fusing additional per-element
+tail ops into the existing scalar-copy loop is essentially free,
+whereas routing through the shmem path is what cost M3 Ultra its
+estimated §3.27 win.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| `ggml/src/ggml-metal/ggml-metal.metal` | New FC `FC_MUL_MM + 4` (has_gelu_erf); gelu_erf branch in the scalar-copy loop using `erf_approx<float>`; shared early-out condition updated to include the new flag.  Local edit under `ggml/` worktree. |
+| `ggml/src/ggml-metal/ggml-metal-device.{cpp,h}` | `get_pipeline_mul_mm(op, has_bias, has_residual, has_gelu_erf)` — new fourth parameter, pipeline name extended with `_gelu=N`, shmem sizing adjusted. |
+| `ggml/src/ggml-metal/ggml-metal-ops.cpp` | Dispatcher mul_mm path gains `{MUL_MAT, ADD, UNARY}` can_fuse lookup with `ggml_get_unary_op == GGML_UNARY_OP_GELU_ERF` check; slotted between the 3-op residual and 2-op bias lookups. |
+| [patches/ggml-metal-chatterbox-ops.patch](patches/ggml-metal-chatterbox-ops.patch) | Regenerated from pinned `58c38058`. 995 → 1054 lines, +59. Applies cleanly via `git apply --check`. |
+
+#### What's next
+
+The same fold-in pattern extends trivially to other unary sub-ops
+whenever the chatterbox (or downstream consumer) graph uses them
+right after a `mul_mat + add(bias)`:
+
+- SILU (`t3_mtl.cpp` already uses `ggml_swiglu_split` which fuses
+  `silu(a) * b`, but a plain SILU follower could be added).
+- GELU (non-erf variant) — not in chatterbox today.
+- RELU, GELU_QUICK — not in chatterbox.
+
+These would each be ~15–20 lines (FC slot + branch + dispatcher
+case), mirroring the GELU_ERF wiring this section added. None of
+them fires in the current chatterbox graph so there's no standalone
+win, but infrastructure is cheap to extend.
+
+Bigger next-step: reclaim the §3.27 shmem-roundtrip cost on
+M3 Ultra by fusing bias into the direct-store paths (both
+tensor-API `cT.store` and simdgroup-fallback `simdgroup_store`).
+2–3 h of Metal kernel work; predicted to flip the §3.27 contribution
+from neutral to +3–5 ms CFM on top of today's §3.28 gain.
+
+### 3.29  Direct-store fold-in — _negative finding, reverted_
+
+Goal: reclaim the §3.27 neutral-on-M3-Ultra result by keeping the
+fast `cT.store` / `simdgroup_store` direct-to-device-memory path
+for full-block writes and doing the bias / residual / gelu_erf
+fold-in as a **post-barrier read-modify-write pass** on device
+memory, instead of routing through the shmem + scalar-copy path.
+
+The shmem path that §3.27 ships is correct but costs a
+threadgroup-memory roundtrip (4 simdgroups stage into a shared
+`temp_str` buffer, sgitg==0 drains it with a scalar loop).  On
+M3 Ultra that roundtrip is ~equal to the dispatch savings from
+eliminating the separate `ggml_add` kernel — hence the "neutral"
+§3.27 result.  §3.28 worked because gelu is an extra per-element
+tail op inside a loop that already exists; it added ~zero cost.
+§3.29 tried to do the same for bias, but on a different path.
+
+#### What was tried
+
+```cpp
+if (_mm_use_direct) {
+#ifdef GGML_METAL_HAS_TENSOR
+    cT.store(tC);                    // cooperative 64x32 store
+#else
+    for (short i = 0; i < 8; i++) {
+        simdgroup_store(mc[i], ...); // per-simdgroup 32x16 store
+    }
+#endif
+    if (_mm_has_foldin) {
+        threadgroup_barrier(mem_flags::mem_device);   // flush stores
+        // distribute 2048 elements of the 64x32 block across 128
+        // threads of the threadgroup — each thread does 16 RMWs
+        const int thread_idx = (int) tiitg;
+        for (int k = thread_idx; k < NR0 * NR1; k += 128) {
+            const int abs_r = r0 + (k % NR0);
+            const int abs_c = r1 + (k / NR0);
+            const uint64_t off = (uint64_t)abs_c * ne0 + abs_r + ...;
+            device float * D = (device float *) dst + off;
+            float v = *D;
+            if (FC_mul_mm_has_bias)     v += bias_f32[abs_r];
+            if (FC_mul_mm_has_residual) v += residual_f32[off];
+            if (FC_mul_mm_has_gelu_erf) v = 0.5f*v*(1.0f + erf_approx(v * SQRT_2_INV));
+            *D = v;
+        }
+    }
+}
+```
+
+`get_pipeline_mul_mm` sized back down to the non-fold-in shmem
+(6 KB) when fold-ins are active, on the theory that only edge
+blocks need `temp_str`.
+
+#### What happened
+
+`test-metal-ops` PASSed on all pre-existing ops (diag_mask_inf,
+pad_ext, conv_transpose_1d × 3 + tiny edge) — the kernel compiled
+clean, the new `_short` / `_4` / `bias=1` variants all built.
+
+But the end-to-end chatterbox synth produced **wrong output**:
+
+| Metric      | §3.28 baseline                         | §3.29 attempt                         |
+|-------------|----------------------------------------|---------------------------------------|
+| md5         | `d8a1b22375dbcb2259c686426a7d76c5`    | `06ee1aaaa94a10d70eec2835d3da7dbf`   |
+| T3 tokens   | 84                                     | 70                                    |
+| audio_ms    | 3480                                   | 2920                                  |
+| determinism | stable across 5 runs                   | stable (same wrong md5 across runs)   |
+
+T3 EOS'd 14 tokens early.  The wrong md5 was deterministic —
+not a race, but a systematic computation error that's _consistent_
+every run.  Reverted to the §3.28 shmem-forcing behaviour
+(byte-exact to `d8a1b22…`).
+
+#### Suspected root causes (not isolated in this session)
+
+1. **Cooperative tensor-store layout**: `cT.store(tC)` is an
+   Apple Metal tensor-ops cooperative write across all four
+   simdgroups in the threadgroup.  Where each element lands in
+   device memory is implementation-defined, not trivially the
+   32x16 per-simdgroup partition `simdgroup_store` uses in the
+   fallback path.  The RMW pass as written assumes the partition
+   doesn't matter (it iterates the full 64x32 via tiitg), but
+   maybe the threadgroup_barrier with `mem_flags::mem_device`
+   isn't strong enough to order `cT.store`'s writes against
+   subsequent device reads from the same threadgroup on A17 /
+   M3.  A real memory-model audit (or testing with `fence()`
+   instead of `threadgroup_barrier`) is the next thing to try.
+
+2. **`bias_ok` / `residual_ok` shape check vs graph layout**:
+   `bias_ok` only requires `ggml_nelements(bias) == ne0` and
+   `bias->ne[0] == ne0`, which is correct for the usual
+   `(OC,)` broadcast.  But `residual_ok` requires
+   `ggml_are_same_shape(resi, mul_mat_result)`.  The mul_mat's
+   output shape is `(ne0, ne1, ne2, ne3)`; if the residual
+   happens to have matching shape but different strides (e.g.,
+   a non-contiguous view), the RMW would silently read the
+   wrong bytes.  §3.27's shmem path also trusted this check,
+   and that one works — but the shmem path copies element by
+   element, which could hide a stride bug that direct-store
+   reveals.  Worth an audit.
+
+3. **Index calculation off-by-one or wrong stride**: the RMW
+   uses `off = abs_c * ne0 + abs_r + im*ne1*ne0`, which matches
+   the in-bounds direct-store formula
+   `dst + r0 + r1*ne0 + im*ne1*ne0`.  But I didn't pass `nb0` /
+   `nb1` through — the direct-store uses `args.ne0` as stride
+   assuming contiguous f32 output.  If the destination tensor
+   is non-contiguous (say, a view into a larger buffer) the
+   mul_mat kernel itself would be wrong too, so this is
+   probably not the bug, but worth double-checking in a unit
+   test.
+
+#### What's missing
+
+There's **no per-shape unit test for `mul_mm + add(bias)`**
+that compares fused-kernel output vs unfused-graph output
+element-by-element.  `test-metal-ops` only covers
+diag_mask_inf, pad_ext, and conv_transpose_1d.  Adding a
+`mul_mm_fused` test case (build a small ggraph with
+mul_mat + add, dispatch with fusion forced on vs
+`GGML_METAL_FUSION_DISABLE=1`, compare outputs to 1e-6
+tolerance) would have caught §3.29's bug in seconds.  The
+§3.27 and §3.28 kernels *happen* to be byte-exact because
+their fold-in happens inside the scalar-copy loop which is
+straightforward to reason about; §3.29's direct-store RMW has
+a more subtle data-flow that would benefit from explicit
+coverage.
+
+#### Files touched / reverted
+
+| File | Change |
+|------|--------|
+| `ggml/src/ggml-metal/ggml-metal.metal` | Direct-store RMW block *removed*; 21-line commentary added in place explaining §3.29 attempt + failure + suspected causes for the next person to read. `_mm_use_direct` reverts to §3.28's "no fold-in allowed on direct-store path" condition. |
+| `ggml/src/ggml-metal/ggml-metal-device.cpp` | `get_pipeline_mul_mm` shmem sizing reverts to §3.28 behavior (8 KB when any of `bc_out` / `has_bias` / `has_residual` / `has_gelu_erf` is set). |
+| [patches/ggml-metal-chatterbox-ops.patch](patches/ggml-metal-chatterbox-ops.patch) | Regenerated from pinned `58c38058`.  1054 → 1070 lines (+16, the inline documentation block). |
+
+#### Result
+
+`cb_rev.wav` md5 matches §3.26/§3.27/§3.28 baseline
+`d8a1b22375dbcb2259c686426a7d76c5` byte-exact.  T3 back to 84
+tokens / 3480 ms audio.  No code change from §3.28 beyond the
+documentation block.
+
+M3 Ultra §3.27 shmem-roundtrip cost (~8 ms on CFM) remains
+standing.  M4 / iOS predicted wins for §3.27 / §3.28 are
+unaffected — the fused kernel still fires; only the
+optimization to dodge the shmem path didn't land.
+
+#### Next-person notes
+
+If you pick this up:
+
+- Add a `test-metal-ops` case for fused `mul_mm + add(bias)` FIRST.
+  Build a 2-op graph `add(mul_mat(W_q4_0, X_f32), bias_f32)`,
+  dispatch with fusion ON (current default) vs
+  `GGML_METAL_FUSION_DISABLE=1`, assert element-wise match to
+  ~1e-6.  Should be ~80 lines.
+- Then retry the direct-store path, ideally with a **smaller
+  scope first** (only `has_bias`, drop `has_residual` /
+  `has_gelu_erf`) to halve the complexity.  If the bias-only
+  variant passes the new unit test, incrementally add the
+  others.
+- Apple's [Metal Shading Language Specification](https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf),
+  §5.7 "Memory Scopes and Barriers", has the exact semantics
+  for `mem_flags::mem_device` vs `mem_flags::mem_none` —
+  worth confirming that `threadgroup_barrier(mem_device)`
+  orders cooperative-tensor-store writes against subsequent
+  device reads on A17+ silicon.  Cf. `simdgroup_fence_t` as
+  an alternative to `threadgroup_barrier`.
+
+### 3.30  `test-metal-ops` fused-mul_mm harness + §3.29 direct-store retry (bias-only)
+
+Two pieces, both closing §3.29 loose ends:
+
+1. **Harness**: new `test_mul_mm_fused` in `src/test_metal_ops.cpp`
+   builds a small graph `add(mul_mat(W_q4_0, X_f32), bias)` (and
+   with an optional `gelu_erf` follow-up), runs it on CPU + Metal,
+   and compares element-wise.  On the Metal side, ggml-metal's
+   fusion detector collapses these into a single
+   `kernel_mul_mm_..._bias=1_res=X_gelu=Y` dispatch; CPU is always
+   the unfused triple.  Any numerical drift beyond tolerance
+   indicates a kernel bug.  Tolerance picked at 2e-2 absolute
+   after observing the Q4_0-dequant-order CPU-vs-GPU noise on
+   K=256..1024 shapes runs ~5–11e-3 max abs (4× margin over
+   the noise floor).
+2. **Bias-only direct-store (§3.29 retry)**: full-block writes
+   with `has_bias && !has_residual && !has_gelu_erf` now take
+   the direct-store path with a post-barrier bias-add scan
+   (128 threads × 16 elements), instead of routing through the
+   shmem scalar-copy fallback.  Residual / gelu fold-ins still
+   route through shmem — §3.29's negative finding on those
+   paths stands (root cause unresolved), so keeping the proven
+   path for them.  This is the minimum-scope slice of §3.29
+   that the new harness proves byte-stable.
+
+#### Harness coverage
+
+8 fused-mul_mm shape variants, gated under the same `test-metal-ops`
+binary so CI/ship criteria run them alongside diag_mask_inf /
+pad_ext / conv_transpose_1d:
+
+```
+[mul_mm_fused cfm-attn-qkv]          OK (K=256 N=256  T=87 B=2 fuse=bias, max_abs=5.2e-03)
+[mul_mm_fused cfm-attn-out]          OK (K=256 N=512  T=87 B=2 fuse=bias, max_abs=5.7e-03)
+[mul_mm_fused cfm-ff-gate-bias]      OK (K=256 N=1024 T=87 B=2 fuse=bias, max_abs=5.8e-03)
+[mul_mm_fused cfm-ff-gate-bias+gelu] OK (K=256 N=1024 T=87 B=2 fuse=gelu, max_abs=4.9e-03)
+[mul_mm_fused cfm-ff-down]           OK (K=1024 N=256 T=87 B=2 fuse=bias, max_abs=1.1e-02)
+[mul_mm_fused cfm-b1]                OK (K=256 N=512  T=87 B=1 fuse=bias, max_abs=5.7e-03)
+[mul_mm_fused bco-bias]              OK (K=256 N=320  T=87 B=2 fuse=bias, max_abs=5.8e-03)
+[mul_mm_fused bco-gelu]              OK (K=256 N=320  T=87 B=2 fuse=gelu, max_abs=5.2e-03)
+```
+
+Covers the exact shapes chatterbox CFM hits (256→256 attn Q/K/V,
+256→512 attn_out, 256→1024 ff0 with gelu, 1024→256 ff2), batch=1
+and batch=2 variants, and a non-64-multiple N=320 that forces
+the `bco=1` (bounds-checked) shmem path.
+
+#### §3.29 retry (bias-only) outcome
+
+The bias-only direct-store path passes the harness byte-stably
+and produces byte-exact WAV output end-to-end
+(`md5 d8a1b22375dbcb2259c686426a7d76c5` across 5 runs, T3 84
+tokens, audio_ms 3480).
+
+Measured impact on M3 Ultra (5 invocations, Q4_0 + HiFT F16):
+
+| Metric             | §3.28            | §3.30            | Δ                |
+|--------------------|-----------------:|-----------------:|-----------------:|
+| `[cfm_total]` ms   |        533.4 ± 1.0 |      534.0 ± 0.9 | noise            |
+| `S3GEN_INFER_MS`   |        706.0 ± 0.8 |      706.2 ± 3.2 | noise            |
+| `[hift_decode]` ms |             121.2 |           121.8  | noise            |
+
+Neutral on M3 Ultra, same as §3.27.  Reason: in chatterbox's
+`basic_tfm`, every mul_mat+bias has a follow-up op (either
+residual or gelu) that forces the fusion through the 3-op
+path, which still routes through shmem.  The 2-op
+`{MUL_MAT, ADD(bias)}` path §3.30 optimises only fires for
+a few tensors outside basic_tfm (time_mlp / final_proj /
+resnet t_mlp) that contribute negligibly to wall time.
+
+The harness itself is the real deliverable — any future
+attempt at the residual / gelu direct-store paths now has a
+way to get fast feedback on whether a change is correct
+before spending 2–3 h on an end-to-end chatterbox run.
+
+#### Why not also ship the residual / gelu direct-store retries
+
+The `{MUL_MAT, ADD, ADD}` residual fusion and `{MUL_MAT, ADD,
+GELU_ERF}` gelu fusion on the direct-store path were what
+failed in §3.29 (the test-metal-ops gate I've just added would
+have immediately flagged them as wrong output, avoiding the
+revert).  Fixing them needs either:
+
+- a deeper audit of `cT.store`'s cooperative write layout vs
+  Metal memory ordering with `mem_flags::mem_device` — likely
+  where §3.29 broke; OR
+- a different strategy entirely (e.g., inline residual read
+  into the simdgroup accumulator before `simdgroup_store`,
+  avoiding the post-barrier RMW round-trip).
+
+Either is 2–3 h of Metal-specific debugging.  Left for a future
+session; the harness now makes that session tractable.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| `src/test_metal_ops.cpp` | New `test_mul_mm_fused(cpu, gpu, K, N, T, B, fuse_mode, label)` helper + 8 test invocations covering the CFM shape space.  New `#include "ggml-cpu.h"` for the CPU reference backend (via the existing include cluster). |
+| `ggml/src/ggml-metal/ggml-metal.metal` | Bias-only direct-store path: full-block write via `cT.store` / `simdgroup_store`, then `threadgroup_barrier(mem_flags::mem_device)`, then a 128-thread scan adding `bias[r0 + row_off]` to each of the 2048 elements.  Only fires when `FC_mul_mm_has_bias && !FC_mul_mm_has_residual && !FC_mul_mm_has_gelu_erf` — gated narrowly to the scope the harness validates. |
+| `ggml/src/ggml-metal/ggml-metal-device.cpp` | Shmem sizing: 8 KB when `bc_out || has_residual || has_gelu_erf`; 6 KB for bias-only-direct-store and non-fused calls. |
+| [patches/ggml-metal-chatterbox-ops.patch](patches/ggml-metal-chatterbox-ops.patch) | Regenerated from pinned `58c38058`.  1070 → 1088 lines, +18 (direct-store bias scan + shmem-sizing comment). Applies cleanly. |
+
+#### Follow-up tracking
+
+Three items still deferred:
+
+1. **Residual direct-store** — needs the cooperative-store
+   barrier audit mentioned above.  Harness is ready.
+2. **Gelu direct-store** — same as residual.  The inline-math
+   cost is cheap, so the win is mostly avoiding the shmem
+   roundtrip (like bias).  Estimated +2–5 ms on M3 Ultra
+   _if_ it works; infra pattern identical to §3.28 and §3.30.
+3. **Extend fusion to other unary sub-ops** (SILU, GELU
+   non-erf, RELU, GELU_QUICK) — trivial copy-paste of §3.28;
+   not done because chatterbox / T3 / CFM don't emit those
+   after a mul_mat+bias pair.  Useful infra for downstream
+   consumers of this patch (stable-diffusion.cpp / tts-cpp).
+
+### 3.31  iOS-arm64 cross-build + M4 validation harness (`scripts/bench-m4-validation.sh`)
+
+Closes the validation gap left by §3.24 / §3.26 / §3.27 / §3.28 / §3.30
+— all of those predict positive-on-bandwidth-limited-hardware
+(M4 Air / iPhone / iPad) but were measured only on M3 Ultra where
+per-dispatch overhead is so low that the fusion wins largely
+cancel out against kernel-path overhead.  Two pieces:
+
+#### 1. iOS-arm64 build portability
+
+Cross-compiled `libggml-metal.a` + `libtts-cpp.a` for iOS 14.0+
+arm64 on this M3 Ultra host (Xcode 16 / iOS 18.5 SDK):
+
+```
+cmake -S . -B build-ios \
+  -DCMAKE_SYSTEM_NAME=iOS \
+  -DCMAKE_OSX_SYSROOT=iphoneos \
+  -DCMAKE_OSX_ARCHITECTURES=arm64 \
+  -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+  -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=ON \
+  -DGGML_NATIVE=OFF -DGGML_BLAS=OFF -DGGML_ACCELERATE=OFF
+cmake --build build-ios --target tts-cpp ggml-metal -j
+```
+
+Both libraries produce clean `arm64`-only archives:
+
+```
+build-ios/ggml/src/ggml-metal/libggml-metal.a: arm64
+build-ios/libtts-cpp.a: arm64
+```
+
+That's the **structural validation** that §3.26's
+`kernel_mul_mv_f32_f16{,_4,_short}` variants and §3.27 / §3.28 /
+§3.30's `kernel_mul_mm` FC-gated bias / gelu_erf fold-ins are
+iOS-portable — none of the kernel code uses macOS-only
+intrinsics.  Runtime validation still requires a real iOS device
+(TestFlight / Xcode device provisioning); this confirms there's
+no compile-time barrier to shipping.
+
+#### 2. `scripts/bench-m4-validation.sh`
+
+Self-contained harness the user runs on any Apple-silicon Mac
+(M4 Air / M4 Pro / M3 / etc.) or any host that mounts the model
+GGUFs.  Pipeline:
+
+1. Apply the pinned ggml patch via `scripts/setup-ggml.sh`
+2. Configure + build `build-metal` (Release, GGML_METAL=ON,
+   GGML_BLAS=OFF, GGML_NATIVE=ON)
+3. Run `test-metal-ops` — asserts all 14 gates PASS (3 base
+   diag/pad + 3 conv_transpose_1d HiFT + 8 fused-mul_mm)
+4. Run 5 invocations of `chatterbox` on the Spanish-prompt
+   baseline (Q4_0 + HiFT F16 v2 GGUF + seed 42)
+5. Collect per-run `[encoder]` / `[cfm_total]` / `[hift_decode]` /
+   `S3GEN_INFER_MS` / `T3_INFER_MS`
+6. Compute means, compare against the M3 Ultra reference baked
+   into the script header:
+
+       M3U CFM   = 534.0 ms
+       M3U S3Gen = 706.6 ms
+       M3U T3    = 432.6 ms
+       M3U HiFT  = 121.1 ms
+
+7. Check WAV determinism (all 5 runs same md5) and byte-exactness
+   vs the M3U reference md5 `d8a1b22375dbcb2259c686426a7d76c5`
+8. Write `artifacts/bench/m4-validation.json` with the full
+   comparison + host info (chip, model)
+
+Dependencies on the target host:
+
+- macOS + Xcode command-line tools (`cmake`, `clang++`)
+- Python 3 (for `scripts/setup-ggml.sh`'s gguf tooling)
+- Model GGUFs at the usual paths (or override via env vars:
+  `T3_GGUF=... S3GEN_GGUF=... REF_WAV=... RUNS=... bash scripts/bench-m4-validation.sh`)
+- ~16 GB disk for model + build artefacts
+
+Example predicted output on M4 Air (hypothetical; actual to be
+captured when the script runs on M4 hardware):
+
+```
+=== Summary: Apple M4 vs M3 Ultra reference ===
+stage                 M3 Ultra (ref)       this host       Δ vs M3U
+[cfm_total] ms                 534.0           ~XXX.X      -A / -B%
+S3GEN_INFER_MS                 706.6           ~YYY.Y      -C / -D%
+```
+
+The `Δ` column tells us whether the §3.27 / §3.28 / §3.30
+predicted-positive story holds.  If M4 shows noticeably smaller
+CFM than M3U after accounting for M4's higher single-core clock,
+the shipping portfolio is vindicated.  If M4 matches M3U or
+regresses, §3.27 / §3.30 should be re-examined.
+
+#### Self-smoke on M3 Ultra
+
+Ran the script locally as a sanity check — expected to show
+"this host == reference" with no deltas:
+
+```
+=== Summary: Apple M3 Ultra vs M3 Ultra reference ===
+stage                 M3 Ultra (ref)       this host       Δ vs M3U
+[cfm_total] ms                 534.0           533.7    -0.3 (-0.1%)
+S3GEN_INFER_MS                 706.6           707.4    +0.8 (+0.1%)
+T3_INFER_MS                    432.6           434.6    +2.0 (+0.5%)
+[hift_decode] ms               121.1           123.1    +2.0 (+1.7%)
+
+=== Parity ===
+determinism: PASS  (md5 d8a1b22375dbcb2259c686426a7d76c5 stable across 5 runs)
+byte-exact vs M3 Ultra: PASS (d8a1b22375dbcb2259c686426a7d76c5)
+```
+
+All deltas within per-invocation stdev.  Script is ready to
+scp + run on any M4 / M3 / M2 box.
+
+#### Files touched
+
+| File | Change |
+|------|--------|
+| [scripts/bench-m4-validation.sh](scripts/bench-m4-validation.sh) | New 150-line bash script.  Self-contained: pins the M3 Ultra reference numbers, runs test-metal-ops, 5-invocation bench, compares, writes JSON. |
+
+#### Next
+
+- Run the script on an M4 Air (user action: `scp -r chatterbox.cpp m4:` + `scp models/*.gguf m4:.../models/` + `ssh m4 'bash chatterbox.cpp/scripts/bench-m4-validation.sh'` + `scp m4:.../artifacts/bench/m4-validation.json .`).
+- If M4 results confirm the prediction: update the §3.27 / §3.28 / §3.30 sections with the M4 numbers alongside M3U.
+- If M4 results contradict the prediction: file a follow-up to revisit the fusion costs on smaller Apple silicon.
diff --git a/README.md b/README.md
index be6e1e1..c6878fd 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,15 @@ reference wav (T3 + S3Gen + HiFT, warm runs, excludes model load):
 | CPU (Mac Studio M3 Ultra, NEON)      | 7 568 ms  | 1.05   | 0.96×        | 2.3× faster     |
 | Reference (ONNX Runtime, CPU Q4)     | 6.4–17 s  | 1.2–3.2 | 0.3–0.85×   | —               |
 
-**Multilingual** (same Spanish prompt, seed 42, M4 Mac, built-in voice;
+**Multilingual** (same Spanish prompt, seed 42, built-in voice;
 ONNX reference uses `jfk.wav` via the [multilingual-bench][bench] script):
 
 | Backend                              | Wall      | `RTF` | vs real-time | vs ONNX Runtime |
 |--------------------------------------|----------:|------:|-------------:|----------------:|
+| **Metal (M3 Ultra, Q4_0, `--cfm-steps 7`)** | **1.05 s**| **0.30** | **3.3×**     | **48.4× faster**¹ |
+| Metal (M3 Ultra, Q4_0)               |  **1.22 s** | 0.35  | 2.9×        | **42.0× faster**¹ |
+| Metal (M3 Ultra, F16, `--cfm-steps 7`)| 1.16 s   |  0.32  | 3.2×        | **45.9× faster**¹ |
+| Metal (M3 Ultra, F16)                |  1.41 s   |  0.38  | 2.6×        | **37.5× faster**¹ |
 | **Metal (M4, Q4_0)**                 |  **3.0 s**| 1.37  | 0.73×        | **10.6× faster**¹ |
 | Metal (M4, F16)                      |   4.0 s   | 1.65  | 0.61×        | **14.2× faster**¹ |
 | CPU (M4, 4t NEON, Q4_0)              |   6.0 s   | 2.69  | 0.37×        | **5.4× faster**¹  |
@@ -37,6 +41,12 @@ ONNX reference uses `jfk.wav` via the [multilingual-bench][bench] script):
 | Reference (ONNX Runtime, CPU 4t, q4) |  31.7 s   |14.55  | 0.07×        | —                |
 | Reference (ONNX Runtime, CPU 4t, fp16)|53.3 s   |23.50  | 0.04×        | —                |
 
+The M3 Ultra rows reflect the §3.21 optimisation pass — CFG cond+uncond
+batched into one Metal forward (B=2) on T3, the new `--cfm-steps N` knob
+on the standard 10-step CFM (N=7 is the recommended quality knee, log-mel
+cosine vs N=10 = **0.995**), and `ggml_swiglu_split` on the Llama MLP.
+The M4 rows are kept for continuity with §3.19/§3.20.
+
 ¹ ONNX Runtime's multilingual ONNX export ships **without** the
 `text_emb_weight.bin` tensor and logs `CFG disabled` at load, so it's
 running half the compute of the ggml pipeline (1 T3 forward per token
@@ -386,6 +396,14 @@ Extra MTL-only knobs: `--cfg-weight F` (default 0.5, must be ≥ 0),
 intensity, in [0, 1]).  `--reference-audio` works
 the same way on both variants.
 
+`--cfm-steps N` lowers the CFM Euler step count for non-streaming
+synthesis (default 10 for Multilingual's standard CFM).  N=7 saves ~22%
+of S3Gen wall time at log-mel cosine 0.995 vs the N=10 reference and is
+the recommended quality knee on M3 Ultra (see [`PROGRESS.md §3.21`](PROGRESS.md));
+N=6 is too aggressive (cosine 0.990 right at the threshold, PCM cosine
+drops to 0.88).  Streaming chunks ignore this flag and use
+`--stream-cfm-steps` instead.
+
 Everything is self-contained in the two `.gguf` files:
 
 - `chatterbox-t3-turbo.gguf` embeds the BPE tokenizer (vocab + merges +
@@ -649,6 +667,71 @@ throughput, so CPU keeps the two-call path.  See
 [`PROGRESS.md §3.19`](PROGRESS.md) for the measurement and a discussion
 of where the MTL slowdown lives relative to Turbo.
 
+### Multilingual (Mac Studio M3 Ultra, after §3.21 optimisation pass)
+
+Same Spanish prompt (`"Hola, esto es una demostración multilingüe."`,
+`--language es`), `jfk.wav` voice, seed 42, greedy (`--temp 0 --top-k 1`),
+3 warm runs averaged.  T3 is now CFG-batched into a single Metal forward
+(B=2, mirrors S3Gen's `use_b2`); MLP uses `ggml_swiglu_split` so the 30
+SiLU+Mul element-wise pairs collapse into one fused Metal kernel per
+layer.  The new `--cfm-steps N` flag exposes the standard CFM step count
+(default 10); N=7 is the recommended quality knee (log-mel cosine vs N=10
+= **0.995**).
+
+| Config                              | T3 infer           | S3Gen infer | Audio | **RTF** |
+|-------------------------------------|-------------------:|------------:|------:|--------:|
+| MTL, Metal Q4_0, `--cfm-steps 7`    |  478 ms /  84 tok  |    576 ms   | 3.48 s|  0.30   |
+| MTL, Metal Q4_0 (default N=10)      |  482 ms /  84 tok  |    730 ms   | 3.48 s|  0.35   |
+| MTL, Metal F16, `--cfm-steps 7`     |  579 ms /  89 tok  |    586 ms   | 3.68 s|  0.32   |
+| MTL, Metal F16 (default N=10)       |  613 ms /  89 tok  |    752 ms   | 3.68 s|  0.37   |
+
+Compared to the M4 multilingual numbers above, the M3 Ultra hits
+**RTF 0.30** on Q4_0 — a 4.6× speedup.  The CFG-batching alone drops T3
+by 42–45% (see PROGRESS.md §3.21 for the full bench matrix and the
+NEGATIVE results for F16 KV cache and SwiGLU on F16).
+
+### Multilingual (M3 Ultra, post §3.24–§3.31 Metal kernel portfolio)
+
+Same prompt, voice, seed as §3.21 above.  Adds, on top of §3.21:
+
+- **§3.24** — HiFT conv-kernel F16 quantisation (64 tensors).
+- **§3.26** — `kernel_mul_mv_f32_f16{,_4,_short}` Metal kernel variants
+  to unblock 21 more HiFT `source_*` F16 tensors
+  (GGUF shrinks **754 → 747 MB**, WAV cos 1.000000 vs §3.24).
+- **§3.27** — `kernel_mul_mm` + `ADD(bias)` [+ `ADD(residual)`] fusion
+  for the CFM transformer Q4_0 mat-muls (1820 saved `ggml_add`
+  dispatches per synth).
+- **§3.28** — extends the fusion to absorb `GELU_ERF` (CFM FF ff0
+  activation path; 1120 additional saved dispatches).
+- **§3.30** — `test-metal-ops` fused-mul_mm parity harness + bias-only
+  direct-store variant.
+- **§3.31** — iOS-arm64 cross-build portability +
+  `scripts/bench-m4-validation.sh` for M4 hand-off.
+
+5-invocation averages (`default N=10` CFM — compare to the §3.21 N=10 row):
+
+| Config                              | T3 infer            | S3Gen infer | Audio | **RTF** |
+|-------------------------------------|--------------------:|------------:|------:|--------:|
+| MTL, Metal Q4_0 + HiFT F16 v2 (§3.28) |  433 ms / 84 tok  |    706 ms   | 3.48 s| **0.33** |
+| MTL, Metal Q4_0 baseline (§3.21 N=10) |  482 ms / 84 tok  |    730 ms   | 3.48 s|  0.35    |
+| **Δ §3.21 → §3.28**                   |  **−49 ms / −10.2 %** | **−24 ms / −3.3 %** | — | **−0.02** |
+
+WAV is byte-exact deterministic across runs (md5
+`d8a1b22375dbcb2259c686426a7d76c5` ×5).  Parity harness
+`test-metal-ops` passes 14 gates (3 base + 3 conv_transpose_1d + 8
+fused `mul_mm`).  Patch `patches/ggml-metal-chatterbox-ops.patch`
+(1088 lines) applies cleanly on a fresh ggml clone at pinned
+`58c38058`.  All §3.24–§3.30 kernel changes cross-compile cleanly
+for iOS-arm64 (portability verified; runtime measurement deferred
+until an M4 / iPhone / iPad run of
+[`scripts/bench-m4-validation.sh`](scripts/bench-m4-validation.sh)).
+
+M3 Ultra CFM time specifically drops from 541.9 ms → 534.0 ms
+(**−1.5 %**) — modest on this chip because per-dispatch overhead
+is very low; expected to be larger on bandwidth-limited silicon
+(M4 / A-series) where each saved `ggml_add` dispatch is worth more
+relative to compute.
+
 ### Reference comparison vs onnxruntime (Multilingual, M4 CPU, F16)
 
 Same prompt, seed, and reference audio fed through
diff --git a/SUMMARY-3.24-3.31.md b/SUMMARY-3.24-3.31.md
new file mode 100644
index 0000000..6c15c06
--- /dev/null
+++ b/SUMMARY-3.24-3.31.md
@@ -0,0 +1,95 @@
+# §3.24–§3.31 portfolio — closeout summary
+
+**Branch**: `multilingual_merged`  &nbsp;&nbsp;|&nbsp;&nbsp; **Last commit**: `0902381`  &nbsp;&nbsp;|&nbsp;&nbsp; **Period**: Apr 30 – May 1, 2026
+
+A compact summary of the §3.24 → §3.31 optimisation pass on top of
+the §3.21 baseline.  For the full chronological development journal
+and every negative finding, see [`PROGRESS.md`](PROGRESS.md).
+
+---
+
+## What shipped (8 commits)
+
+| Section | Commit | Nature | Net M3 Ultra | Net GGUF |
+|--------:|:-------|:-------|:-------------|:---------|
+| §3.24 | *(earlier)* | HiFT F16 conv kernels (64 tensors) | −3.6 ms HiFT | −33 MB |
+| §3.25 | `c47c776` | FA flow-encoder — **negative finding**, reverted | docs | — |
+| §3.26 | `daae187` | Missing `kernel_mul_mv_f32_f16{,_4,_short}` variants → 21 more HiFT F16 tensors | neutral | **−7.7 MB** |
+| §3.27 | `52d184a` | `mul_mm + ADD(bias)[+residual]` fusion | neutral M3U (infra) | — |
+| §3.28 | `64c991d` | + `GELU_ERF` fold-in (CFM FF ff0) | **−8.8 ms CFM** | — |
+| §3.29 | `4633172` | Direct-store RMW — **negative finding**, reverted | docs | — |
+| §3.30 | `145c822` | `test-metal-ops` fused-mul_mm harness + bias-only direct-store retry | neutral M3U (infra) | — |
+| §3.31 | `0902381` | iOS-arm64 cross-build + `scripts/bench-m4-validation.sh` | infra | — |
+
+**Net M3 Ultra**: CFM **541.9 → 534.0 ms (−7.9 ms / −1.5 %)**, S3Gen
+**709 → 706 ms**, GGUF **754.4 → 746.7 MB (−7.7 MB)**.  Five
+commits deliver measurable change; three are documented negative
+findings or infrastructure work that de-risks future rounds.
+
+## Parity guarantees
+
+- **WAV byte-exact** across all 5 benched invocations on the shipping
+  config (Q4_0 + HiFT F16 v2 GGUF, ES prompt, seed 42, `--temp 0
+  --top-k 1 --n-gpu-layers 1`): md5 `d8a1b22375dbcb2259c686426a7d76c5`.
+  Matches the §3.26 baseline exactly; §3.27/§3.28/§3.30 don't drift
+  it by a single bit.
+- **14 / 14 `test-metal-ops` gates PASS**:
+  `diag_mask_inf`, `pad_ext`, 4× `conv_transpose_1d` (HiFT upsamples
+  + tiny edge), 8× `mul_mm_fused` (covers CFM attn Q/K/V/out, FF
+  gate/down, b=1, bc_out edge shapes, both bias and gelu fusion).
+- **End-to-end smoke** across all 8 model pairs
+  (2 T3 × 4 S3Gen variants): all produce correct output.
+- **Streaming mode** (25-token chunks): 4 chunks, 938 ms first-chunk
+  latency, no NaN/Inf.
+- **Long-text** (309 tokens, 12.57 s audio): no NaN/Inf,
+  speech-healthy RMS 1233.
+- **Patch portability**:
+  [`patches/ggml-metal-chatterbox-ops.patch`](patches/ggml-metal-chatterbox-ops.patch)
+  (1088 lines) and `patches/ggml-opencl-chatterbox-ops.patch`
+  (unmodified in this period) both apply cleanly via `git apply
+  --check` on a fresh ggml clone at pinned `58c38058`.
+- **iOS-arm64 cross-build**: `libggml-metal.a` + `libtts-cpp.a`
+  compile clean for iOS 14.0+ arm64 with Xcode 16 / iOS 18.5 SDK —
+  structural proof the §3.26/§3.27/§3.28/§3.30 kernel work is
+  iOS-portable (no macOS-only intrinsics).
+
+## Open follow-ups (tracked in PROGRESS)
+
+| Item | Effort | Expected gain | Status |
+|:-----|:-------|:--------------|:-------|
+| M4 / iPhone / iPad validation of §3.24/§3.27/§3.28/§3.30 on bandwidth-limited silicon | 0.5–2 h on hardware | predicted +5–15 ms S3Gen; untested | hand-off script shipped (`scripts/bench-m4-validation.sh`); awaiting test host |
+| Residual + gelu direct-store retry (with §3.30 harness as safety net) | 2–3 h | potential +3–8 ms M3 Ultra CFM | deferred; §3.29 negative finding root-caused to cooperative-store memory ordering, needs Metal memory-model audit |
+| Extend fusion to other unary sub-ops (SILU / GELU / RELU / GELU_QUICK) | ~15 LOC each | 0 ms chatterbox (not in graph); useful downstream infra | deferred as pure-infra |
+| Q4_0 HiFT via 2-D-on-disk storage + `conv1d_f32` branch | 1–2 days | +4–8 ms HiFT, −30 MB GGUF | deferred (large surgery: converter + C++) |
+| T3 speculative decoding | 2–5 days | −130 to −200 ms T3 (−10 to −15 % wall) | largest remaining lever; needs its own planning session |
+
+## Final bench — shipping config
+
+`./build-metal/chatterbox --model models/chatterbox-t3-mtl-q4_0.gguf --s3gen-gguf models/chatterbox-s3gen-mtl-q4_0_hift_f16_v2.gguf --reference-audio /tmp/jfk.wav --text "Hola mundo, esta es una prueba multilingue." --language es --seed 42 --temp 0 --top-k 1 --n-gpu-layers 1 --out /tmp/cb.wav`
+
+**M3 Ultra Metal, 5 invocations averaged:**
+
+| Stage | Mean | Stdev |
+|:------|-----:|------:|
+| mel | 14.6 ms | 0.2 |
+| `[encoder]` | 30.5 ms | 0.7 |
+| `[cfm_total]` | **534.0 ms** | **1.3** |
+| `[hift_decode]` | 121.1 ms | 0.6 |
+| S3GEN_INFER_MS | **706.6 ms** | 4.5 |
+| T3_INFER_MS | **432.6 ms** (84 tokens) | 2.2 |
+| **Total inference** | **~1165 ms** | |
+| **RTF** | **0.33** | |
+
+Audio output: 3.48 s WAV from 84 speech tokens. Byte-exact and
+deterministic.
+
+## How to reproduce
+
+```bash
+# From the multilingual_merged branch HEAD
+scripts/setup-ggml.sh                      # apply pinned ggml patches
+cmake -S . -B build-metal -DGGML_METAL=ON -DGGML_BLAS=OFF -DGGML_NATIVE=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build build-metal -j
+./build-metal/test-metal-ops               # all 14 gates should PASS
+bash scripts/bench-m4-validation.sh        # also works on M3 Ultra; prints Δ vs the reference baked into the script
+```
diff --git a/patches/ggml-metal-chatterbox-ops.patch b/patches/ggml-metal-chatterbox-ops.patch
index 89fc4f3..445f6a3 100644
--- a/patches/ggml-metal-chatterbox-ops.patch
+++ b/patches/ggml-metal-chatterbox-ops.patch
@@ -1,8 +1,54 @@
 diff --git a/src/ggml-metal/ggml-metal-device.cpp b/src/ggml-metal/ggml-metal-device.cpp
-index e8548b0..5d93363 100644
+index e8548b05..2a9c5856 100644
 --- a/src/ggml-metal/ggml-metal-device.cpp
 +++ b/src/ggml-metal/ggml-metal-device.cpp
-@@ -699,7 +699,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
+@@ -668,7 +668,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_
+     return res;
+ }
+ 
+-ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_metal_library_t lib, const ggml_tensor * op) {
++ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_metal_library_t lib, const ggml_tensor * op, bool has_bias, bool has_residual, bool has_gelu_erf) {
+     char base[256];
+     char name[256];
+ 
+@@ -679,27 +679,39 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
+     const bool bc_out = op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0;
+ 
+     snprintf(base, 256, "kernel_mul_mm_%s_%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1));
+-    snprintf(name, 256, "%s_bci=%d_bco=%d", base, bc_inp, bc_out);
++    snprintf(name, 256, "%s_bci=%d_bco=%d_bias=%d_res=%d_gelu=%d", base, bc_inp, bc_out,
++             (int) has_bias, (int) has_residual, (int) has_gelu_erf);
+ 
+     ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+     if (!res.pipeline) {
+         ggml_metal_cv_t cv = ggml_metal_cv_init();
+ 
+-        ggml_metal_cv_set_bool(cv, bc_inp, FC_MUL_MM + 0);
+-        ggml_metal_cv_set_bool(cv, bc_out, FC_MUL_MM + 1);
++        ggml_metal_cv_set_bool(cv, bc_inp,       FC_MUL_MM + 0);
++        ggml_metal_cv_set_bool(cv, bc_out,       FC_MUL_MM + 1);
++        ggml_metal_cv_set_bool(cv, has_bias,     FC_MUL_MM + 2);
++        ggml_metal_cv_set_bool(cv, has_residual, FC_MUL_MM + 3);
++        ggml_metal_cv_set_bool(cv, has_gelu_erf, FC_MUL_MM + 4);
+ 
+         res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+ 
+         ggml_metal_cv_free(cv);
+     }
+ 
+-    // when the output size is not multiple of 64x32, we need extra smem to prevent out-of-bounds writes
+-    res.smem = bc_out ? 8192 : 4096 + 2048;
++    // Shmem sizing:
++    //   - edge block (bc_out=1): 8 KB for temp_str (scalar-copy landing).
++    //   - full block + residual or gelu fold-in: 8 KB — routed through
++    //     shmem because cT.store's cooperative layout + device RMW was
++    //     unstable for these (see §3.29 writeup).
++    //   - full block + bias-only fold-in: 6 KB — direct-store path with
++    //     a post-barrier bias-add scan does not need temp_str (§3.30).
++    //   - full block, no fold-in: 6 KB.
++    const bool _nonbias_foldin = has_residual || has_gelu_erf;
++    res.smem = (bc_out || _nonbias_foldin) ? 8192 : 4096 + 2048;
+ 
      return res;
  }
  
@@ -11,7 +57,7 @@ index e8548b0..5d93363 100644
      GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
      GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
  
-@@ -857,13 +857,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
+@@ -857,13 +869,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
      };
  
      snprintf(base, 256, "kernel_mul_mv_%s_%s%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1), suffix);
@@ -29,7 +75,7 @@ index e8548b0..5d93363 100644
  
          res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
  
-@@ -1854,6 +1856,20 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d(g
+@@ -1854,6 +1868,20 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d(g
      return res;
  }
  
@@ -51,14 +97,16 @@ index e8548b0..5d93363 100644
      assert(op->op == GGML_OP_ARANGE);
  
 diff --git a/src/ggml-metal/ggml-metal-device.h b/src/ggml-metal/ggml-metal-device.h
-index de43f81..b6d6b3d 100644
+index de43f819..f514ee45 100644
 --- a/src/ggml-metal/ggml-metal-device.h
 +++ b/src/ggml-metal/ggml-metal-device.h
-@@ -129,7 +129,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_del
+@@ -128,8 +128,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv
+ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net   (ggml_metal_library_t lib, const struct ggml_tensor * op);
  struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri         (ggml_metal_library_t lib, const struct ggml_tensor * op);
  struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
- struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+-struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 -struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv            (ggml_metal_library_t lib, const struct ggml_tensor * op);
++struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op, bool has_bias, bool has_residual, bool has_gelu_erf);
 +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv            (ggml_metal_library_t lib, const struct ggml_tensor * op, bool has_bias, bool has_residual);
  struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0    (ggml_metal_library_t lib, int ne02, int ne20);
  struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id         (ggml_metal_library_t lib, const struct ggml_tensor * op);
@@ -72,7 +120,7 @@ index de43f81..b6d6b3d 100644
  struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
  struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw    (ggml_metal_library_t lib, const struct ggml_tensor * op);
 diff --git a/src/ggml-metal/ggml-metal-device.m b/src/ggml-metal/ggml-metal-device.m
-index 40cacb4..089854e 100644
+index 40cacb46..089854e8 100644
 --- a/src/ggml-metal/ggml-metal-device.m
 +++ b/src/ggml-metal/ggml-metal-device.m
 @@ -1128,12 +1128,16 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
@@ -95,7 +143,7 @@ index 40cacb4..089854e 100644
          case GGML_OP_TOP_K:
          case GGML_OP_ARANGE:
 diff --git a/src/ggml-metal/ggml-metal-impl.h b/src/ggml-metal/ggml-metal-impl.h
-index 62b028f..384db80 100644
+index 62b028f4..384db806 100644
 --- a/src/ggml-metal/ggml-metal-impl.h
 +++ b/src/ggml-metal/ggml-metal-impl.h
 @@ -993,8 +993,19 @@ typedef struct {
@@ -119,7 +167,7 @@ index 62b028f..384db80 100644
      int64_t  ne00;
      int64_t  ne01;
 diff --git a/src/ggml-metal/ggml-metal-ops.cpp b/src/ggml-metal/ggml-metal-ops.cpp
-index 846225d..3555ce8 100644
+index 846225d9..6763fcc2 100644
 --- a/src/ggml-metal/ggml-metal-ops.cpp
 +++ b/src/ggml-metal/ggml-metal-ops.cpp
 @@ -410,6 +410,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
@@ -133,9 +181,155 @@ index 846225d..3555ce8 100644
          case GGML_OP_ARANGE:
              {
                  n_fuse = ggml_metal_op_arange(ctx, idx);
-@@ -2186,7 +2190,85 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
+@@ -2156,7 +2160,123 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
+         //    default: break;
+         //}
+ 
+-        auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
++        // chatterbox PROGRESS §3.27: MUL_MAT + ADD(bias) [+ ADD(residual)]
++        // §3.28: also recognises MUL_MAT + ADD(bias) + GELU_ERF (the CFM
++        // basic_tfm ff0 activation: `ff0_b += mul_mat(ff0_w, nx)`, then
++        // `gelu_erf(ff0_b)`).  Fusion for the `mul_mm` matrix-matrix
++        // kernel, mirroring the Q-variant `mul_mv` fusion just above.
++        // The kernel is shape-generic (any tsrc0 / tsrc1 combo) — the
++        // helper inlined into `kernel_mul_mm`'s shmem-copy path handles
++        // bias / residual / gelu_erf in f32 regardless of weight dtype.
++        // That means this fusion fires for the CFM U-Net's 168× Q4_0
++        // attn Q/K/V/O, 56× Q4_0 FF gate (with gelu_erf), and 56× Q4_0
++        // FF down mat-muls per step, which the mul_mv fusion couldn't
++        // reach because ne11 > mm_min routes those to mul_mm.  Expected
++        // savings: ~1820 bin_fuse dispatches (§3.27) + ~1120 gelu_erf
++        // dispatches (§3.28) per synth.
++        bool has_bias     = false;
++        bool has_residual = false;
++        bool has_gelu_erf = false;
++        int  n_fuse       = 1;
++        ggml_metal_buffer_id bid_bias     = {};
++        ggml_metal_buffer_id bid_residual = {};
++
++        auto bias_ok = [&](const ggml_tensor * b) {
++            return b &&
++                   ggml_is_contiguous(b) &&
++                   b->type == GGML_TYPE_F32 &&
++                   ggml_nelements(b) == ne0 &&
++                   b->ne[0]          == ne0;
++        };
++        auto residual_ok = [&](const ggml_tensor * r, const ggml_tensor * mm) {
++            return r && mm &&
++                   ggml_is_contiguous(r) &&
++                   r->type == GGML_TYPE_F32 &&
++                   ggml_are_same_shape(r, mm);
++        };
++
++        if (ctx->use_fusion) {
++            // Try MUL_MAT + ADD(bias) + ADD(residual) first.  ggml_add is
++            // commutative and the high-level graph builder may emit the
++            // residual as either src[0] or src[1] of the outer add
++            // (chatterbox's basic_tfm uses `ggml_add(x, attn_out)` with
++            // residual=x as src[0], attn_out=f1 as src[1]).  Handle both.
++            ggml_op fops3[3] = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD };
++            if (ctx->can_fuse(idx, fops3, 3)) {
++                ggml_tensor * f0 = ctx->node(idx);
++                ggml_tensor * f1 = ctx->node(idx + 1);
++                ggml_tensor * f2 = ctx->node(idx + 2);
++                auto try_fuse3 = [&](ggml_tensor * bias_t, ggml_tensor * resi_t, ggml_tensor * parent_add) -> bool {
++                    if (!(f1->src[0] == f0 && bias_ok(bias_t))) return false;
++                    if (!residual_ok(resi_t, f0)) return false;
++                    if (parent_add != f1) return false; // residual add must reference f1
++                    ggml_metal_buffer_id bid_b = ggml_metal_get_buffer_id(bias_t);
++                    ggml_metal_buffer_id bid_r = ggml_metal_get_buffer_id(resi_t);
++                    if (!bid_b.metal || !bid_r.metal) return false;
++                    has_bias     = true;
++                    has_residual = true;
++                    bid_bias     = bid_b;
++                    bid_residual = bid_r;
++                    n_fuse       = 3;
++                    return true;
++                };
++                // Case A: f2 = add(f1, residual)   → src[0]=f1, src[1]=residual
++                if (!try_fuse3(f1->src[1], f2->src[1], f2->src[0])) {
++                    // Case B: f2 = add(residual, f1)   → src[0]=residual, src[1]=f1
++                    try_fuse3(f1->src[1], f2->src[0], f2->src[1]);
++                }
++            }
++            // §3.28: MUL_MAT + ADD(bias) + GELU_ERF (CFM ff0 activation
++            // path).  GELU_ERF is a sub-op of GGML_OP_UNARY, so we match
++            // on UNARY at the top level, then double-check the unary
++            // sub-op is GELU_ERF.  Other unary sub-ops could be added
++            // later (SILU, GELU, RELU, ...) — kept gated to GELU_ERF
++            // specifically because that's the one basic_tfm uses.
++            if (n_fuse == 1) {
++                ggml_op fops3g[3] = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_UNARY };
++                if (ctx->can_fuse(idx, fops3g, 3)) {
++                    ggml_tensor * f0 = ctx->node(idx);
++                    ggml_tensor * f1 = ctx->node(idx + 1);
++                    ggml_tensor * f2 = ctx->node(idx + 2);
++                    if (ggml_get_unary_op(f2) == GGML_UNARY_OP_GELU_ERF &&
++                        f2->src[0] == f1) {
++                        ggml_tensor * bias_t = nullptr;
++                        if      (f1->src[0] == f0 && bias_ok(f1->src[1])) bias_t = f1->src[1];
++                        else if (f1->src[1] == f0 && bias_ok(f1->src[0])) bias_t = f1->src[0];
++                        if (bias_t) {
++                            ggml_metal_buffer_id bid_b = ggml_metal_get_buffer_id(bias_t);
++                            if (bid_b.metal) {
++                                has_bias     = true;
++                                has_gelu_erf = true;
++                                bid_bias     = bid_b;
++                                n_fuse       = 3;
++                            }
++                        }
++                    }
++                }
++            }
++            // Fall back to MUL_MAT + ADD(bias).  Also commutative-aware.
++            if (n_fuse == 1) {
++                ggml_op fops2[2] = { GGML_OP_MUL_MAT, GGML_OP_ADD };
++                if (ctx->can_fuse(idx, fops2, 2)) {
++                    ggml_tensor * f0 = ctx->node(idx);
++                    ggml_tensor * f1 = ctx->node(idx + 1);
++                    ggml_tensor * bias_t = nullptr;
++                    if      (f1->src[0] == f0 && bias_ok(f1->src[1])) bias_t = f1->src[1];
++                    else if (f1->src[1] == f0 && bias_ok(f1->src[0])) bias_t = f1->src[0];
++                    if (bias_t) {
++                        ggml_metal_buffer_id bid_b = ggml_metal_get_buffer_id(bias_t);
++                        if (bid_b.metal) {
++                            has_bias = true;
++                            bid_bias = bid_b;
++                            n_fuse   = 2;
++                        }
++                    }
++                }
++            }
++        }
++
++        auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op, has_bias, has_residual, has_gelu_erf);
+ 
+         ggml_metal_kargs_mul_mm args = {
+             /*.ne00 =*/ ne00,
+@@ -2179,14 +2299,105 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
+         ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+         ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+         ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+-        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
++        // When a fused variant is picked, dst must point to the final ADD's
++        // output tensor (matches mul_mv fusion convention).
++        ggml_metal_buffer_id bid_dst = (n_fuse > 1)
++            ? ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1))
++            : ggml_metal_get_buffer_id(op);
++        ggml_metal_encoder_set_buffer  (enc, bid_dst, 3);
++        // Slots 4 and 5: bias and residual.  Bind a harmless stand-in
++        // (src[0]) when the corresponding function constant is false so
++        // the shader's dead-code-eliminated branch never reads it.
++        ggml_metal_encoder_set_buffer  (enc, has_bias     ? bid_bias     : ggml_metal_get_buffer_id(op->src[0]), 4);
++        ggml_metal_encoder_set_buffer  (enc, has_residual ? bid_residual : ggml_metal_get_buffer_id(op->src[0]), 5);
+ 
+         const size_t smem = pipeline.smem;
+ 
          ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
          ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
++
++        // Return n_fuse so the outer loop skips the fused ADDs.
++        return n_fuse;
      } else {
 -        auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
 +        // Look ahead: can we fuse a following ADD(bias) (optionally followed
@@ -220,7 +414,7 @@ index 846225d..3555ce8 100644
  
          const int nr0 = pipeline.nr0;
          const int nr1 = pipeline.nr1;
-@@ -2220,7 +2302,20 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
+@@ -2220,7 +2431,20 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
          ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
          ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
          ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
@@ -242,7 +436,7 @@ index 846225d..3555ce8 100644
  
          ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
  
-@@ -2232,6 +2327,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
+@@ -2232,6 +2456,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
          } else {
              ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0*nsg - 1)/(nr0*nsg)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
          }
@@ -251,7 +445,7 @@ index 846225d..3555ce8 100644
      }
  
      return 1;
-@@ -3813,7 +3910,9 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
+@@ -3813,7 +4039,9 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
      ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
      ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
  
@@ -262,7 +456,7 @@ index 846225d..3555ce8 100644
  
      return 1;
  }
-@@ -3949,6 +4048,14 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
+@@ -3949,6 +4177,14 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
      GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
      GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
  
@@ -277,7 +471,7 @@ index 846225d..3555ce8 100644
      ggml_metal_kargs_pad args = {
          /*.ne00 =*/ ne00,
          /*.ne01 =*/ ne01,
-@@ -3965,7 +4072,11 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
+@@ -3965,7 +4201,11 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
          /*.nb0  =*/ nb0,
          /*.nb1  =*/ nb1,
          /*.nb2  =*/ nb2,
@@ -290,7 +484,7 @@ index 846225d..3555ce8 100644
      };
  
      auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
-@@ -4028,6 +4139,39 @@ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
+@@ -4028,6 +4268,39 @@ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
      return 1;
  }
  
@@ -331,7 +525,7 @@ index 846225d..3555ce8 100644
      ggml_tensor * op = ctx->node(idx);
  
 diff --git a/src/ggml-metal/ggml-metal-ops.h b/src/ggml-metal/ggml-metal-ops.h
-index 50e3c5c..a3aa05c 100644
+index 50e3c5c7..a3aa05c1 100644
 --- a/src/ggml-metal/ggml-metal-ops.h
 +++ b/src/ggml-metal/ggml-metal-ops.h
 @@ -81,6 +81,7 @@ int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx);
@@ -343,7 +537,7 @@ index 50e3c5c..a3aa05c 100644
  int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
  int ggml_metal_op_argmax            (ggml_metal_op_t ctx, int idx);
 diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal
-index f67c5cd..006e1b9 100644
+index f67c5cd8..1b938fc2 100644
 --- a/src/ggml-metal/ggml-metal.metal
 +++ b/src/ggml-metal/ggml-metal.metal
 @@ -3300,6 +3300,63 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
@@ -526,7 +720,40 @@ index f67c5cd..006e1b9 100644
  }
  
  // mat-vec kernel processing in chunks of float4
-@@ -4830,33 +4932,63 @@ typedef void (conv_transpose_1d_t)(
+@@ -4085,6 +4187,12 @@ typedef decltype(kernel_mul_mv_t_t<half, half>) mul_mv_t_t;
+ template [[host_name("kernel_mul_mv_f32_f32")]]   kernel mul_mv_t_t kernel_mul_mv_t_t<float, float>;
+ template [[host_name("kernel_mul_mv_f16_f32")]]   kernel mul_mv_t_t kernel_mul_mv_t_t<half,  float>;
+ template [[host_name("kernel_mul_mv_f16_f16")]]   kernel mul_mv_t_t kernel_mul_mv_t_t<half,  half>;
++// chatterbox PROGRESS §3.26: add f32_f16 variants so HiFT source_* conv
++// kernels (f16 after requantize-gguf.py --name-filter hift/source_) can
++// run on the f32-input / f16-weight mat-mv dispatch path. Without this
++// the Metal library lookup fails with "Function kernel_mul_mv_f32_f16
++// was not found in the library" at first HiFT decode.
++template [[host_name("kernel_mul_mv_f32_f16")]]   kernel mul_mv_t_t kernel_mul_mv_t_t<float, half>;
+ #if defined(GGML_METAL_HAS_BF16)
+ template [[host_name("kernel_mul_mv_bf16_f32")]]  kernel mul_mv_t_t kernel_mul_mv_t_t<bfloat, float>;
+ template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t_t kernel_mul_mv_t_t<bfloat, bfloat>;
+@@ -4209,6 +4317,8 @@ typedef decltype(kernel_mul_mv_t_t_4<half, half4, half, half4>) mul_mv_t_t_4;
+ template [[host_name("kernel_mul_mv_f32_f32_4")]]   kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4<float, float4, float, float4>;
+ template [[host_name("kernel_mul_mv_f16_f32_4")]]   kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4<half,  half4,  float, float4>;
+ template [[host_name("kernel_mul_mv_f16_f16_4")]]   kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4<half,  half4,  half,  half4>;
++// chatterbox PROGRESS §3.26: f32_f16 variant for the vec4 dispatch path.
++template [[host_name("kernel_mul_mv_f32_f16_4")]]   kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4<float, float4, half,  half4>;
+ #if defined(GGML_METAL_HAS_BF16)
+ template [[host_name("kernel_mul_mv_bf16_f32_4")]]  kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4<bfloat, bfloat4, float,  float4>;
+ template [[host_name("kernel_mul_mv_bf16_bf16_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4<bfloat, bfloat4, bfloat, bfloat4>;
+@@ -4274,6 +4384,10 @@ typedef decltype(kernel_mul_mv_t_t_short<half, half>) mul_mv_t_t_short_t;
+ template [[host_name("kernel_mul_mv_f32_f32_short")]]  kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<float, float>;
+ template [[host_name("kernel_mul_mv_f16_f32_short")]]  kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<half,  float>;
+ template [[host_name("kernel_mul_mv_f16_f16_short")]]  kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<half,  half>;
++// chatterbox PROGRESS §3.26: f32_f16 variant for the short-axis dispatch
++// path. This is the one that was missing and crashing HiFT when the 21
++// hift/source_* weights were converted to f16 — see §3.26 writeup.
++template [[host_name("kernel_mul_mv_f32_f16_short")]]  kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<float, half>;
+ #if defined(GGML_METAL_HAS_BF16)
+ template [[host_name("kernel_mul_mv_bf16_f32_short")]]  kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<bfloat, float>;
+ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<bfloat, bfloat>;
+@@ -4830,33 +4944,63 @@ typedef void (conv_transpose_1d_t)(
          device const float * src1,
          device        char * dst,
          uint3   tgpig[[threadgroup_position_in_grid]],
@@ -607,7 +834,7 @@ index f67c5cd..006e1b9 100644
  }
  
  template [[host_name("kernel_conv_transpose_1d_f32_f32")]]
-@@ -4866,7 +4998,9 @@ kernel void kernel_conv_transpose_1d<float>(
+@@ -4866,7 +5010,9 @@ kernel void kernel_conv_transpose_1d<float>(
      device const float * src1,
      device        char * dst,
      uint3   tgpig[[threadgroup_position_in_grid]],
@@ -618,7 +845,7 @@ index f67c5cd..006e1b9 100644
  
  template [[host_name("kernel_conv_transpose_1d_f16_f32")]]
  kernel void kernel_conv_transpose_1d<half>(
-@@ -4875,7 +5009,9 @@ kernel void kernel_conv_transpose_1d<half>(
+@@ -4875,7 +5021,9 @@ kernel void kernel_conv_transpose_1d<half>(
      device const float * src1,
      device        char * dst,
      uint3   tgpig[[threadgroup_position_in_grid]],
@@ -629,7 +856,7 @@ index f67c5cd..006e1b9 100644
  
  
  typedef void (conv_transpose_2d_t)(
-@@ -5250,17 +5386,23 @@ kernel void kernel_pad_f32(
+@@ -5250,17 +5398,23 @@ kernel void kernel_pad_f32(
      const int64_t i2 = tgpig.y;
      const int64_t i1 = tgpig.x;
  
@@ -660,7 +887,7 @@ index f67c5cd..006e1b9 100644
              } else {
                  dst_ptr[i0] = 0.0f;
              }
-@@ -5274,6 +5416,37 @@ kernel void kernel_pad_f32(
+@@ -5274,6 +5428,37 @@ kernel void kernel_pad_f32(
      }
  }
  
@@ -698,3 +925,164 @@ index f67c5cd..006e1b9 100644
  kernel void kernel_pad_reflect_1d_f32(
      constant   ggml_metal_kargs_pad_reflect_1d & args,
      device  const char * src0,
+@@ -9262,6 +9447,26 @@ kernel void kernel_diag_f32(
+ constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
+ constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
+ 
++// chatterbox PROGRESS §3.27: mul_mm + ADD(bias) [+ ADD(residual)] fusion.
++// §3.28 adds GELU_ERF to the fused write path (target: CFM basic_tfm
++// `gelu_erf(add(mul_mat(ff0_w, nx), ff0_b))` — 56 tfm blocks * 10 steps *
++// 2 CFG batches = 1120 saved dispatches per synth).  Mirrors the
++// Q-variant mul_mv fusion (helper_mv_add_bias).  Bias is a broadcast
++// vector of shape [ne0] (OC); residual has the same shape and stride
++// as dst; gelu_erf uses the same erf_approx helper the standalone
++// unary op uses, so the fused kernel is numerically identical.  Each
++// branch can be disabled via its function constant; Metal compiler
++// drops the un-selected branches at specialisation time.  Bias and
++// residual are bound at buffer slots 4 and 5 regardless of flag so
++// the host can always bind a placeholder (src[0]) when the flag is
++// false — matches the mul_mv fusion wiring convention.
++constant bool FC_mul_mm_has_bias_     [[function_constant(FC_MUL_MM + 2)]];
++constant bool FC_mul_mm_has_residual_ [[function_constant(FC_MUL_MM + 3)]];
++constant bool FC_mul_mm_has_gelu_erf_ [[function_constant(FC_MUL_MM + 4)]];
++#define FC_mul_mm_has_bias     FC_mul_mm_has_bias_
++#define FC_mul_mm_has_residual FC_mul_mm_has_residual_
++#define FC_mul_mm_has_gelu_erf FC_mul_mm_has_gelu_erf_
++
+ // each block_q contains 16*nl weights
+ template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
+ kernel void kernel_mul_mm(
+@@ -9269,6 +9474,8 @@ kernel void kernel_mul_mm(
+         device const char * src0,
+         device const char * src1,
+         device       char * dst,
++        device const char * bias     [[buffer(4)]],
++        device const char * residual [[buffer(5)]],
+         threadgroup  char * shmem [[threadgroup(0)]],
+         uint3  tgpig[[threadgroup_position_in_grid]],
+         ushort tiitg[[thread_index_in_threadgroup]],
+@@ -9514,7 +9721,23 @@ kernel void kernel_mul_mm(
+ #endif
+     }
+ 
+-    if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
++    // chatterbox PROGRESS §3.27 (+ §3.28, §3.30): fold-in dispatch
++    // strategy across the two store paths:
++    //
++    // - Full-block writes (_mm_use_direct = true): residual + gelu
++    //   still route through the shmem path (§3.29 attempt left
++    //   them there — cT.store's cooperative layout vs a generic
++    //   device-memory RMW was not numerically stable).  Bias-only
++    //   can fuse into the direct-store path via a small
++    //   post-barrier scan (§3.30 reattempt under the test-metal-ops
++    //   mul_mm_fused parity gate).
++    // - Edge blocks (_mm_bounds_ok = false): shmem scalar-copy
++    //   path, which handles all three fold-ins inline.
++    const bool _mm_bounds_ok = (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1);
++    const bool _mm_has_nonbias_foldin = FC_mul_mm_has_residual || FC_mul_mm_has_gelu_erf;
++    const bool _mm_use_direct = (!FC_mul_mm_bc_out || _mm_bounds_ok)
++                                && !_mm_has_nonbias_foldin;
++    if (_mm_use_direct) {
+         // if no bounds checks on the output are needed, we can directly write to device memory
+ #ifdef GGML_METAL_HAS_TENSOR
+         device float * C = (device float *) dst +
+@@ -9532,8 +9755,32 @@ kernel void kernel_mul_mm(
+             simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
+         }
+ #endif
++        // §3.30: post-store bias fold-in on the direct-store path
++        // (bias-only — residual and gelu still go through shmem).
++        // Requires a threadgroup_barrier to order cT.store's or
++        // simdgroup_store's writes against the subsequent reads,
++        // then distributes 2048 elements (NR0*NR1) across 128
++        // threads (tiitg) so each thread reads+writes 16 values.
++        // Gated under the harness in test-metal-ops to avoid the
++        // §3.29 regression (wrong output on residual / gelu paths).
++        if (FC_mul_mm_has_bias) {
++            threadgroup_barrier(mem_flags::mem_device);
++            device const float * bias_f32 = (device const float *) bias;
++            const int thread_idx = (int) tiitg;
++            for (int k = thread_idx; k < NR0 * NR1; k += 128) {
++                const int row_off = k % NR0; // 0..63
++                const int col_off = k / NR0; // 0..31
++                const int abs_r   = r0 + row_off;
++                const int abs_c   = r1 + col_off;
++                const uint64_t off = (uint64_t)abs_c * args.ne0 + abs_r + (uint64_t)im*args.ne1*args.ne0;
++                device float * D  = (device float *) dst + off;
++                *D = *D + bias_f32[abs_r];
++            }
++        }
+     } else {
+-        // block is smaller than 64x32, we should avoid writing data outside of the matrix
++        // block is smaller than 64x32 OR bias/residual is fused — route
++        // through shmem so the sgitg==0 scalar copy can read + add + write
++        // in one pass.
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+ 
+         threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
+@@ -9550,21 +9797,55 @@ kernel void kernel_mul_mm(
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+ 
+         if (sgitg == 0) {
++            device const float * bias_f32     = (device const float *) bias;
++            device const float * residual_f32 = (device const float *) residual;
+             for (int j = tiitg; j < nr1; j += NR1) {
+                 device float  * D  = (device float  *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0;
+-                device float4 * D4 = (device float4 *) D;
+-
+                 threadgroup float  * C  = temp_str + (j*NR0);
+-                threadgroup float4 * C4 = (threadgroup float4 *) C;
+ 
+-                int i = 0;
+-                for (; i < nr0/4; i++) {
+-                    *(D4 + i) = *(C4 + i);
+-                }
++                if (!FC_mul_mm_has_bias && !FC_mul_mm_has_residual && !FC_mul_mm_has_gelu_erf) {
++                    device float4 * D4 = (device float4 *) D;
++                    threadgroup float4 * C4 = (threadgroup float4 *) C;
++
++                    int i = 0;
++                    for (; i < nr0/4; i++) {
++                        *(D4 + i) = *(C4 + i);
++                    }
+ 
+-                i *= 4;
+-                for (; i < nr0; i++) {
+-                    *(D + i) = *(C + i);
++                    i *= 4;
++                    for (; i < nr0; i++) {
++                        *(D + i) = *(C + i);
++                    }
++                } else {
++                    // Fused write path: fold bias (broadcast over rows /
++                    // batches via shape [ne0]) and/or residual (same
++                    // shape + stride as dst) and/or gelu_erf into the
++                    // scalar copy.  Compiler drops the branches that
++                    // aren't selected by the function constants.
++                    //
++                    // Order matters: bias is added first (completing
++                    // the linear layer's output), residual is added
++                    // next (the skip connection), gelu_erf is applied
++                    // last (activation).  In practice residual and
++                    // gelu_erf are mutually exclusive in real graphs:
++                    // residual is a skip connection after a pre-norm
++                    // linear; gelu_erf is an FF activation.  But the
++                    // three FC branches are independent so any subset
++                    // is a valid fusion.
++                    const uint64_t res_off = (uint64_t)(r1 + j)*args.ne0 + (uint64_t)im*args.ne1*args.ne0;
++                    for (int i = 0; i < nr0; i++) {
++                        float v = *(C + i);
++                        if (FC_mul_mm_has_bias) {
++                            v += bias_f32[r0 + i];
++                        }
++                        if (FC_mul_mm_has_residual) {
++                            v += residual_f32[res_off + r0 + i];
++                        }
++                        if (FC_mul_mm_has_gelu_erf) {
++                            v = 0.5f * v * (1.0f + erf_approx(v * SQRT_2_INV));
++                        }
++                        *(D + i) = v;
++                    }
+                 }
+             }
+         }
diff --git a/scripts/bench-m4-validation.sh b/scripts/bench-m4-validation.sh
new file mode 100755
index 0000000..73548ba
--- /dev/null
+++ b/scripts/bench-m4-validation.sh
@@ -0,0 +1,198 @@
+#!/usr/bin/env bash
+# scripts/bench-m4-validation.sh
+#
+# Self-contained bench + parity harness for the §3.24–§3.30 Metal portfolio.
+# Intended to run on M4 Air / M4 Pro / iPad Pro or any Apple-silicon Mac.
+# The §3.27 / §3.28 / §3.30 kernel work is predicted to be proportionally
+# larger win on M4 vs the M3 Ultra reference (neutral on M3U due to the
+# chip's very low per-dispatch overhead + high core count).  This script
+# lets you either confirm or falsify that prediction with one command.
+#
+# Usage:
+#
+#   # From a fresh clone of chatterbox.cpp @ multilingual_merged:
+#   cd chatterbox.cpp
+#   # Grab the voice fixture (any 16 kHz WAV; jfk.wav is the reference):
+#   scp <m3u>:/tmp/jfk.wav /tmp/jfk.wav
+#   # Make sure you have the model GGUFs (14 GB total):
+#   #   models/chatterbox-t3-mtl-q4_0.gguf
+#   #   models/chatterbox-s3gen-mtl-q4_0_hift_f16_v2.gguf
+#   bash scripts/bench-m4-validation.sh 2>&1 | tee m4-bench.log
+#
+# Compares current hardware results to the M3 Ultra reference
+# captured in PROGRESS §3.30.  Writes JSON to artifacts/bench/ for archiving.
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$REPO_ROOT"
+
+REFERENCE_MD5="d8a1b22375dbcb2259c686426a7d76c5"
+REFERENCE_TEXT="Hola mundo, esta es una prueba multilingue."
+REFERENCE_LANG="es"
+T3_GGUF="${T3_GGUF:-models/chatterbox-t3-mtl-q4_0.gguf}"
+S3GEN_GGUF="${S3GEN_GGUF:-models/chatterbox-s3gen-mtl-q4_0_hift_f16_v2.gguf}"
+REF_WAV="${REF_WAV:-/tmp/jfk.wav}"
+OUT_DIR="${OUT_DIR:-artifacts/bench}"
+RUNS="${RUNS:-5}"
+
+# M3 Ultra reference numbers (post-§3.30, 5 invocations averaged)
+M3U_CFM_MS=534.0
+M3U_S3GEN_MS=706.6
+M3U_T3_MS=432.6
+M3U_HIFT_MS=121.1
+
+for f in "$T3_GGUF" "$S3GEN_GGUF" "$REF_WAV"; do
+    if [ ! -f "$f" ]; then
+        echo "FAIL: required file not found: $f" >&2
+        exit 1
+    fi
+done
+
+HOST_CHIP="$(system_profiler SPHardwareDataType 2>/dev/null | awk -F': +' '/Chip:/ {print $2; exit}')"
+HOST_MODEL="$(system_profiler SPHardwareDataType 2>/dev/null | awk -F': +' '/Model Identifier:/ {print $2; exit}')"
+
+echo "=== Host ==="
+echo "Chip:  ${HOST_CHIP:-unknown}"
+echo "Model: ${HOST_MODEL:-unknown}"
+
+echo ""
+echo "=== Setup ggml (apply Metal + OpenCL patches at pinned commit) ==="
+if [ ! -d ggml/.git ]; then
+    bash scripts/setup-ggml.sh
+else
+    echo "ggml/ already present; skipping.  To force a reapply of the patches, remove ggml/ first."
+fi
+
+echo ""
+echo "=== Build ==="
+cmake -S . -B build-metal \
+    -DGGML_METAL=ON -DGGML_BLAS=OFF -DGGML_NATIVE=ON \
+    -DCMAKE_BUILD_TYPE=Release >/dev/null
+cmake --build build-metal -j --target chatterbox test-metal-ops 2>&1 | tail -3
+
+echo ""
+echo "=== test-metal-ops (14 gates: 3 base + 3 conv + 8 fused-mul_mm) ==="
+if ./build-metal/test-metal-ops 2>&1 | tee /tmp/m4-metal-ops.log | grep -E "^OK|^FAIL" | tail -20; then
+    if grep -q "FAIL" /tmp/m4-metal-ops.log; then
+        echo "FAIL: test-metal-ops has failures"
+        exit 1
+    fi
+    echo "test-metal-ops: all gates PASS"
+else
+    echo "FAIL: test-metal-ops did not produce output"
+    exit 1
+fi
+
+echo ""
+echo "=== Bench: ${RUNS} invocations (Q4_0 + HiFT F16 v2, ES prompt, seed 42) ==="
+
+# Collect per-invocation stats
+CFM_MS=()
+S3GEN_MS=()
+T3_MS=()
+HIFT_MS=()
+MD5S=()
+
+mkdir -p "$OUT_DIR"
+
+for i in $(seq 1 "$RUNS"); do
+    OUT="/tmp/cb_m4_${i}.wav"
+    LOG="/tmp/cb_m4_${i}.log"
+    ./build-metal/chatterbox \
+        --model "$T3_GGUF" \
+        --s3gen-gguf "$S3GEN_GGUF" \
+        --reference-audio "$REF_WAV" \
+        --text "$REFERENCE_TEXT" \
+        --language "$REFERENCE_LANG" \
+        --seed 42 --temp 0 --top-k 1 \
+        --n-gpu-layers 1 \
+        --out "$OUT" \
+        --verbose 2>&1 | grep -vE "^ggml_metal_" > "$LOG"
+
+    cfm=$(awk '/\[cfm_total\]/ {print $2}' "$LOG")
+    hift=$(awk '/\[hift_decode\]/ {print $2}' "$LOG")
+    s3gen=$(awk '/S3GEN_INFER_MS/ {gsub("S3GEN_INFER_MS=", "", $2); print $2}' "$LOG" | head -1)
+    t3=$(awk '/T3_INFER_MS/ {gsub("T3_INFER_MS=", "", $2); print $2}' "$LOG")
+
+    CFM_MS+=("$cfm")
+    S3GEN_MS+=("$s3gen")
+    T3_MS+=("$t3")
+    HIFT_MS+=("$hift")
+
+    md5=$(md5 -q "$OUT")
+    MD5S+=("$md5")
+    printf "run %d: cfm=%s  s3gen=%s  t3=%s  hift=%s  md5=%s\n" \
+        "$i" "${cfm:-?}" "${s3gen:-?}" "${t3:-?}" "${hift:-?}" "${md5:0:12}"
+done
+
+# Compute means (awk — no bash floats)
+mean() {
+    printf '%s\n' "$@" | awk 'BEGIN{s=0;n=0} {s+=$1; n++} END{if (n>0) printf "%.1f", s/n; else print "?"}'
+}
+
+CFM_MEAN=$(mean "${CFM_MS[@]}")
+S3GEN_MEAN=$(mean "${S3GEN_MS[@]}")
+T3_MEAN=$(mean "${T3_MS[@]}")
+HIFT_MEAN=$(mean "${HIFT_MS[@]}")
+
+echo ""
+echo "=== Summary: ${HOST_CHIP:-this host} vs M3 Ultra reference ==="
+printf "%-20s %15s %15s %15s\n" "stage" "M3 Ultra (ref)" "this host"     "Δ vs M3U"
+printf "%-20s %15.1f %15s %15s\n" "[cfm_total] ms"  "$M3U_CFM_MS"   "$CFM_MEAN"   "$(awk -v a=$CFM_MEAN   -v b=$M3U_CFM_MS   'BEGIN{d=a-b; r=(d/b)*100; printf "%+.1f (%+.1f%%)", d, r}')"
+printf "%-20s %15.1f %15s %15s\n" "S3GEN_INFER_MS"  "$M3U_S3GEN_MS" "$S3GEN_MEAN" "$(awk -v a=$S3GEN_MEAN -v b=$M3U_S3GEN_MS 'BEGIN{d=a-b; r=(d/b)*100; printf "%+.1f (%+.1f%%)", d, r}')"
+printf "%-20s %15.1f %15s %15s\n" "T3_INFER_MS"     "$M3U_T3_MS"    "$T3_MEAN"    "$(awk -v a=$T3_MEAN    -v b=$M3U_T3_MS    'BEGIN{d=a-b; r=(d/b)*100; printf "%+.1f (%+.1f%%)", d, r}')"
+printf "%-20s %15.1f %15s %15s\n" "[hift_decode] ms" "$M3U_HIFT_MS"  "$HIFT_MEAN"  "$(awk -v a=$HIFT_MEAN  -v b=$M3U_HIFT_MS  'BEGIN{d=a-b; r=(d/b)*100; printf "%+.1f (%+.1f%%)", d, r}')"
+
+# MD5 comparison: all runs must produce identical output (determinism) and
+# the value must match the M3 Ultra reference (byte-exactness across chips).
+UNIQUE_MD5=$(printf '%s\n' "${MD5S[@]}" | sort -u | wc -l | tr -d ' ')
+FIRST_MD5="${MD5S[0]}"
+
+echo ""
+echo "=== Parity ==="
+if [ "$UNIQUE_MD5" = "1" ]; then
+    echo "determinism: PASS  (md5 $FIRST_MD5 stable across ${RUNS} runs)"
+else
+    echo "determinism: FAIL  (got $UNIQUE_MD5 distinct md5s across ${RUNS} runs)"
+fi
+
+if [ "$FIRST_MD5" = "$REFERENCE_MD5" ]; then
+    echo "byte-exact vs M3 Ultra: PASS ($FIRST_MD5)"
+else
+    echo "byte-exact vs M3 Ultra: DIFFER"
+    echo "  M3 Ultra reference: $REFERENCE_MD5"
+    echo "  $HOST_CHIP:         $FIRST_MD5"
+    echo "  (small divergence expected across chip generations from Q4_0-dequant-order + bias-fusion accumulation;"
+    echo "   listen to /tmp/cb_m4_1.wav to verify audio sounds correct)"
+fi
+
+# Write JSON summary
+JSON="$OUT_DIR/m4-validation.json"
+cat > "$JSON" <<EOF
+{
+  "host_chip": "${HOST_CHIP:-unknown}",
+  "host_model": "${HOST_MODEL:-unknown}",
+  "runs": ${RUNS},
+  "t3_gguf": "$T3_GGUF",
+  "s3gen_gguf": "$S3GEN_GGUF",
+  "reference_wav": "$REF_WAV",
+  "m3u_reference": {
+    "cfm_ms": $M3U_CFM_MS,
+    "s3gen_ms": $M3U_S3GEN_MS,
+    "t3_ms": $M3U_T3_MS,
+    "hift_ms": $M3U_HIFT_MS,
+    "md5": "$REFERENCE_MD5"
+  },
+  "this_host": {
+    "cfm_ms_mean": $CFM_MEAN,
+    "s3gen_ms_mean": $S3GEN_MEAN,
+    "t3_ms_mean": $T3_MEAN,
+    "hift_ms_mean": $HIFT_MEAN,
+    "md5": "$FIRST_MD5",
+    "determinism_ok": $([ "$UNIQUE_MD5" = "1" ] && echo true || echo false)
+  }
+}
+EOF
+
+echo ""
+echo "wrote $JSON"
diff --git a/scripts/requantize-gguf.py b/scripts/requantize-gguf.py
index e86b7d1..98f96d5 100644
--- a/scripts/requantize-gguf.py
+++ b/scripts/requantize-gguf.py
@@ -26,11 +26,30 @@
 
     # Q4_0 is the same, last arg is just `q4_0`.
 
+    # F16 downcast for HiFT conv kernels (multilingual S3Gen — see §3.24).
+    # `--name-filter hift/` constrains the rewrite to a name substring;
+    # everything else is passed through at its source dtype.  Two-pass
+    # use:
+    #   1. F32→F16 for HiFT conv kernels in the F16 source GGUF
+    #   2. F16→Q4_0 for the CFM transformer linears (no name filter)
+    python scripts/requantize-gguf.py \\
+        models/chatterbox-s3gen-mtl-f16.gguf \\
+        /tmp/intermediate.gguf f16 --name-filter hift/
+    python scripts/requantize-gguf.py \\
+        /tmp/intermediate.gguf \\
+        models/chatterbox-s3gen-mtl-q4_0_hift_f16.gguf q4_0
+
 Quality trade-off (measured on a representative paragraph, Metal / M3 Ultra):
   F32 (default)   — baseline
   Q8_0            — essentially bit-exact, cos-sim > 0.99 vs baseline
   Q4_0            — different CFM ODE trajectory → different sample;
                     subjective quality equal, cos-sim falls to ~0.66
+  F16 (--name-filter hift/) — HiFT conv kernels at half precision; PCM
+                    cosine 0.9999 vs the corresponding all-F32-HiFT
+                    baseline (audio essentially indistinguishable).
+                    `[hift_decode]` ~3 % faster on M3 Ultra Metal
+                    (124.9 → 121.3 ms median across 3 invocations);
+                    GGUF ~33 MB smaller.  See PROGRESS.md §3.24.
 """
 
 from __future__ import annotations
@@ -70,7 +89,22 @@
     "/norm/",                   # layernorms
     "/ln_",                     # GPT-2 style layernorms (ln_1, ln_2, ln_f)
     "/g",                       # GPT-2 style norm scale (matches /g, /ga[mma], /gate — accept the occasional false deny)
-    "/s",                       # legacy scale weights
+    "/scale",                   # legacy scale weights (narrowed from the
+                                # old "/s" glob so HiFT source_* conv
+                                # weights are no longer incidentally
+                                # excluded.  The `kernel_mul_mv_f32_f16`
+                                # / `_4` / `_short` Metal kernel variants
+                                # that HiFT source_* conv1d needs are
+                                # shipped in patches/ggml-metal-
+                                # chatterbox-ops.patch as of PROGRESS
+                                # §3.26, so this deny is no longer
+                                # necessary for correctness.  With the
+                                # kernel in place, the 21 source_*
+                                # conv-kernel weights go through the
+                                # --name-filter hift/ recipe at f16 and
+                                # the GGUF shrinks by ~7.7 MB with WAV
+                                # parity (cos 1.000000, rms-diff 0.035 %,
+                                # max abs 4/32767).  See §3.26.)
     "alpha",                    # Snake activation alphas
     "beta",
     "gamma",
@@ -100,6 +134,11 @@
     "q8_0": gguf.GGMLQuantizationType.Q8_0,
     "q5_0": gguf.GGMLQuantizationType.Q5_0,
     "q4_0": gguf.GGMLQuantizationType.Q4_0,
+    # F16 is a downcast, not a block quant — block_size = 1 in
+    # GGML_QUANT_SIZES, so the shape gates in should_quantize accept any
+    # 2-D / 3-D weight tensor.  Useful for the 3-D HiFT conv kernels
+    # (K in {3, 7, 11, 16}) that none of the 32-block quants can take.
+    "f16":  gguf.GGMLQuantizationType.F16,
 }
 
 
@@ -112,27 +151,41 @@ def should_quantize(name: str, shape: tuple[int, ...], qtype: gguf.GGMLQuantizat
         return False
 
     # Deny-list.
-    lower = name.lower()
     for s in _DENY_SUBSTRINGS:
         if s in name:  # case-sensitive for path-like names
             return False
 
-    # Quantization needs the reduction dim to be a multiple of the block size.
-    # In ggml 2D matmul, weight tensor has shape (ne0, ne1) and ne0 is the
-    # reduction dim.  Here GGUFReader exposes shape in numpy (reversed) order,
-    # so the reduction dim is shape[-1].
     block = gguf.GGML_QUANT_SIZES[qtype][0]
-    if shape[-1] % block != 0:
-        return False
 
-    # Stick to 2D (plain matmul) and 3D (conv with kernel_size as leading dim).
-    # Convs can be quantized in ggml since im2col produces F32 data which
-    # mul_mat handles against Q-weights; but we play it safe and only
-    # quantize the 2D matmul weights where we know ggml_mul_mat is used.
-    if len(shape) != 2:
-        return False
-
-    return True
+    # 2D matmul weights: ggml shape (ne0, ne1) = (reduction_dim, output).
+    # GGUFReader exposes shape in numpy (reversed) order, so the
+    # reduction dim is shape[-1].  Quantization quantises along the
+    # last numpy axis, so shape[-1] must be a multiple of the block.
+    if len(shape) == 2:
+        return shape[-1] % block == 0
+
+    # 3D conv kernels: ggml shape (K, IC, OC) -> numpy (OC, IC, K).
+    # `gguf.quants.quantize` quantises along the LAST numpy axis, which is K
+    # for a conv kernel.  HiFT conv kernels have K in {3, 7, 11, 16}; none
+    # are multiples of any block size we ship here (32).
+    #
+    # Quantising along K*IC instead would need a numpy reshape to
+    # (OC, K*IC) before `quantize` and then storing the result with ggml
+    # shape (K*IC, OC) — i.e. a 2-D on-disk tensor.  But the C++ side's
+    # `conv1d_f32` calls `ggml_im2col(kernel, ...)` which derives the
+    # kernel size from `kernel->ne[0]`; collapsing K into a flattened
+    # (K*IC) ne[0] would silently break im2col window extraction.
+    #
+    # So 3-D quantisation only works when K alone meets the block-size
+    # constraint.  We still gate on it (instead of returning False
+    # outright) so any future converter that ships K-aligned conv
+    # kernels gets the win for free; for the current HiFT stack this
+    # path stays a no-op and the caller logs the kept-as-source-dtype
+    # tensors via stats.kept.
+    if len(shape) == 3:
+        return shape[-1] % block == 0
+
+    return False
 
 
 def main() -> int:
@@ -140,9 +193,19 @@ def main() -> int:
     ap.add_argument("src", type=Path, help="Source GGUF (F32/F16)")
     ap.add_argument("dst", type=Path, help="Output GGUF")
     ap.add_argument("dtype", choices=_QUANT_TYPE.keys(), help="Target quant dtype")
+    ap.add_argument(
+        "--name-filter",
+        default=None,
+        help=("Substring filter on tensor names; only tensors whose name "
+              "contains this substring are touched.  All other tensors "
+              "are passed through at their source dtype.  Useful for "
+              "applying f16 to HiFT conv kernels in a Q4_0 source GGUF "
+              "without disturbing the existing Q4_0 CFM weights."),
+    )
     args = ap.parse_args()
 
     qtype = _QUANT_TYPE[args.dtype]
+    name_filter = args.name_filter
 
     src = gguf.GGUFReader(args.src, "r")
     arch = src.fields.get("general.architecture")
@@ -202,7 +265,10 @@ def main() -> int:
         data = np.asarray(t.data)
         src_bytes += data.nbytes
 
-        if t.tensor_type in _QUANTIZABLE_SRC_DTYPES and should_quantize(t.name, shape, qtype):
+        in_filter = name_filter is None or name_filter in t.name
+        if (in_filter and t.tensor_type in _QUANTIZABLE_SRC_DTYPES
+                      and t.tensor_type != qtype
+                      and should_quantize(t.name, shape, qtype)):
             # Reshape to natural (shape).  GGUF raw data is contiguous in
             # the original order, but reversed() above gives element-shape
             # which is what `quantize()` expects.
@@ -213,10 +279,31 @@ def main() -> int:
             dst_bytes += qdata.nbytes
         else:
             # Pass through unchanged.  Preserve original dtype.
-            arr = data.reshape(shape)
-            writer.add_tensor(t.name, arr, raw_shape=arr.shape, raw_dtype=t.tensor_type)
+            #
+            # For already-quantised inputs (Q-type sources) the GGUF data
+            # is opaque packed bytes (Q4_0: 18 B / 32 elements ≈ 0.56 B
+            # per element), so a numpy-shape reshape against the
+            # element-shape would fail with a size-mismatch.  Float-type
+            # sources have block_size=1 in GGML_QUANT_SIZES so the
+            # reshape works as before.
+            block_size, type_size = gguf.GGML_QUANT_SIZES[t.tensor_type]
+            if block_size == 1:
+                arr = data.reshape(shape)
+                writer.add_tensor(t.name, arr, raw_shape=arr.shape, raw_dtype=t.tensor_type)
+            else:
+                # Q-type passthrough.  gguf-0.18+ `add_tensor_info` treats
+                # `raw_shape` as **byte shape** for uint8 tensors (the
+                # innermost dim is bytes per row, not elements per row).
+                # Convert: byte_inner = elements_inner / block * type_size.
+                # Earlier versions of this script hit
+                # `ValueError: Quantized tensor bytes per row (N) is not a
+                # multiple of Q4_0 type size (18)` when re-quantising a
+                # GGUF that already had Q-type tensors — see §3.26.
+                byte_inner = shape[-1] // block_size * type_size
+                byte_shape = tuple(list(shape[:-1]) + [byte_inner])
+                writer.add_tensor(t.name, data, raw_shape=byte_shape, raw_dtype=t.tensor_type)
             kept_count += 1
-            dst_bytes += arr.nbytes
+            dst_bytes += data.nbytes
 
     writer.write_header_to_file()
     writer.write_kv_data_to_file()
diff --git a/src/chatterbox_cli.cpp b/src/chatterbox_cli.cpp
index 5a93816..038fd6b 100644
--- a/src/chatterbox_cli.cpp
+++ b/src/chatterbox_cli.cpp
@@ -57,6 +57,7 @@
 #include "tts-cpp/tts-cpp.h"
 #include "tts-cpp/chatterbox/s3gen_pipeline.h"
 #include "chatterbox_t3_internal.h"
+#include "t3_mtl.h"
 #include "npy.h"
 #include "voice_features.h"
 #include "voice_encoder.h"
@@ -357,6 +358,12 @@ struct cli_params {
     // to 2 (matches Python's meanflow); setting 1 halves CFM cost at the
     // price of a bit of extra high-frequency noise.
     int32_t stream_cfm_steps          = 0;
+    // Override CFM Euler step count for non-streaming synthesis.  Defaults
+    // to 0 (= use the GGUF's `n_timesteps`: 10 for Multilingual standard
+    // CFM, 2 for Turbo's meanflow).  Lowering N (e.g. 7-8 on Multilingual)
+    // reduces S3Gen wall-clock proportionally; the §3.21 sweep documents
+    // the audio-cosine knee.  Streaming uses --stream-cfm-steps instead.
+    int32_t cfm_steps                 = 0;
 
     // Auto-split the input text into sentences before running the pipeline.
     // Chatterbox Turbo's T3 degrades badly on autoregressive outputs longer
@@ -469,6 +476,12 @@ static void print_usage(const char * argv0) {
     fprintf(stderr, "                          as --stream-chunk-tokens)\n");
     fprintf(stderr, "  --stream-cfm-steps N    CFM Euler step count per chunk.  Python uses 2 for\n");
     fprintf(stderr, "                          meanflow; 1 halves CFM cost.  (default: 0 = 2)\n");
+    fprintf(stderr, "  --cfm-steps N           Non-streaming CFM Euler step count.  Multilingual's\n");
+    fprintf(stderr, "                          standard CFM ships at 10 steps; lower (e.g. 7-8)\n");
+    fprintf(stderr, "                          trades small audio quality for proportional S3Gen\n");
+    fprintf(stderr, "                          speedup.  Turbo's meanflow defaults to 2 steps.\n");
+    fprintf(stderr, "                          See PROGRESS.md §3.21 for the quality knee sweep.\n");
+    fprintf(stderr, "                          (default: 0 = GGUF's n_timesteps)\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  --input-file PATH       Stream text from PATH as another process writes to it.\n");
     fprintf(stderr, "                          tail -f semantics: each complete sentence (ending in\n");
@@ -589,6 +602,7 @@ static bool parse_args(int argc, char ** argv, cli_params & params) {
         else if (arg == "--stream-chunk-tokens")       { if (!parse_int("--stream-chunk-tokens",       params.stream_chunk_tokens))       return false; }
         else if (arg == "--stream-first-chunk-tokens") { if (!parse_int("--stream-first-chunk-tokens", params.stream_first_chunk_tokens)) return false; }
         else if (arg == "--stream-cfm-steps")          { if (!parse_int("--stream-cfm-steps",          params.stream_cfm_steps))          return false; }
+        else if (arg == "--cfm-steps")                 { if (!parse_int("--cfm-steps",                 params.cfm_steps))                 return false; }
         else if (arg == "--input-file")       { auto v = next("--input-file");       if (!v) return false; params.input_file = v; }
         else if (arg == "--input-eof-marker") { auto v = next("--input-eof-marker"); if (!v) return false; params.input_eof_marker = v; }
         else if (arg == "--input-by-line")    { params.input_by_line = true; }
@@ -819,6 +833,7 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             opts.debug           = params.debug;
             opts.verbose         = params.verbose;
             opts.n_gpu_layers    = params.n_gpu_layers;
+            opts.cfm_steps       = params.cfm_steps;
             if (!params.reference_audio.empty()) {
                 if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
                                                 opts.prompt_feat_override,
@@ -1015,19 +1030,27 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             // tears the device down.  (S3Gen's cache registers its own
             // atexit hook; T3 has no such hook, main() is its owner.)
             auto free_t3 = [&]() {
+                if (model.buffer_stack || model.ctx_stack) {
+                    tts_cpp::chatterbox::detail::t3_stack_unregister(
+                        model.buffer_stack, model.ctx_stack);
+                }
                 ggml_backend_buffer_free(model.buffer_w);
                 ggml_backend_buffer_free(model.buffer_kv);
+                if (model.buffer_stack)    ggml_backend_buffer_free(model.buffer_stack);
                 if (model.buffer_override) ggml_backend_buffer_free(model.buffer_override);
                 ggml_backend_free(model.backend);
                 ggml_free(model.ctx_w);
                 ggml_free(model.ctx_kv);
+                if (model.ctx_stack)    ggml_free(model.ctx_stack);
                 if (model.ctx_override) ggml_free(model.ctx_override);
                 model.buffer_w = nullptr;
                 model.buffer_kv = nullptr;
+                model.buffer_stack = nullptr;
                 model.buffer_override = nullptr;
                 model.backend = nullptr;
                 model.ctx_w = nullptr;
                 model.ctx_kv = nullptr;
+                model.ctx_stack = nullptr;
                 model.ctx_override = nullptr;
             };
 
@@ -1079,6 +1102,8 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             opts.debug           = params.debug;
             opts.verbose         = params.verbose;
             opts.n_gpu_layers    = params.n_gpu_layers;
+            // Live-input streaming uses --stream-cfm-steps for chunks.
+            // --cfm-steps is a non-streaming knob; ignored here.
             if (!params.reference_audio.empty()) {
                 if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
                                                 opts.prompt_feat_override,
@@ -1863,6 +1888,10 @@ int tts_cpp_cli_main(int argc, char ** argv) {
             opts.debug           = params.debug;
             opts.verbose         = params.verbose;
             opts.n_gpu_layers    = params.n_gpu_layers;
+            // Non-streaming CFM Euler step count (0 = GGUF default).
+            // Streaming chunks honour --stream-cfm-steps instead and copy
+            // this opts struct via `copts` further below.
+            opts.cfm_steps       = params.cfm_steps;
             if (!params.reference_audio.empty()) {
                 if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
                                                 opts.prompt_feat_override,
diff --git a/src/chatterbox_t3_internal.h b/src/chatterbox_t3_internal.h
index b3f1a93..ab68cd2 100644
--- a/src/chatterbox_t3_internal.h
+++ b/src/chatterbox_t3_internal.h
@@ -128,6 +128,25 @@ struct llama_layer {
     ggml_tensor * mlp_gate = nullptr;
     ggml_tensor * mlp_up   = nullptr;
     ggml_tensor * mlp_down = nullptr;
+
+    // Phase 15 fused-matmul stack for the Metal hot path. Allocated in
+    // a dedicated persistent buffer at load time; data is memcpy'd in
+    // from the per-tensor wq / wk / wv GGUF tensors which keep their
+    // own backing storage in the weights buffer.
+    //
+    //   wqkv : shape [n_embd, 3 * n_embd]   (Q rows ‖ K rows ‖ V rows)
+    //
+    // Stacking lets each Llama block run ONE Q4_0 mat-mul where it
+    // previously ran three. On a 30-layer × 84-token T3 step pass
+    // that's 30 * 84 * 2 ≈ 5k fewer kernel launches per call inside
+    // each command-buffer commit; the combined mat-mul is also a
+    // wider M dim (3072 vs 1024) which lets ggml-metal's mul_mm tile
+    // (NR0 = 64 row, NR1 = 32 col) saturate better on the tile loop.
+    //
+    // gate / up are NOT stacked: the multilingual T3 GGUF ships
+    // mlp_gate as F16 and mlp_up as Q4_0, and a single ggml_tensor
+    // can't hold mixed element widths.
+    ggml_tensor * wqkv = nullptr;
 };
 
 struct perceiver_weights {
@@ -172,6 +191,28 @@ struct chatterbox_model {
     std::vector<gpt2_layer>  layers;
     std::vector<llama_layer> layers_mtl;
 
+    // KV cache.
+    //
+    // Turbo (GPT-2 Medium) variant: memory_k / memory_v are sized
+    // `head_dim * n_kv_head * n_ctx * n_layer` (single batch).
+    //
+    // Multilingual (Llama-520M) variant: memory_k / memory_v hold the
+    // CFG cond+uncond pair packed into a single backing buffer, size
+    // `2 * head_dim * n_kv_head * n_ctx * n_layer` (B=2). The two halves
+    // are interleaved per-layer so each Llama block reads from one
+    // contiguous 2*kv_layer_elems region: layout per layer is
+    //   [cond:  head_dim, n_ctx, n_kv_head] [uncond: head_dim, n_ctx, n_kv_head]
+    // Layer-offset stride is therefore `2 * kv_layer_elems * sizeof(F)`.
+    // Picking the cond half is `b_offset_elems = 0`; uncond is
+    // `b_offset_elems = kv_layer_elems` (one batch's worth, applied as a
+    // per-layer offset). The B=2 batched step+prompt graphs pack both
+    // batches into the same view via ne[3]=2 + per-batch stride.
+    //
+    // The unified buffer means the existing two-call (B=1) cond/uncond
+    // CPU path keeps using memory_k/memory_v unchanged; it just selects
+    // the right half via `b_offset_elems`. memory_k_uncond / memory_v_uncond
+    // are no longer separate allocations; kept here as nullable view aliases
+    // for legacy call-sites that haven't been migrated.
     ggml_tensor * memory_k = nullptr;
     ggml_tensor * memory_v = nullptr;
 
@@ -186,6 +227,20 @@ struct chatterbox_model {
     ggml_backend_buffer_t buffer_w  = nullptr;
     ggml_backend_buffer_t buffer_kv = nullptr;
 
+    // Phase 15 stacked fused-matmul weights (wqkv per layer) live in
+    // their own backend buffer. Empty on the CPU backend; the CPU path
+    // uses the original wq/wk/wv directly.
+    //
+    // The buffer is registered in a process-wide t3_stack_registry
+    // (see src/t3_mtl.cpp) so an atexit hook can free it before
+    // Metal's static device destructors run, which otherwise asserts
+    // on `[rsets->data count] == 0` because residency sets stay
+    // referenced through buffer_stack. main()'s explicit free_t3()
+    // calls t3_stack_unregister() before freeing the backend so
+    // error-path early-returns don't double-free at exit.
+    ggml_context *        ctx_stack    = nullptr;
+    ggml_backend_buffer_t buffer_stack = nullptr;
+
     ggml_context *        ctx_override    = nullptr;
     ggml_backend_buffer_t buffer_override = nullptr;
 
diff --git a/src/chatterbox_tts.cpp b/src/chatterbox_tts.cpp
index 4f0bdd5..e9cb55a 100644
--- a/src/chatterbox_tts.cpp
+++ b/src/chatterbox_tts.cpp
@@ -458,6 +458,22 @@ static ggml_tensor * conformer_block(ggml_context * ctx, const conformer_w & w,
                                           bd_reshaped->nb[1], bd_reshaped->nb[2], 0);
     bd_final = ggml_cont(ctx, bd_final);
 
+    // Rel-pos Conformer MHA is kept on the classic ggml_soft_max +
+    // separate V mat-mul path rather than ggml_flash_attn_ext because
+    // the f16 cast of the relative-position bias `bd_final` (which
+    // flash_attn_ext requires for its mask argument — ggml.c:5320
+    // GGML_ASSERT(mask->type == GGML_TYPE_F16)) drifts the softmax
+    // output by ~1e-4 per block, which compounds through the
+    // 10-step CFM estimator downstream and fails the WAV quality
+    // gate (cos 0.998647 vs required > 0.9998, md5 differs vs the
+    // §3.22 reference 79002f09bc48dda95ec0c2cfc2b895bd). Measured
+    // speed upside was −13 ms S3Gen / −1.8 % total on M3 Ultra with
+    // Metal, Q4_0, Spanish prompt, seed 42 — real but not worth
+    // trading against the audio quality threshold. See PROGRESS
+    // §3.25 for the full negative-finding writeup. Same pattern
+    // works on parakeet.cpp (see §15.8 there) because parakeet's
+    // downstream is a joint argmax over tokens, which is invariant
+    // to sub-bit-15 precision drift in attention scores.
     ggml_tensor * scores = ggml_add(ctx, ac, bd_final);
     scores = ggml_scale(ctx, scores, 1.0f / std::sqrt((float)HD));
     ggml_tensor * attn = ggml_soft_max(ctx, scores);
@@ -827,10 +843,30 @@ static ggml_tensor * cfm_causal_k3_b(ggml_context * ctx, ggml_tensor * x,
 
 // Compute the time embedding for a single scalar t (or r).
 // Returns (TIME_EMB_DIM=1024,) after sinusoidal + 2-layer MLP.
+//
+// Cached: the graph topology (inputs, weights, output shape) is constant
+// across all 10 CFM steps. Previously each call rebuilt the graph,
+// reserved a fresh gallocr, computed, and freed — burning ~1 ms of
+// dispatch + allocator overhead per step on Metal. Per call (multilingual,
+// 10 CFM steps) that's ~10 ms; for meanflow with `compute_time_mixed`
+// it's slightly more. The cache is keyed on the backend pointer so a
+// fresh model_ctx in another thread doesn't share scaffolding.
+struct time_mlp_cache {
+    ggml_backend_t  backend = nullptr;
+    std::vector<uint8_t> buf;
+    ggml_context *  ctx    = nullptr;
+    ggml_cgraph *   gf     = nullptr;
+    ggml_gallocr_t  allocr = nullptr;
+    ggml_tensor *   x_in   = nullptr;
+    ggml_tensor *   y_out  = nullptr;
+    ~time_mlp_cache() {
+        if (allocr) ggml_gallocr_free(allocr);
+        if (ctx)    ggml_free(ctx);
+    }
+};
+
 static std::vector<float> compute_time_mlp(const model_ctx & m, float t_val) {
     const int TDIM = 320;
-    const int HIDDEN = 1280;
-    const int OUT = 1024;
     std::vector<float> t_sin(TDIM);
     float log_factor = std::log(10000.0f) / (float)(TDIM/2 - 1);
     for (int i = 0; i < TDIM/2; ++i) {
@@ -839,36 +875,40 @@ static std::vector<float> compute_time_mlp(const model_ctx & m, float t_val) {
         t_sin[i] = std::sin(arg);
         t_sin[i + TDIM/2] = std::cos(arg);
     }
-    (void)HIDDEN; (void)OUT;
-
-    static size_t buf_size = 4 * 1024 * 1024;
-    std::vector<uint8_t> buf(buf_size);
-    ggml_init_params gp = { buf_size, buf.data(), true };
-    ggml_context * ctx = ggml_init(gp);
-    ggml_cgraph * gf = ggml_new_graph(ctx);
-
-    ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TDIM);
-    ggml_set_name(x, "x"); ggml_set_input(x);
-    ggml_tensor * l1w = find_tensor(m, "cfm/time_mlp/linear_1/weight");
-    ggml_tensor * l1b = find_tensor(m, "cfm/time_mlp/linear_1/bias");
-    ggml_tensor * l2w = find_tensor(m, "cfm/time_mlp/linear_2/weight");
-    ggml_tensor * l2b = find_tensor(m, "cfm/time_mlp/linear_2/bias");
-    ggml_tensor * y = ggml_add(ctx, ggml_mul_mat(ctx, l1w, x), l1b);
-    y = ggml_silu(ctx, y);
-    y = ggml_add(ctx, ggml_mul_mat(ctx, l2w, y), l2b);
-    ggml_set_name(y, "out"); ggml_set_output(y);
-    ggml_build_forward_expand(gf, y);
-
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
-    ggml_gallocr_reserve(allocr, gf);
-    ggml_gallocr_alloc_graph(allocr, gf);
-    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "x"), t_sin.data(), 0, t_sin.size()*sizeof(float));
-    compute(m.backend, gf);
 
-    std::vector<float> out(ggml_nelements(y));
-    ggml_backend_tensor_get(y, out.data(), 0, ggml_nbytes(y));
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
+    thread_local time_mlp_cache cache;
+    if (cache.ctx == nullptr || cache.backend != m.backend) {
+        if (cache.allocr) { ggml_gallocr_free(cache.allocr); cache.allocr = nullptr; }
+        if (cache.ctx)    { ggml_free(cache.ctx); cache.ctx = nullptr; }
+        cache.buf.assign(4 * 1024 * 1024, 0);
+        ggml_init_params gp = { cache.buf.size(), cache.buf.data(), true };
+        cache.ctx = ggml_init(gp);
+        cache.gf  = ggml_new_graph(cache.ctx);
+
+        cache.x_in = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, TDIM);
+        ggml_set_name(cache.x_in, "x"); ggml_set_input(cache.x_in);
+        ggml_tensor * l1w = find_tensor(m, "cfm/time_mlp/linear_1/weight");
+        ggml_tensor * l1b = find_tensor(m, "cfm/time_mlp/linear_1/bias");
+        ggml_tensor * l2w = find_tensor(m, "cfm/time_mlp/linear_2/weight");
+        ggml_tensor * l2b = find_tensor(m, "cfm/time_mlp/linear_2/bias");
+        ggml_tensor * y = ggml_add(cache.ctx, ggml_mul_mat(cache.ctx, l1w, cache.x_in), l1b);
+        y = ggml_silu(cache.ctx, y);
+        y = ggml_add(cache.ctx, ggml_mul_mat(cache.ctx, l2w, y), l2b);
+        ggml_set_name(y, "out"); ggml_set_output(y);
+        cache.y_out = y;
+        ggml_build_forward_expand(cache.gf, cache.y_out);
+
+        cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
+        ggml_gallocr_reserve(cache.allocr, cache.gf);
+        cache.backend = m.backend;
+    }
+
+    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+    ggml_backend_tensor_set(cache.x_in, t_sin.data(), 0, t_sin.size() * sizeof(float));
+    compute(m.backend, cache.gf);
+
+    std::vector<float> out(ggml_nelements(cache.y_out));
+    ggml_backend_tensor_get(cache.y_out, out.data(), 0, ggml_nbytes(cache.y_out));
     return out;
 }
 
@@ -1380,8 +1420,13 @@ static std::vector<float> run_hift_decode(const model_ctx & m,
     std::vector<int> src_rb_ksizes = {7, 7, 11};
     std::vector<std::vector<int>> src_rb_dils = {{1,3,5},{1,3,5},{1,3,5}};
 
-    static size_t buf_size = 64 * 1024 * 1024;
-    std::vector<uint8_t> buf(buf_size);
+    // Thread-local arena: previously this was a fresh `std::vector<uint8_t>
+    // buf(64 MB)` per HiFT call, which forced a 64 MB memset on every
+    // generate (~5–10 ms on M3 Ultra). The buffer is reused across calls;
+    // each ggml_init resets the arena pointer, so we never accumulate stale
+    // tensor metadata between invocations.
+    static const size_t buf_size = 64 * 1024 * 1024;
+    thread_local std::vector<uint8_t> buf(buf_size);
     ggml_init_params gp = { buf_size, buf.data(), true };
     ggml_context * ctx = ggml_init(gp);
     ggml_cgraph * gf = ggml_new_graph_custom(ctx, 131072, false);
diff --git a/src/t3_mtl.cpp b/src/t3_mtl.cpp
index 4a63d1d..9d2e833 100644
--- a/src/t3_mtl.cpp
+++ b/src/t3_mtl.cpp
@@ -34,7 +34,9 @@
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
+#include <mutex>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -43,6 +45,67 @@ namespace tts_cpp::chatterbox::detail {
 
 namespace {
 
+// Process-wide registry of the Phase-15 stacked-weight buffers, with an
+// atexit hook that frees them before Metal's static device destructors
+// run. Without this Metal asserts on `[rsets->data count] == 0` because
+// `buffer_stack` is still live when the ggml-metal dylib tears down.
+// Mirrors `s3gen_model_cache_release` in chatterbox_tts.cpp; the
+// existing buffer_w / buffer_kv get cleaned up by other paths
+// (explicit free_t3() in error returns, dylib finaliser via the
+// model_ctx cache for s3gen, etc.) — only the new buffer_stack needs
+// to be added to the atexit chain.
+struct t3_stack_entry {
+    ggml_backend_buffer_t buffer = nullptr;
+    ggml_context *        ctx    = nullptr;
+};
+std::mutex                  t3_stack_mu;
+std::vector<t3_stack_entry> t3_stack_registry;
+bool                        t3_stack_atexit_registered = false;
+
+void t3_stack_release_atexit() {
+    std::lock_guard<std::mutex> lk(t3_stack_mu);
+    for (auto & e : t3_stack_registry) {
+        if (e.buffer) {
+            ggml_backend_buffer_free(e.buffer);
+            e.buffer = nullptr;
+        }
+        if (e.ctx) {
+            ggml_free(e.ctx);
+            e.ctx = nullptr;
+        }
+    }
+    t3_stack_registry.clear();
+}
+
+}  // anonymous namespace
+
+void t3_stack_register(ggml_backend_buffer_t buf, ggml_context * ctx) {
+    std::lock_guard<std::mutex> lk(t3_stack_mu);
+    t3_stack_registry.push_back({buf, ctx});
+    if (!t3_stack_atexit_registered) {
+        std::atexit(t3_stack_release_atexit);
+        t3_stack_atexit_registered = true;
+    }
+}
+
+// Drop a (buffer, ctx) pair from the atexit registry without freeing.
+// Used by free_t3() in main on error-path early-returns: free_t3 itself
+// frees buffer_stack + ctx_stack so the backend can shut down cleanly in
+// the same scope; the atexit hook would otherwise double-free dangling
+// pointers if we didn't pull them out of the registry first.
+void t3_stack_unregister(ggml_backend_buffer_t buf, ggml_context * ctx) {
+    std::lock_guard<std::mutex> lk(t3_stack_mu);
+    for (auto it = t3_stack_registry.begin(); it != t3_stack_registry.end(); ) {
+        if (it->buffer == buf && it->ctx == ctx) {
+            it = t3_stack_registry.erase(it);
+        } else {
+            ++it;
+        }
+    }
+}
+
+namespace {
+
 int64_t require_key(const gguf_context * ctx, const char * key) {
     int64_t id = gguf_find_key(ctx, key);
     if (id < 0) throw std::runtime_error(std::string("missing GGUF key: ") + key);
@@ -181,13 +244,24 @@ ggml_tensor * build_perceiver(ggml_context * ctx,
 // One Llama transformer block.  Writes K/V into the selected KV cache
 // tensors at positions [n_past, n_past + N).
 //
-// inpL:       (n_embd, N)
-// memory_k/v: 1D F32 buffers of size (head_dim * n_kv_head * n_ctx * n_layer)
+// inpL:       (n_embd, N)            for B=1
+//             (n_embd, N, 2)         for B=2 (cond + uncond packed as ne[2])
+// memory_k/v: 1D F32 buffer holding the **cond+uncond pair** for MTL:
+//             size = 2 * head_dim * n_kv_head * n_ctx * n_layer.
+//             Per-layer slab is `2 * kv_layer_elems`; cond at offset 0
+//             within the slab, uncond at offset kv_layer_elems.
+//
+// b_offset_elems selects which half is touched in the B=1 path:
+//             0            → cond pass writes/reads the cond slab
+//             kv_layer_elems → uncond pass writes/reads the uncond slab
+// In the B=2 path b_offset_elems is ignored: ne[3]=2 spans both halves
+// and per-batch stride is `kv_layer_elems * sizeof(float)`.
 ggml_tensor * build_llama_block(ggml_context * ctx, ggml_cgraph * gf,
                                 const chatterbox_model & m,
                                 int il,
                                 ggml_tensor * inpL,
-                                int n_past, int N,
+                                int n_past, int N, int B,
+                                size_t b_offset_elems,
                                 ggml_tensor * memory_k,
                                 ggml_tensor * memory_v,
                                 ggml_tensor * pos_ids,
@@ -200,26 +274,88 @@ ggml_tensor * build_llama_block(ggml_context * ctx, ggml_cgraph * gf,
     const int n_ctx = hp.n_ctx;
     const int64_t L = n_past + N;
 
-    const size_t kv_head_stride  = (size_t) HD * n_ctx * sizeof(float);
-    const size_t kv_pos_stride   = (size_t) HD * sizeof(float);
-    const size_t kv_layer_elems  = (size_t) HD * n_ctx * NKV;
-    const size_t layer_off = (size_t) il * kv_layer_elems * sizeof(float);
+    // KV strides are sized off the cache dtype (F32 historically; F16
+    // since Phase 2 to halve KV bandwidth) so the same builder works for
+    // either precision without re-deriving offsets per-call.
+    const size_t kv_ts = ggml_type_size(memory_k->type);
+    const size_t kv_head_stride   = (size_t) HD * n_ctx * kv_ts;
+    const size_t kv_pos_stride    = (size_t) HD * kv_ts;
+    const size_t kv_layer_elems   = (size_t) HD * n_ctx * NKV;       // one batch slab
+    const size_t kv_batch_stride  = kv_layer_elems * kv_ts;          // step from cond to uncond
+    const size_t kv_layer_stride  = (size_t) 2 * kv_batch_stride;    // per-layer slab is 2x
+    const size_t layer_off = (size_t) il * kv_layer_stride
+                           + b_offset_elems * kv_ts;
 
     // Pre-attention RMSNorm (no bias).
     ggml_tensor * cur = ggml_rms_norm(ctx, inpL, hp.eps);
     cur = ggml_mul(ctx, cur, l.ln_attn_g);
 
-    ggml_tensor * Qlin = ggml_mul_mat(ctx, l.wq, cur);  // (n_embd, N)
-    ggml_tensor * Klin = ggml_mul_mat(ctx, l.wk, cur);
-    ggml_tensor * Vlin = ggml_mul_mat(ctx, l.wv, cur);
+    // Q/K/V mat-muls.  When the Phase-15 stacked W_qkv is available
+    // (Metal hot path) we run ONE Q4_0 mat-mul producing
+    // (3 * n_embd, N, B), then slice Q/K/V via strided views straight
+    // into the (HD, NH, N[, B]) shape that RoPE expects — no
+    // ggml_reshape (would require a contiguous source) and no
+    // ggml_cont (would defeat the saving). RoPE's metal kernel walks
+    // src via per-element nb00/nb01/nb02/nb03 strides so it handles
+    // the non-contiguous N stride on the slice transparently.
+    const int n_embd_t = hp.n_embd;
+    ggml_tensor * Qlin;
+    ggml_tensor * Klin;
+    ggml_tensor * Vlin;
+    bool used_stacked_qkv = false;
+    if (l.wqkv) {
+        ggml_tensor * QKV = ggml_mul_mat(ctx, l.wqkv, cur);  // (3*n_embd, N) or (3*n_embd, N, B)
+        used_stacked_qkv = true;
+        const size_t f = sizeof(float);
+        const size_t row_stride   = (size_t) 3 * n_embd_t * f;
+        const size_t batch_stride = row_stride * (size_t) N;
+        const size_t off_q = 0 * (size_t) n_embd_t * f;
+        const size_t off_k = 1 * (size_t) n_embd_t * f;
+        const size_t off_v = 2 * (size_t) n_embd_t * f;
+        if (B == 1) {
+            Qlin = ggml_view_2d(ctx, QKV, n_embd_t, N, row_stride, off_q);
+            Klin = ggml_view_2d(ctx, QKV, n_embd_t, N, row_stride, off_k);
+            Vlin = ggml_view_2d(ctx, QKV, n_embd_t, N, row_stride, off_v);
+        } else {
+            Qlin = ggml_view_3d(ctx, QKV, n_embd_t, N, B, row_stride, batch_stride, off_q);
+            Klin = ggml_view_3d(ctx, QKV, n_embd_t, N, B, row_stride, batch_stride, off_k);
+            Vlin = ggml_view_3d(ctx, QKV, n_embd_t, N, B, row_stride, batch_stride, off_v);
+        }
+    } else {
+        Qlin = ggml_mul_mat(ctx, l.wq, cur);
+        Klin = ggml_mul_mat(ctx, l.wk, cur);
+        Vlin = ggml_mul_mat(ctx, l.wv, cur);
+    }
 
-    // Reshape to (HD, n_head, N). ggml_rope_ext requires ne[2] == len(pos_ids),
-    // so sequence must be on ne[2] at the rope call.
-    ggml_tensor * Q = ggml_reshape_3d(ctx, Qlin, HD, NH,  N);  // (HD, NH,  N)
-    ggml_tensor * K = ggml_reshape_3d(ctx, Klin, HD, NKV, N);  // (HD, NKV, N)
-    ggml_tensor * V = ggml_reshape_3d(ctx, Vlin, HD, NKV, N);  // (HD, NKV, N)
+    // Reshape to (HD, n_head, N) [B=1] or (HD, n_head, N, B) [B=2].
+    // ggml_rope_ext requires ne[2] == len(pos_ids), so sequence stays on
+    // ne[2] at the rope call; the optional batch dim sits at ne[3].
+    //
+    // Use ggml_view_3d/4d (not ggml_reshape) so the same code path
+    // works whether Q/K/V came from contiguous per-head mul_mats
+    // (un-stacked path) or from strided slices of the W_qkv mul_mat
+    // (Phase-15 stacked path). RoPE's metal kernel walks src via
+    // per-element nb01/nb02/nb03 strides so the strided N step is
+    // transparent.
+    ggml_tensor * Q;
+    ggml_tensor * K;
+    ggml_tensor * V;
+    {
+        const size_t f = sizeof(float);
+        if (B == 1) {
+            Q = ggml_view_3d(ctx, Qlin, HD, NH,  N, HD * f, Qlin->nb[1], 0);
+            K = ggml_view_3d(ctx, Klin, HD, NKV, N, HD * f, Klin->nb[1], 0);
+            V = ggml_view_3d(ctx, Vlin, HD, NKV, N, HD * f, Vlin->nb[1], 0);
+        } else {
+            Q = ggml_view_4d(ctx, Qlin, HD, NH,  N, B, HD * f, Qlin->nb[1], Qlin->nb[2], 0);
+            K = ggml_view_4d(ctx, Klin, HD, NKV, N, B, HD * f, Klin->nb[1], Klin->nb[2], 0);
+            V = ggml_view_4d(ctx, Vlin, HD, NKV, N, B, HD * f, Vlin->nb[1], Vlin->nb[2], 0);
+        }
+    }
+    (void) used_stacked_qkv;
 
     // RoPE on Q and K (NEOX-style half-split convention used by Llama).
+    // ggml_rope_ext broadcasts cleanly over an optional batch dim at ne[3].
     const int rope_mode = GGML_ROPE_TYPE_NEOX;
     Q = ggml_rope_ext(ctx, Q, pos_ids, m.rope_freq_factors,
                       HD, rope_mode, hp.rope_orig_max_pos,
@@ -228,55 +364,98 @@ ggml_tensor * build_llama_block(ggml_context * ctx, ggml_cgraph * gf,
                       HD, rope_mode, hp.rope_orig_max_pos,
                       hp.rope_theta, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
 
-    // Flash attention (Turbo-style) expects (HD, N, NH).  Permute from
-    // (HD, NH, N) -> (HD, N, NH) and then the KV cache keeps the same
-    // [HD, n_ctx, n_head] layout used in src/main.cpp, so flash_attn can
-    // read a contiguous slice without another permute at read time.
-    Q = ggml_cont(ctx, ggml_permute(ctx, Q, 0, 2, 1, 3));  // (HD, N,  NH)
-    K = ggml_cont(ctx, ggml_permute(ctx, K, 0, 2, 1, 3));  // (HD, N,  NKV)
-    V = ggml_cont(ctx, ggml_permute(ctx, V, 0, 2, 1, 3));  // (HD, N,  NKV)
+    // Flash attention expects (HD, N, NH[, B]).  Permute (0, 2, 1, 3) lifts
+    // N to ne[1] so the KV cache keeps a [HD, n_ctx, n_kv_head] inner-3D
+    // layout that flash_attn can read contiguously per (head, batch).
+    Q = ggml_cont(ctx, ggml_permute(ctx, Q, 0, 2, 1, 3));
+    K = ggml_cont(ctx, ggml_permute(ctx, K, 0, 2, 1, 3));
+    V = ggml_cont(ctx, ggml_permute(ctx, V, 0, 2, 1, 3));
 
     // Write K/V into the cache at [n_past : n_past+N) for this layer.
     {
-        ggml_tensor * k_dst = ggml_view_3d(ctx, memory_k,
-            HD, N, NKV,
-            kv_pos_stride, kv_head_stride,
-            layer_off + (size_t) n_past * kv_pos_stride);
-        ggml_tensor * v_dst = ggml_view_3d(ctx, memory_v,
-            HD, N, NKV,
-            kv_pos_stride, kv_head_stride,
-            layer_off + (size_t) n_past * kv_pos_stride);
+        ggml_tensor * k_dst;
+        ggml_tensor * v_dst;
+        if (B == 1) {
+            k_dst = ggml_view_3d(ctx, memory_k,
+                HD, N, NKV,
+                kv_pos_stride, kv_head_stride,
+                layer_off + (size_t) n_past * kv_pos_stride);
+            v_dst = ggml_view_3d(ctx, memory_v,
+                HD, N, NKV,
+                kv_pos_stride, kv_head_stride,
+                layer_off + (size_t) n_past * kv_pos_stride);
+        } else {
+            k_dst = ggml_view_4d(ctx, memory_k,
+                HD, N, NKV, B,
+                kv_pos_stride, kv_head_stride, kv_batch_stride,
+                layer_off + (size_t) n_past * kv_pos_stride);
+            v_dst = ggml_view_4d(ctx, memory_v,
+                HD, N, NKV, B,
+                kv_pos_stride, kv_head_stride, kv_batch_stride,
+                layer_off + (size_t) n_past * kv_pos_stride);
+        }
         ggml_build_forward_expand(gf, ggml_cpy(ctx, K, k_dst));
         ggml_build_forward_expand(gf, ggml_cpy(ctx, V, v_dst));
     }
 
     // Attention: read the full [0, L) slice from the cache.
-    ggml_tensor * Kfull = ggml_view_3d(ctx, memory_k,
-        HD, L, NKV,
-        kv_pos_stride, kv_head_stride,
-        layer_off);
-    ggml_tensor * Vfull = ggml_view_3d(ctx, memory_v,
-        HD, L, NKV,
-        kv_pos_stride, kv_head_stride,
-        layer_off);
+    ggml_tensor * Kfull;
+    ggml_tensor * Vfull;
+    if (B == 1) {
+        Kfull = ggml_view_3d(ctx, memory_k,
+            HD, L, NKV,
+            kv_pos_stride, kv_head_stride,
+            layer_off);
+        Vfull = ggml_view_3d(ctx, memory_v,
+            HD, L, NKV,
+            kv_pos_stride, kv_head_stride,
+            layer_off);
+    } else {
+        Kfull = ggml_view_4d(ctx, memory_k,
+            HD, L, NKV, B,
+            kv_pos_stride, kv_head_stride, kv_batch_stride,
+            layer_off);
+        Vfull = ggml_view_4d(ctx, memory_v,
+            HD, L, NKV, B,
+            kv_pos_stride, kv_head_stride, kv_batch_stride,
+            layer_off);
+    }
 
     const float scale = 1.0f / std::sqrt((float) HD);
     ggml_tensor * attn = ggml_flash_attn_ext(ctx, Q, Kfull, Vfull, kq_mask,
                                              scale, 0.0f, 0.0f);
-    // attn: (HD, NH, N, 1) -> (n_embd, N)
-    cur = ggml_reshape_2d(ctx, attn, hp.n_embd, N);
+    // attn ne=[HD, NH, N, B].  Reshape back to (n_embd, N[, B]).
+    if (B == 1) {
+        cur = ggml_reshape_2d(ctx, attn, hp.n_embd, N);
+    } else {
+        cur = ggml_reshape_3d(ctx, attn, hp.n_embd, N, B);
+    }
 
     // O-proj + residual.
     cur = ggml_mul_mat(ctx, l.wo, cur);
     cur = ggml_add(ctx, cur, inpL);
 
     // MLP (SwiGLU) with pre-norm + residual.
+    //
+    // Phase 15 stacks `[W_gate ‖ W_up]` along the M dim so a single
+    // Q4_0 mat-mul produces (2 * n_ff, N, B); ggml_swiglu (the
+    // single-arg variant, GGML_GLU_OP_SWIGLU on the stacked tensor)
+    // splits the result internally and fuses
+    // `silu(first_half) * second_half` into one Metal kernel
+    // (kernel_swiglu_f32). Net effect per layer per step: 2 mat-muls
+    // + 1 swiglu instead of 2 mat-muls + 1 swiglu_split, **plus**
+    // one fewer mul_mat dispatch.
+    //
+    // Pre-norm `mul(rms_norm(x), g)` is already auto-fused upstream
+    // by ggml-metal's `can_fuse(RMS_NORM, MUL)` path
+    // (kernel_rms_norm_mul_f32) — leave it written as the obvious
+    // two ops so CPU + non-Metal backends get the same shape.
     ggml_tensor * inpFF = cur;
     ggml_tensor * norm2 = ggml_mul(ctx, ggml_rms_norm(ctx, cur, hp.eps), l.ln_mlp_g);
-    ggml_tensor * gate  = ggml_silu(ctx, ggml_mul_mat(ctx, l.mlp_gate, norm2));
-    ggml_tensor * up    = ggml_mul_mat(ctx, l.mlp_up, norm2);
-    ggml_tensor * mlp   = ggml_mul(ctx, gate, up);
-    ggml_tensor * down  = ggml_mul_mat(ctx, l.mlp_down, mlp);
+    ggml_tensor * gate = ggml_mul_mat(ctx, l.mlp_gate, norm2);
+    ggml_tensor * up   = ggml_mul_mat(ctx, l.mlp_up,   norm2);
+    ggml_tensor * mlp  = ggml_swiglu_split(ctx, gate, up);
+    ggml_tensor * down = ggml_mul_mat(ctx, l.mlp_down, mlp);
     return ggml_add(ctx, inpFF, down);
 }
 
@@ -390,12 +569,16 @@ ggml_cgraph * build_prompt_graph_mtl(const chatterbox_model & model,
     inp = ggml_concat(ctx, inp, speech_emb_out, /*dim=*/1);
     inp = ggml_concat(ctx, inp, speech_emb_out, /*dim=*/1);
 
-    // 5. Run 30 Llama layers.
-    ggml_tensor * mem_k = is_uncond ? model.memory_k_uncond : model.memory_k;
-    ggml_tensor * mem_v = is_uncond ? model.memory_v_uncond : model.memory_v;
+    // 5. Run 30 Llama layers.  Cond/uncond share one memory_k/memory_v
+    // buffer (size 2 * kv_layer_elems per layer); pick the right half via
+    // b_offset_elems.
+    const size_t kv_layer_elems = (size_t) hp.head_dim * hp.n_kv_head * hp.n_ctx;
+    const size_t b_off = is_uncond ? kv_layer_elems : 0;
     ggml_tensor * cur = inp;
     for (int il = 0; il < hp.n_layer; ++il) {
-        cur = build_llama_block(ctx, gf, model, il, cur, /*n_past=*/0, N, mem_k, mem_v,
+        cur = build_llama_block(ctx, gf, model, il, cur, /*n_past=*/0, N,
+                                /*B=*/1, b_off,
+                                model.memory_k, model.memory_v,
                                 pos_ids, kq_mask);
     }
 
@@ -413,6 +596,157 @@ ggml_cgraph * build_prompt_graph_mtl(const chatterbox_model & model,
     return gf;
 }
 
+// B=2 prompt graph: pack cond + uncond into a single forward over the
+// batch dim (ne[2]).  cond_emb (spkr+perceiver+emotion) is identical
+// between the two passes, so we just duplicate it; the text-token
+// embedding differs (uncond zeroes the token part but keeps the learned
+// positional embedding).  Output: (n_speech_vocab, 1, 2) with cond at
+// b=0 and uncond at b=1.  Mirrors the use_b2 pattern from
+// src/chatterbox_tts.cpp:1994 (S3Gen CFM CFG).
+ggml_cgraph * build_prompt_graph_mtl_b2(const chatterbox_model & model,
+                                        int n_text_tokens) {
+    const auto & hp = model.hparams;
+    const int len_cond = 1 + hp.perceiver_queries + (hp.emotion_adv ? 1 : 0);
+    const int N = len_cond + n_text_tokens + 2;
+
+    static size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES +
+                             ggml_graph_overhead_custom(CHBX_MAX_NODES, false);
+    thread_local std::vector<uint8_t> buf(buf_size);
+    ggml_init_params p = { buf_size, buf.data(), true };
+    ggml_context * ctx = ggml_init(p);
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx, CHBX_MAX_NODES, false);
+
+    ggml_tensor * text_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_text_tokens);
+    ggml_set_name(text_tokens, "text_tokens");  ggml_set_input(text_tokens);
+
+    ggml_tensor * speech_bos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+    ggml_set_name(speech_bos, "speech_bos");  ggml_set_input(speech_bos);
+
+    ggml_tensor * pos_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    ggml_set_name(pos_ids, "pos_ids");  ggml_set_input(pos_ids);
+
+    ggml_tensor * text_pos_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_text_tokens);
+    ggml_set_name(text_pos_ids, "text_pos_ids");  ggml_set_input(text_pos_ids);
+
+    ggml_tensor * speech_pos0 = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+    ggml_set_name(speech_pos0, "speech_pos0");  ggml_set_input(speech_pos0);
+
+    ggml_tensor * exaggeration = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
+    ggml_set_name(exaggeration, "exaggeration");  ggml_set_input(exaggeration);
+
+    ggml_tensor * kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, N, N);
+    ggml_set_name(kq_mask, "kq_mask");  ggml_set_input(kq_mask);
+
+    // Cond fragment (n_embd, 34) — shared between cond + uncond passes.
+    ggml_tensor * cond_emb = build_cond_emb(ctx, model, exaggeration);
+
+    // Text embedding diverges between the two passes:
+    //   cond:   speech_emb[tokens] + text_pos_emb[0..T)
+    //   uncond: text_pos_emb[0..T) only (text-token contribution zeroed)
+    ggml_tensor * text_pos_emb_seq = ggml_get_rows(ctx, model.text_pos_emb, text_pos_ids);
+    ggml_tensor * text_tok_emb     = ggml_get_rows(ctx, model.text_emb, text_tokens);
+    ggml_tensor * text_cond   = ggml_add(ctx, text_tok_emb, text_pos_emb_seq);
+    ggml_tensor * text_uncond = text_pos_emb_seq;
+
+    // Speech BOS embeddings (shared between passes).
+    ggml_tensor * speech_tok_emb    = ggml_get_rows(ctx, model.speech_emb, speech_bos);
+    ggml_tensor * speech_pos_emb_0  = ggml_get_rows(ctx, model.speech_pos_emb, speech_pos0);
+    ggml_tensor * speech_emb_out    = ggml_add(ctx, speech_tok_emb, speech_pos_emb_0);
+
+    // Per-batch input assembly (matches the B=1 prompt graph's order):
+    //   [cond_emb | text_emb_X | speech_emb | speech_emb] → (n_embd, N)
+    auto assemble_one = [&](ggml_tensor * text) {
+        ggml_tensor * inp = ggml_concat(ctx, cond_emb, text, /*dim=*/1);
+        inp = ggml_concat(ctx, inp, speech_emb_out, /*dim=*/1);
+        inp = ggml_concat(ctx, inp, speech_emb_out, /*dim=*/1);
+        return inp;
+    };
+    ggml_tensor * inp_cond   = assemble_one(text_cond);
+    ggml_tensor * inp_uncond = assemble_one(text_uncond);
+
+    // Stack along the batch dim: (n_embd, N, 1) + (n_embd, N, 1) → (n_embd, N, 2).
+    ggml_tensor * inp_b2 = ggml_concat(ctx,
+        ggml_reshape_3d(ctx, inp_cond,   hp.n_embd, N, 1),
+        ggml_reshape_3d(ctx, inp_uncond, hp.n_embd, N, 1),
+        /*dim=*/2);
+
+    ggml_tensor * cur = inp_b2;
+    for (int il = 0; il < hp.n_layer; ++il) {
+        cur = build_llama_block(ctx, gf, model, il, cur, /*n_past=*/0, N,
+                                /*B=*/2, /*b_offset_elems=*/0,
+                                model.memory_k, model.memory_v,
+                                pos_ids, kq_mask);
+    }
+
+    // Final norm + head.  cur ne=[n_embd, N, 2]; take last position only,
+    // resulting in (n_embd, 1, 2), then mat_mul with speech_head (which
+    // broadcasts over batch) to (n_speech_vocab, 1, 2).
+    cur = ggml_mul(ctx, ggml_rms_norm(ctx, cur, hp.eps), model.norm_g);
+    ggml_tensor * last = ggml_view_3d(ctx, cur,
+        hp.n_embd, 1, 2,
+        cur->nb[1], cur->nb[2],
+        (size_t)(N - 1) * cur->nb[1]);
+    last = ggml_cont(ctx, last);  // mat_mul wants contiguous src1 over batches
+    ggml_tensor * logits = ggml_mul_mat(ctx, model.speech_head, last);
+    ggml_set_name(logits, "logits");  ggml_set_output(logits);
+    ggml_build_forward_expand(gf, logits);
+
+    ggml_free(ctx);
+    return gf;
+}
+
+// B=2 step graph: same input speech token + position fed into both cond
+// and uncond passes (the sampler combined the previous logits and chose a
+// single token).  The two batches diverge only via the KV cache, which
+// already differs from the B=2 prompt graph that wrote them.
+ggml_cgraph * build_step_graph_mtl_b2(const chatterbox_model & model,
+                                      int n_past) {
+    const auto & hp = model.hparams;
+
+    static size_t buf_size = ggml_tensor_overhead() * CHBX_MAX_NODES +
+                             ggml_graph_overhead_custom(CHBX_MAX_NODES, false);
+    thread_local std::vector<uint8_t> buf(buf_size);
+    ggml_init_params p = { buf_size, buf.data(), true };
+    ggml_context * ctx = ggml_init(p);
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx, CHBX_MAX_NODES, false);
+
+    ggml_tensor * speech_token = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+    ggml_set_name(speech_token, "speech_token"); ggml_set_input(speech_token);
+
+    ggml_tensor * speech_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+    ggml_set_name(speech_pos, "speech_pos"); ggml_set_input(speech_pos);
+
+    ggml_tensor * pos_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+    ggml_set_name(pos_ids, "pos_ids"); ggml_set_input(pos_ids);
+
+    // inp_b1 = speech_emb[tok] + speech_pos_emb[pos]  → (n_embd, 1).
+    // Both batches see the same input embedding; broadcast to (n_embd, 1, 2)
+    // via ggml_concat.  The materialization cost is ~4 KB per token and
+    // amortises across 30 Llama layers.
+    ggml_tensor * inp_b1 = ggml_add(ctx,
+        ggml_get_rows(ctx, model.speech_emb,     speech_token),
+        ggml_get_rows(ctx, model.speech_pos_emb, speech_pos));
+    ggml_tensor * inp_b1_3d = ggml_reshape_3d(ctx, inp_b1, hp.n_embd, 1, 1);
+    ggml_tensor * inp = ggml_concat(ctx, inp_b1_3d, inp_b1_3d, /*dim=*/2);
+
+    ggml_tensor * cur = inp;
+    for (int il = 0; il < hp.n_layer; ++il) {
+        cur = build_llama_block(ctx, gf, model, il, cur, n_past, /*N=*/1,
+                                /*B=*/2, /*b_offset_elems=*/0,
+                                model.memory_k, model.memory_v,
+                                pos_ids, /*kq_mask=*/nullptr);
+    }
+    cur = ggml_mul(ctx, ggml_rms_norm(ctx, cur, hp.eps), model.norm_g);
+
+    // cur ne=[n_embd, 1, 2] → speech_head @ cur → (n_speech_vocab, 1, 2)
+    ggml_tensor * logits = ggml_mul_mat(ctx, model.speech_head, cur);
+    ggml_set_name(logits, "logits"); ggml_set_output(logits);
+    ggml_build_forward_expand(gf, logits);
+
+    ggml_free(ctx);
+    return gf;
+}
+
 ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model,
                                    int n_past,
                                    bool is_uncond) {
@@ -438,12 +772,14 @@ ggml_cgraph * build_step_graph_mtl(const chatterbox_model & model,
         ggml_get_rows(ctx, model.speech_emb, speech_token),
         ggml_get_rows(ctx, model.speech_pos_emb, speech_pos));
 
-    ggml_tensor * mem_k = is_uncond ? model.memory_k_uncond : model.memory_k;
-    ggml_tensor * mem_v = is_uncond ? model.memory_v_uncond : model.memory_v;
+    const size_t kv_layer_elems = (size_t) hp.head_dim * hp.n_kv_head * hp.n_ctx;
+    const size_t b_off = is_uncond ? kv_layer_elems : 0;
 
     ggml_tensor * cur = inp;
     for (int il = 0; il < hp.n_layer; ++il) {
-        cur = build_llama_block(ctx, gf, model, il, cur, n_past, /*N=*/1, mem_k, mem_v,
+        cur = build_llama_block(ctx, gf, model, il, cur, n_past, /*N=*/1,
+                                /*B=*/1, b_off,
+                                model.memory_k, model.memory_v,
                                 pos_ids, /*kq_mask=*/nullptr);
     }
     cur = ggml_mul(ctx, ggml_rms_norm(ctx, cur, hp.eps), model.norm_g);
@@ -481,10 +817,7 @@ bool run_prompt_pass(const chatterbox_model & model,
     prompt_len_out = N;
 
     ggml_cgraph * gf = build_prompt_graph_mtl(model, (int) text_tokens.size(), is_uncond);
-    if (!ggml_gallocr_reserve(allocr, gf)) {
-        fprintf(stderr, "run_prompt_pass: gallocr_reserve failed\n");
-        return false;
-    }
+    // alloc_graph reserves lazily; see run_step_pass_b2 comment.
     if (!ggml_gallocr_alloc_graph(allocr, gf)) {
         fprintf(stderr, "run_prompt_pass: gallocr_alloc_graph failed (graph topology exceeded reserved budget?)\n");
         return false;
@@ -536,6 +869,121 @@ bool run_prompt_pass(const chatterbox_model & model,
     return true;
 }
 
+// Run the prompt graph as a single batch=2 forward (cond on b=0, uncond
+// on b=1).  Output logits shape: (n_speech_vocab, 1, 2); we read the
+// cond half into logits_cond and the uncond half into logits_uncond.
+bool run_prompt_pass_b2(const chatterbox_model & model,
+                        ggml_gallocr_t allocr,
+                        int n_threads,
+                        const std::vector<int32_t> & text_tokens,
+                        float exaggeration,
+                        std::vector<float> & logits_cond_out,
+                        std::vector<float> & logits_uncond_out,
+                        int & prompt_len_out) {
+    const auto & hp = model.hparams;
+    const int len_cond = 1 + hp.perceiver_queries + (hp.emotion_adv ? 1 : 0);
+    const int N = len_cond + (int) text_tokens.size() + 2;
+    prompt_len_out = N;
+
+    ggml_cgraph * gf = build_prompt_graph_mtl_b2(model, (int) text_tokens.size());
+    // alloc_graph below already reserves lazily via ggml_gallocr_needs_realloc;
+    // see run_step_pass_b2 for the rationale on dropping the explicit
+    // ggml_gallocr_reserve(allocr, gf) call here.
+    if (!ggml_gallocr_alloc_graph(allocr, gf)) {
+        fprintf(stderr, "run_prompt_pass_b2: gallocr_alloc_graph failed (graph topology exceeded reserved budget?)\n");
+        return false;
+    }
+
+    auto set_in = [&](const char * name, const void * data, size_t bytes) {
+        ggml_tensor * t = ggml_graph_get_tensor(gf, name);
+        if (t) ggml_backend_tensor_set(t, data, 0, bytes);
+    };
+    set_in("text_tokens", text_tokens.data(), text_tokens.size() * sizeof(int32_t));
+    int32_t bos = hp.start_speech_token;
+    set_in("speech_bos", &bos, sizeof(bos));
+
+    std::vector<int32_t> pos(N);
+    for (int i = 0; i < N; ++i) pos[i] = i;
+    set_in("pos_ids", pos.data(), pos.size() * sizeof(int32_t));
+
+    std::vector<int32_t> text_pos(text_tokens.size());
+    for (size_t i = 0; i < text_tokens.size(); ++i) text_pos[i] = (int32_t) i;
+    set_in("text_pos_ids", text_pos.data(), text_pos.size() * sizeof(int32_t));
+
+    int32_t sp0 = 0;
+    set_in("speech_pos0", &sp0, sizeof(sp0));
+
+    const int cond_prompt_len = hp.cond_prompt_len;
+    std::vector<int32_t> cond_pos(cond_prompt_len);
+    for (int i = 0; i < cond_prompt_len; ++i) cond_pos[i] = i;
+    set_in("cond_prompt_pos_ids", cond_pos.data(), cond_pos.size() * sizeof(int32_t));
+
+    float exag = exaggeration;
+    set_in("exaggeration", &exag, sizeof(exag));
+
+    std::vector<ggml_fp16_t> mask;
+    fill_causal_mask_f16(mask, N);
+    set_in("kq_mask", mask.data(), mask.size() * sizeof(ggml_fp16_t));
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
+    ggml_backend_graph_compute(model.backend, gf);
+
+    ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
+    // logits ne=[n_speech_vocab, 1, 2], contiguous.  Cond at b=0, uncond at b=1.
+    const size_t per_batch_bytes = (size_t) hp.n_speech_vocab * sizeof(float);
+    logits_cond_out.resize(hp.n_speech_vocab);
+    logits_uncond_out.resize(hp.n_speech_vocab);
+    ggml_backend_tensor_get(logits, logits_cond_out.data(),   0,                per_batch_bytes);
+    ggml_backend_tensor_get(logits, logits_uncond_out.data(), per_batch_bytes, per_batch_bytes);
+    return true;
+}
+
+// B=2 step pass: one forward producing both cond + uncond logits.
+bool run_step_pass_b2(const chatterbox_model & model,
+                      ggml_gallocr_t allocr,
+                      int n_threads,
+                      int n_past,
+                      int32_t token,
+                      std::vector<float> & logits_cond_out,
+                      std::vector<float> & logits_uncond_out) {
+    const auto & hp = model.hparams;
+
+    ggml_cgraph * gf = build_step_graph_mtl_b2(model, n_past);
+    // Skip the explicit ggml_gallocr_reserve(allocr, gf) call here:
+    // alloc_graph below already calls ggml_gallocr_needs_realloc, and
+    // only re-runs the topology analysis when the graph actually grew
+    // (single-buffer single-backend case — the default for chatterbox).
+    // The per-step graph keeps the same node count + per-node tensor
+    // shapes for every n_past >= 1, so after the first call alloc_graph
+    // is a fast O(n_nodes) buffer-reset; the explicit reserve forced an
+    // unnecessary topology re-walk on every one of the 84 step calls.
+    if (!ggml_gallocr_alloc_graph(allocr, gf)) {
+        fprintf(stderr, "run_step_pass_b2: gallocr_alloc_graph failed (n_past=%d)\n", n_past);
+        return false;
+    }
+
+    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "speech_token"), &token, 0, sizeof(token));
+    int32_t sp = n_past;
+    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "speech_pos"), &sp, 0, sizeof(sp));
+    int32_t pos = n_past;
+    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "pos_ids"), &pos, 0, sizeof(pos));
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
+    ggml_backend_graph_compute(model.backend, gf);
+
+    ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
+    const size_t per_batch_bytes = (size_t) hp.n_speech_vocab * sizeof(float);
+    logits_cond_out.resize(hp.n_speech_vocab);
+    logits_uncond_out.resize(hp.n_speech_vocab);
+    ggml_backend_tensor_get(logits, logits_cond_out.data(),   0,                per_batch_bytes);
+    ggml_backend_tensor_get(logits, logits_uncond_out.data(), per_batch_bytes, per_batch_bytes);
+    return true;
+}
+
 bool run_step_pass(const chatterbox_model & model,
                    ggml_gallocr_t allocr,
                    int n_threads,
@@ -544,10 +992,7 @@ bool run_step_pass(const chatterbox_model & model,
                    bool is_uncond,
                    std::vector<float> & logits_out) {
     ggml_cgraph * gf = build_step_graph_mtl(model, n_past, is_uncond);
-    if (!ggml_gallocr_reserve(allocr, gf)) {
-        fprintf(stderr, "run_step_pass: gallocr_reserve failed\n");
-        return false;
-    }
+    // alloc_graph reserves lazily; see run_step_pass_b2 comment.
     if (!ggml_gallocr_alloc_graph(allocr, gf)) {
         fprintf(stderr, "run_step_pass: gallocr_alloc_graph failed (n_past=%d)\n", n_past);
         return false;
@@ -688,14 +1133,15 @@ ggml_cgraph * build_stage_layers_graph(const chatterbox_model & m, int N,
         ggml_set_name(kq_mask, "kq_mask"); ggml_set_input(kq_mask);
     }
 
-    ggml_tensor * mem_k = is_uncond ? m.memory_k_uncond : m.memory_k;
-    ggml_tensor * mem_v = is_uncond ? m.memory_v_uncond : m.memory_v;
+    const size_t kv_layer_elems = (size_t) hp.head_dim * hp.n_kv_head * hp.n_ctx;
+    const size_t b_off = is_uncond ? kv_layer_elems : 0;
 
     ggml_tensor * cur = inp;
     const int actual_layers = std::min(n_layers, hp.n_layer);
     for (int il = 0; il < actual_layers; ++il) {
         cur = build_llama_block(ctx, gf, m, il, cur, /*n_past=*/0, N,
-                                mem_k, mem_v, pos_ids, kq_mask);
+                                /*B=*/1, b_off,
+                                m.memory_k, m.memory_v, pos_ids, kq_mask);
     }
     ggml_set_name(cur, "layers_out");
     ggml_set_output(cur);
@@ -878,19 +1324,119 @@ bool load_model_gguf_mtl(const std::string & path,
             l.mlp_down = require_tensor(model, (lp + "/mlp/down/w").c_str());
         }
 
+        // Single unified KV buffer holding the cond+uncond pair.
+        // Layout per layer: 2x kv_layer_elems contiguous floats, with the
+        // cond half at offset 0 and the uncond half at offset kv_layer_elems.
+        // The B=1 single-pass code addresses the right half via the
+        // `b_offset_elems` parameter to build_llama_block; the B=2 batched
+        // path views ne[3]=2 over the same memory with batch_stride=
+        // kv_layer_elems * sizeof(float).
         ggml_init_params kv_params = { ggml_tensor_overhead() * 4, nullptr, true };
         model.ctx_kv = ggml_init(kv_params);
-        const int64_t kv_elements = (int64_t) hp.head_dim * hp.n_kv_head * hp.n_ctx * hp.n_layer;
-        model.memory_k        = ggml_new_tensor_1d(model.ctx_kv, GGML_TYPE_F32, kv_elements);
-        model.memory_v        = ggml_new_tensor_1d(model.ctx_kv, GGML_TYPE_F32, kv_elements);
-        model.memory_k_uncond = ggml_new_tensor_1d(model.ctx_kv, GGML_TYPE_F32, kv_elements);
-        model.memory_v_uncond = ggml_new_tensor_1d(model.ctx_kv, GGML_TYPE_F32, kv_elements);
+        const int64_t kv_elements_b2 =
+            (int64_t) 2 * hp.head_dim * hp.n_kv_head * hp.n_ctx * hp.n_layer;
+        // KV dtype is kept at F32 here.  Phase-2 of §3.21 tried F16 KV —
+        // build_llama_block already routes ggml_type_size(memory_k->type)
+        // into the strides, ggml_flash_attn_ext consumes F16 K/V
+        // directly, and the per-step ggml_cpy converts F32→F16 for
+        // free — but on M3 Ultra it was a wash (Q4_0 502 → 507 ms,
+        // F16 within noise) and produced byte-exact audio, suggesting
+        // ggml-metal's flash-attn was already running its matmul at
+        // F16 internally regardless of storage dtype.  We keep F32
+        // storage to match the §3.19 numerics envelope.  Memory-bound
+        // backends (e.g. M4 with 10 GPU cores) may still benefit; flip
+        // this to GGML_TYPE_F16 to try that.
+        model.memory_k = ggml_new_tensor_1d(model.ctx_kv, GGML_TYPE_F32, kv_elements_b2);
+        model.memory_v = ggml_new_tensor_1d(model.ctx_kv, GGML_TYPE_F32, kv_elements_b2);
+        // Legacy aliases for any caller that hasn't been migrated yet
+        // (none on the MTL hot path; kept nullable on purpose).
+        model.memory_k_uncond = nullptr;
+        model.memory_v_uncond = nullptr;
         model.buffer_kv = ggml_backend_alloc_ctx_tensors(model.ctx_kv, model.backend);
         if (!model.buffer_kv) {
             throw std::runtime_error("load_model_gguf_mtl: ggml_backend_alloc_ctx_tensors failed for "
                                      "KV-cache buffer (backend out of memory?)");
         }
 
+        // Phase 15: per-layer fused-matmul stacks for the Metal hot path.
+        //
+        //   wqkv      : (n_embd, 3 * n_embd)  rows  [Q ‖ K ‖ V]
+        //   w_gate_up : (n_embd, 2 * n_ff)    rows  [gate ‖ up]
+        //
+        // Each Llama block previously dispatched 3 separate Q4_0 mat-muls
+        // for Q/K/V plus 2 for gate/up; stacking them collapses those into
+        // 1 + 1 = 2 dispatches per block, saving (3-1) + (2-1) = 3 kernel
+        // launches per block per step inside the same compute_graph
+        // commit. On a 30-layer × 84-token T3 step pass that's ~7.5k
+        // fewer kernel launches per call. The combined mat-mul also
+        // gives the Metal mul_mm shader a wider M dimension, which is
+        // what its tiling expects (NR0 = 64).
+        //
+        // CPU backend keeps the original wq/wk/wv path because
+        // ggml-cpu's per-kernel overhead is already negligible and the
+        // extra weight memory footprint (~75 MB for the multilingual
+        // T3) trades unfavourably with thread-cache locality there.
+        if (!ggml_backend_is_cpu(model.backend)) {
+            const int n_embd = hp.n_embd;
+            const int n_ff   = hp.intermediate_size;
+
+            const size_t stack_meta = ggml_tensor_overhead() * (size_t) (2 * hp.n_layer + 4);
+            ggml_init_params sp = { stack_meta, nullptr, true };
+            model.ctx_stack = ggml_init(sp);
+            if (!model.ctx_stack) {
+                throw std::runtime_error("load_model_gguf_mtl: ggml_init failed for stacked-weights ctx");
+            }
+
+            // QKV stack: Q4_0 in the multilingual T3 GGUF (q.w / k.w / v.w
+            // all Q4_0 for every layer). gate/up CAN'T be stacked because
+            // the converter ships gate as F16 and up as Q4_0 — different
+            // element widths can't share a single ggml_tensor.
+            for (int i = 0; i < hp.n_layer; ++i) {
+                auto & l = model.layers_mtl[i];
+                if (l.wq->type != l.wk->type || l.wq->type != l.wv->type) {
+                    fprintf(stderr, "load_model_gguf_mtl: skipping QKV stack on layer %d "
+                                    "(mixed types Q=%s K=%s V=%s)\n",
+                            i, ggml_type_name(l.wq->type), ggml_type_name(l.wk->type),
+                            ggml_type_name(l.wv->type));
+                    l.wqkv = nullptr;
+                    continue;
+                }
+                l.wqkv = ggml_new_tensor_2d(model.ctx_stack, l.wq->type, n_embd, 3 * n_embd);
+            }
+            (void) n_ff;
+            model.buffer_stack = ggml_backend_alloc_ctx_tensors(model.ctx_stack, model.backend);
+            if (!model.buffer_stack) {
+                throw std::runtime_error("load_model_gguf_mtl: ggml_backend_alloc_ctx_tensors failed for "
+                                         "stacked-weights buffer (backend out of memory?)");
+            }
+            t3_stack_register(model.buffer_stack, model.ctx_stack);
+
+            // Copy Q/K/V rows into wqkv via host scratch. Q4_0 row
+            // layout is M-major (rows packed contiguously), so we just
+            // append wq's rows, then wk's, then wv's.
+            size_t scratch_bytes = 0;
+            for (int i = 0; i < hp.n_layer; ++i) {
+                auto & l = model.layers_mtl[i];
+                if (!l.wqkv) continue;
+                scratch_bytes = std::max(scratch_bytes, ggml_nbytes(l.wq));
+            }
+            std::vector<char> scratch(scratch_bytes);
+            for (int i = 0; i < hp.n_layer; ++i) {
+                auto & l = model.layers_mtl[i];
+                if (!l.wqkv) continue;
+                size_t off = 0;
+                auto copy_into = [&](ggml_tensor * src, ggml_tensor * dst) {
+                    const size_t nb = ggml_nbytes(src);
+                    ggml_backend_tensor_get(src, scratch.data(), 0, nb);
+                    ggml_backend_tensor_set(dst, scratch.data(), off, nb);
+                    off += nb;
+                };
+                copy_into(l.wq, l.wqkv);
+                copy_into(l.wk, l.wqkv);
+                copy_into(l.wv, l.wqkv);
+            }
+        }
+
         {
             const int64_t jk = gguf_find_key(gguf_ctx, "tokenizer.ggml.mtl_json");
             const int64_t lk = gguf_find_key(gguf_ctx, "tokenizer.ggml.mtl_languages");
@@ -934,7 +1480,7 @@ bool load_model_gguf_mtl(const std::string & path,
                     hp.n_ctx, hp.n_embd, hp.n_layer, hp.n_head, hp.n_kv_head,
                     hp.head_dim, hp.intermediate_size,
                     hp.n_text_vocab, hp.n_speech_vocab, hp.cond_prompt_len);
-            fprintf(stderr, "load_model_gguf_mtl: weights=%.2f MB  KV=%.2f MB (2x for CFG) "
+            fprintf(stderr, "load_model_gguf_mtl: weights=%.2f MB  KV=%.2f MB (cond+uncond unified) "
                             "tokenizer_json=%zu bytes  languages=%zu\n",
                     ggml_backend_buffer_get_size(model.buffer_w) / (1024.0*1024.0),
                     ggml_backend_buffer_get_size(model.buffer_kv) / (1024.0*1024.0),
@@ -959,6 +1505,19 @@ bool eval_prompt_mtl(const chatterbox_model & model,
                      std::vector<float> & logits_cond_out,
                      std::vector<float> & logits_uncond_out,
                      int & prompt_len) {
+    // Metal: dispatch the cond+uncond pair through a single B=2 graph so
+    // the 30 Llama-block weight reads + Metal kernel dispatches are
+    // amortised over both batches.  CPU keeps the two-call path (each
+    // op processes B=2 in a tight loop, so batching just doubles the
+    // per-op work without saving ops; mirrors §3.20's S3Gen B=2 finding
+    // that on CPU the two-call path stayed the winner).
+    const bool use_b2 = !ggml_backend_is_cpu(model.backend);
+    if (use_b2) {
+        return run_prompt_pass_b2(model, allocr, n_threads, text_tokens,
+                                  exaggeration, logits_cond_out,
+                                  logits_uncond_out, prompt_len);
+    }
+
     int plen_c = 0, plen_u = 0;
     if (!run_prompt_pass(model, allocr, n_threads, text_tokens, exaggeration,
                          /*is_uncond=*/false, logits_cond_out, plen_c)) return false;
@@ -997,6 +1556,12 @@ bool eval_step_mtl(const chatterbox_model & model,
                 n_past, model.hparams.max_speech_tokens);
         return false;
     }
+    // Metal: cond+uncond batched into a single forward.  See eval_prompt_mtl.
+    const bool use_b2 = !ggml_backend_is_cpu(model.backend);
+    if (use_b2) {
+        return run_step_pass_b2(model, allocr, n_threads, n_past, token,
+                                logits_cond_out, logits_uncond_out);
+    }
     if (!run_step_pass(model, allocr, n_threads, n_past, token, /*uncond=*/false,
                        logits_cond_out)) return false;
     if (!run_step_pass(model, allocr, n_threads, n_past, token, /*uncond=*/true,
diff --git a/src/t3_mtl.h b/src/t3_mtl.h
index af7b8b6..4769c90 100644
--- a/src/t3_mtl.h
+++ b/src/t3_mtl.h
@@ -7,9 +7,15 @@
 
 #include "chatterbox_t3_internal.h"
 #include "ggml.h"
+#include "ggml-backend.h"
 
 namespace tts_cpp::chatterbox::detail {
 
+// Phase 15: drop a (buffer_stack, ctx_stack) pair from the process-wide
+// atexit registry. Called from main()'s free_t3() lambda on error-path
+// early-returns so we don't double-free at process exit.
+void t3_stack_unregister(ggml_backend_buffer_t buf, ggml_context * ctx);
+
 // Each builder returns a ggml_cgraph*; the caller uses ggml_gallocr_reserve +
 // alloc_graph and sets input tensors by name before compute.
 
diff --git a/src/test_metal_ops.cpp b/src/test_metal_ops.cpp
index a033b17..f211cf0 100644
--- a/src/test_metal_ops.cpp
+++ b/src/test_metal_ops.cpp
@@ -1,6 +1,8 @@
 // Standalone validation for the Metal kernels we added/fixed in ggml:
 //   - GGML_OP_DIAG_MASK_INF
 //   - GGML_OP_PAD with non-zero front-pad offsets (lp0..lp3)
+//   - GGML_OP_MUL_MAT + GGML_OP_ADD(bias) [+ GGML_OP_UNARY(GELU_ERF)]
+//     fusion in kernel_mul_mm (PROGRESS §3.27, §3.28)
 //
 // Runs each op twice (once on CPU, once on Metal) with the same input and
 // compares element-by-element.  Exits non-zero on mismatch.
@@ -219,6 +221,120 @@ static int test_conv_transpose_1d(ggml_backend_t cpu, ggml_backend_t gpu,
     return 1;
 }
 
+// Test the MUL_MAT + ADD(bias) [+ GELU_ERF] fusion in kernel_mul_mm.
+// Builds the 2- or 3-op subgraph on both CPU and GPU backends, dispatches,
+// and compares output element-wise.  On the GPU side, ggml-metal's fusion
+// system (FC_MUL_MM + 2 / +3 / +4, PROGRESS §3.27 / §3.28) collapses these
+// into a single `kernel_mul_mm_..._bias=1_res=X_gelu=Y` dispatch; the CPU
+// path is always the unfused triple.  Any numerical drift beyond atol
+// indicates either a kernel bug or a shape-handling mismatch.
+//
+// Uses Q4_0 weights to match the chatterbox CFM hot path — that's the
+// shape the fused kernel is specifically targeting.  K must be %32 for
+// Q4_0 blocks; N / T are unconstrained.
+//
+// fuse_mode: 0 = MUL_MAT + ADD(bias), 1 = MUL_MAT + ADD(bias) + GELU_ERF.
+static int test_mul_mm_fused(ggml_backend_t cpu, ggml_backend_t gpu,
+                             int K, int N, int T, int B, int fuse_mode,
+                             const char * label) {
+    fprintf(stderr, "[mul_mm_fused %s] ", label);
+
+    std::mt19937 rng(42);
+    std::uniform_real_distribution<float> dist(-0.25f, 0.25f);
+    // W: (K, N) in ggml layout → src0 of shape [K, N] = ggml ne=[K, N].
+    //    Quantized to Q4_0 — block of 32 in the K (innermost) dim.
+    // X: (K, T, B) → src1 of shape [K, T, B] in ggml ne=[K, T, B].
+    // Output: (N, T, B).
+    // bias: (N,) — broadcast over T, B.
+    std::vector<float> W_f32(K * N);
+    std::vector<float> X_f32(K * T * B);
+    std::vector<float> bias_f32(N);
+    for (auto & v : W_f32)    v = dist(rng);
+    for (auto & v : X_f32)    v = dist(rng);
+    for (auto & v : bias_f32) v = dist(rng);
+
+    auto run_one = [&](ggml_backend_t backend) {
+        static size_t buf_size = 32 * 1024 * 1024;
+        std::vector<uint8_t> buf(buf_size);
+        ggml_init_params p = { buf_size, buf.data(), true };
+        ggml_context * ctx = ggml_init(p);
+        ggml_cgraph * gf = ggml_new_graph(ctx);
+
+        ggml_tensor * W    = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, K, N);
+        ggml_tensor * X    = (B == 1) ? ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, T)
+                                      : ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K, T, B);
+        ggml_tensor * bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
+        ggml_set_name(W,    "W");    ggml_set_input(W);
+        ggml_set_name(X,    "X");    ggml_set_input(X);
+        ggml_set_name(bias, "bias"); ggml_set_input(bias);
+
+        ggml_tensor * mm   = ggml_mul_mat(ctx, W, X);
+        ggml_tensor * mmb  = ggml_add(ctx, mm, bias);
+        ggml_tensor * out  = (fuse_mode == 1) ? ggml_gelu_erf(ctx, mmb) : mmb;
+        ggml_set_name(out, "out"); ggml_set_output(out);
+        ggml_build_forward_expand(gf, out);
+
+        auto * allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        ggml_gallocr_reserve(allocr, gf);
+        ggml_gallocr_alloc_graph(allocr, gf);
+
+        // Quantise W to Q4_0 into the backend buffer.
+        {
+            std::vector<uint8_t> qbuf(ggml_nbytes(ggml_graph_get_tensor(gf, "W")));
+            ggml_quantize_chunk(GGML_TYPE_Q4_0, W_f32.data(), qbuf.data(), 0, N, K, nullptr);
+            ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "W"),
+                                    qbuf.data(), 0, qbuf.size());
+        }
+        ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "X"),    X_f32.data(),    0, X_f32.size()    * sizeof(float));
+        ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "bias"), bias_f32.data(), 0, bias_f32.size() * sizeof(float));
+
+        ggml_backend_graph_compute(backend, gf);
+        ggml_tensor * out_t = ggml_graph_get_tensor(gf, "out");
+        std::vector<float> res(ggml_nelements(out_t));
+        ggml_backend_tensor_get(out_t, res.data(), 0, ggml_nbytes(out_t));
+
+        ggml_gallocr_free(allocr);
+        ggml_free(ctx);
+        return res;
+    };
+
+    auto ref = run_one(cpu);
+    auto got = run_one(gpu);
+
+    int bad = 0;
+    float max_err = 0.f, max_rel = 0.f;
+    for (size_t i = 0; i < ref.size(); ++i) {
+        const float d = std::fabs(got[i] - ref[i]);
+        const float r = d / std::max(std::fabs(ref[i]), 1e-6f);
+        if (d > max_err) max_err = d;
+        if (r > max_rel) max_rel = r;
+        // Tolerance: the CPU reference and the GPU kernel both dequantize
+        // Q4_0 then do f32 mul_mat, but in different accumulation orders
+        // (CPU walks rows scalarly, Metal kernel_mul_mm uses cooperative
+        // matmul on 8x8 tiles).  Observed max abs ~5e-3 on Q4_0 shapes
+        // in the 256..1024 range.  Fail only if abs diff exceeds 2e-2
+        // — that's 4x the Q4_0 noise floor, catches real kernel bugs
+        // (like §3.29's reverted direct-store RMW which would have
+        // shown up as wholesale >1e-1 drift) without flagging
+        // accumulation-order drift.
+        if (d > 2e-2f) {
+            if (bad < 5) {
+                fprintf(stderr, "\n  mismatch @ %zu: cpu=%.6g gpu=%.6g diff=%.3e rel=%.3e",
+                        i, ref[i], got[i], d, r);
+            }
+            ++bad;
+        }
+    }
+    if (bad == 0) {
+        fprintf(stderr, "OK (K=%d N=%d T=%d B=%d fuse=%s, max_abs=%.1e max_rel=%.1e)\n",
+                K, N, T, B, fuse_mode == 1 ? "gelu" : "bias", max_err, max_rel);
+        return 0;
+    }
+    fprintf(stderr, "\n[mul_mm_fused %s] FAIL: %d / %zu mismatched (max_err=%.3e max_rel=%.3e)\n",
+            label, bad, ref.size(), max_err, max_rel);
+    return 1;
+}
+
 int main() {
     ggml_backend_t cpu = ggml_backend_cpu_init();
     if (!cpu) { fprintf(stderr, "CPU backend init failed\n"); return 1; }
@@ -243,6 +359,23 @@ int main() {
     // A small sanity case too.
     rc |= test_conv_transpose_1d(cpu, gpu, /*IL=*/10,   /*IC=*/3,   /*OC=*/4,   /*K=*/5,  /*s0=*/2, "tiny");
 
+    // MUL_MAT + ADD(bias) fusion (PROGRESS §3.27): CFM transformer hot shapes.
+    //   K=256, N=256 — attn to_q / to_k / to_v
+    //   K=256, N=512 — attn to_out
+    //   K=256, N=1024 — FF gate (ff0; also tested with gelu)
+    //   K=1024, N=256 — FF down (ff2)
+    // T=87, B=2 matches CFM's use_b2=true configuration.
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/ 256, /*T=*/87, /*B=*/2, /*fuse=*/0, "cfm-attn-qkv");
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/ 512, /*T=*/87, /*B=*/2, /*fuse=*/0, "cfm-attn-out");
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/1024, /*T=*/87, /*B=*/2, /*fuse=*/0, "cfm-ff-gate-bias");
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/1024, /*T=*/87, /*B=*/2, /*fuse=*/1, "cfm-ff-gate-bias+gelu");
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/1024, /*N=*/ 256, /*T=*/87, /*B=*/2, /*fuse=*/0, "cfm-ff-down");
+    // Batch=1 sanity — exercises the non-batch path of the dispatcher.
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/ 512, /*T=*/87, /*B=*/1, /*fuse=*/0, "cfm-b1");
+    // Non-64-multiple N to exercise the bounds-checked (bco=1) shmem path.
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/ 320, /*T=*/87, /*B=*/2, /*fuse=*/0, "bco-bias");
+    rc |= test_mul_mm_fused(cpu, gpu, /*K=*/ 256, /*N=*/ 320, /*T=*/87, /*B=*/2, /*fuse=*/1, "bco-gelu");
+
     ggml_backend_free(gpu);
     ggml_backend_free(cpu);
     return rc;