diff --git a/tts-cpp/.gitignore b/tts-cpp/.gitignore
index ca1d3c4c339..ba5670bf11a 100644
--- a/tts-cpp/.gitignore
+++ b/tts-cpp/.gitignore
@@ -1,5 +1,8 @@
 # Vendored ggml (cloned separately at setup time; see README)
-ggml/
+/ggml/
+# (We DO commit cmake/vcpkg-overlay-ports/ggml/ — it's the QVAC ggml port
+# overlay carrying our Supertonic custom-op patches.  The `/ggml/` above is
+# anchored to the tts-cpp root only.)
 
 # Build artifacts
 build/
diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt
index b1521db83c0..d404842c064 100644
--- a/tts-cpp/CMakeLists.txt
+++ b/tts-cpp/CMakeLists.txt
@@ -115,23 +115,23 @@ if (NOT TARGET ggml)
         endif()
         add_library(ggml ALIAS ggml::ggml)
     else()
-        # In-tree subtree of qvac-ext-lib-whisper.cpp: the standalone
-        # patches/ folder + scripts/setup-ggml.sh tooling is intentionally
-        # absent here.  Without them, an add_subdirectory(ggml) build
-        # would silently miss the ggml-backend-reg-filename-prefix patch
-        # that GGML_BACKEND_DL_PROJECT_PREFIX="speech-" depends on, so
-        # libspeech-ggml-*.so files would exist on disk but the runtime
-        # loader would still search for libggml-*.so under
-        # GGML_BACKEND_DL=ON.  Reject up front with a pointer at the
-        # right consumption path.
-        if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/patches")
+        # Bundled-ggml dev build path (TTS_CPP_USE_SYSTEM_GGML=OFF).
+        # Expects `tts-cpp/ggml/` to be a checkout of the
+        # tetherto/qvac-ext-ggml repo on the `speech` branch — the QVAC
+        # fork carrying every infrastructure patch + the Supertonic 2
+        # fused custom op family as commits (not as a patches/ overlay).
+        #
+        # Run `bash tts-cpp/scripts/setup-ggml.sh` first to clone +
+        # check out the pinned commit.  No patches/ directory is
+        # consulted: the speech branch is already pre-patched at the
+        # commit level.
+        if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ggml/CMakeLists.txt")
             message(FATAL_ERROR
-                "tts-cpp: this in-tree subtree does not ship the patches/ "
-                "directory.  Pass -DTTS_CPP_USE_SYSTEM_GGML=ON to consume "
-                "the QVAC speech-stack `ggml-speech` vcpkg port (which "
-                "carries the pre-applied patches), or use the standalone "
-                "github.com/gianni-cor/chatterbox.cpp repo for a "
-                "bundled-ggml dev build with patches/ present.")
+                "tts-cpp: bundled-ggml build requires tts-cpp/ggml/ to be "
+                "a checkout of tetherto/qvac-ext-ggml@speech.  Run "
+                "`bash tts-cpp/scripts/setup-ggml.sh` first, or pass "
+                "-DTTS_CPP_USE_SYSTEM_GGML=ON to consume the QVAC "
+                "speech-stack `ggml-speech` vcpkg port.")
         endif()
         add_subdirectory(ggml)
     endif()
diff --git a/tts-cpp/PROGRESS_SUPERTONIC.md b/tts-cpp/PROGRESS_SUPERTONIC.md
index 6f0b34b122e..3c65019810e 100644
--- a/tts-cpp/PROGRESS_SUPERTONIC.md
+++ b/tts-cpp/PROGRESS_SUPERTONIC.md
@@ -612,9 +612,908 @@ spelled out (most TDD, written before the implementation lands).
   `flash_attn_f32_f16` enabled) to confirm the Supertonic bottleneck shifts
   from custom CPU ops to `kernel_mul_mm_f32_f32` and the same convnext block
   shape that chatterbox already profiled.
+- ~~Evaluate GPU backends after CPU graph structure is fully stable.~~ — initial
+  Metal port landed 2026-05-11; see "Metal baseline (2026-05-11)" below.
 - Add CI coverage for converter help/setup syntax and portable Supertonic build
   targets.
 
+## Metal baseline (2026-05-11)
+
+First end-to-end Metal run of the Supertonic 2 pipeline. Approach mirrors
+Chatterbox's pattern: single `ggml_backend_metal_init()` at model load, no
+backend scheduler, and CPU-only `ggml_custom_4d` fast paths that are skipped
+when `!ggml_backend_is_cpu(model.backend)`, so the same graph builders fall
+through to stock `ggml_im2col` + `ggml_mul_mat` (etc.) when the backend is Metal.
+
+Implementation:
+
+- `model_prefers_cpu_kernels(const supertonic_model &)` added in
+  `src/supertonic_internal.h`. Returns `true` when `model.backend == nullptr`
+  or `ggml_backend_is_cpu(model.backend)`.
+- Per-stage helpers (`conv1d_f32`, `depthwise_same_ggml`, `layer_norm_ggml`,
+  `dense_matmul_time_ggml`, `bias_gelu_ggml`, `pw2_residual_ggml`,
+  `conv1d_causal_ggml`, `depthwise_conv1d_causal_ggml`, plus the tail-update
+  custom op in `vector_estimator.cpp`) now take a `bool use_cpu_fastpath` and
+  AND it into the existing dtype/shape gates.
+- Per-stage builders inject
+  `const bool use_cpu_fastpath = model_prefers_cpu_kernels(model);` at the top
+  and pass it down through `vector_convnext_ggml`, `convnext_block_ggml`, the
+  text/vector/style attention cache builders, the tail graph builder, and the
+  trace builder.
+- `text_encoder.cpp` and `duration.cpp` accept the flag for call-site
+  uniformity but mark it `[[maybe_unused]]` — those stages have always built
+  their graphs via stock ggml ops and are Metal-safe at HEAD.
+- `supertonic_bench.cpp` gains `--n-gpu-layers N` (passed through to
+  `load_supertonic_gguf`) so the same harness drives CPU and Metal.
+
+Smoke test (`supertonic-cli --n-gpu-layers 1`) produces a 1.44 s WAV that is
+byte-length-identical to the CPU output, confirming the graph builders run
+end-to-end on Metal. A `GGML_ASSERT([rsets->data count] == 0)` fires inside
+`ggml_metal_device_free` at process exit (atexit ordering with Metal's
+residency-set finaliser) — same shape as the Chatterbox `t3_stack_registry`
+atexit issue; cosmetic, fires after the WAV is fully written. Mitigation TBD.
+
+Benchmark (Apple M2, q8_0 GGUF, 4 threads, 3.204 s of audio, 5-step CFM, 5 runs
++ 1 warmup, same flags as `supertonic-cpp.json` / `supertonic-onnx-cpu.json`):
+
+| Stage                       | CPU q8_0   | Metal q8_0 | Δ vs CPU | ONNX CPU f32 |
+|-----------------------------|-----------:|-----------:|---------:|-------------:|
+| preprocess                  |    0.01 ms |    0.01 ms |       — |      0.06 ms |
+| duration                    |    1.76 ms |    2.50 ms |   +0.74 |      1.48 ms |
+| text_encoder                |   13.44 ms |   13.83 ms |   +0.39 |      9.04 ms |
+| vector_estimator (5 steps)  |   94.86 ms |  173.08 ms |  +78.22 |     82.65 ms |
+| vocoder                     |   43.44 ms |   59.74 ms |  +16.30 |     51.32 ms |
+| **total**                   | **153.5**  | **249.9**  |  **+96.4 (+63%)** | **144.9** |
+| RTF                         |     0.048  |     0.078  |          |       0.045 |
+| real-time multiplier        |     20.9×  |     12.8×  |          |       22.1× |
+
+Verdict: the Metal port is **correctness-validated but slower than CPU at this
+graph shape**. Two ggml-side stages dominate the regression:
+
+- **`vector_estimator` +82 %** (94.9 → 173.1 ms median). The 5 denoising steps
+  build many small ConvNeXt graphs (depthwise + pointwise + norm + GELU +
+  pointwise, repeated across blocks). On M2 these become Metal kernel
+  launches that are too short to amortise launch overhead; the CPU fast paths
+  (cblas-backed `pointwise_op` / unrolled depthwise K=5) had a real lead.
+- **`vocoder` +38 %** (43.4 → 59.7 ms median). Same kernel-launch-bound
+  pattern, smaller deficit because the vocoder graph is a single persistent
+  cgraph that's reused across calls (less per-step overhead than the
+  vector-estimator's per-block cgraphs).
+
+`text_encoder` and `duration` are unchanged within noise — expected, those
+already used the stock-op path on CPU.
+
+`supertonic-bench --runs 8 --warmup 3 --n-gpu-layers 1` drifted to ~288 ms
+median (up from ~250 ms at runs=5 / warmup=1), suggesting Metal residency
+sets accumulate across calls in this harness; investigate before drawing
+percentile-style conclusions from longer Metal runs.
+
+Artifacts: `artifacts/bench/supertonic-cpu.json`,
+`artifacts/bench/supertonic-cpu-after.json` (post-gating CPU regression
+check, median 158.2 ms / +3 % vs the pre-port baseline — within noise),
+`artifacts/bench/supertonic-metal.json`,
+`artifacts/bench/supertonic-onnx-cpu.json`,
+`artifacts/bench/supertonic-onnx-coreml.json`,
+`artifacts/bench/metal-phase-a.txt` (the Phase A failure-mode trace before
+gating).
+
+### Next: Metal optimisation passes (Phase E in the plan)
+
+Backlog **revised after the 2026-05-11 dispatch-count profile** (see
+"Dispatch-count profile" below). The pre-profile working hypothesis
+(step batching, QKV stacking, f16 weights) turned out to be wrong on
+multiple counts. Revised priority order:
+
+1. **Single-graph consolidation per CFM step (THE PR).** The diagnostic
+   shows ~21 separate `graph_compute` calls per step (front prep +
+   text-attention + style-qkv + style-attention + style-residual-norm
+   inline × 4 groups + tail). On M2 each call carries ~1.86 ms of fixed
+   command-buffer overhead regardless of node count. Consolidating into
+   ONE `ggml_cgraph` per step (5 dispatches per synth, projected total
+   Metal ~46 ms) is by far the biggest win available; the rest of the
+   backlog only matters if this leaves residual gap. Specific work
+   below.
+2. **(Was step batching across CFM iterations.)** Closed: the CFM step
+   loop has a sequential dependency (`latent.swap(next)` at
+   `supertonic_engine.cpp:240`), so Chatterbox-style batching along
+   `ne[2]` doesn't apply here. The win from item 1 above is bigger
+   anyway; revisit only if a future flow-matching variant decouples the
+   steps.
+3. **(Was QKV stacking on text-attention.)** Deprioritised. With item 1
+   the QKV matmuls live inside the same dispatch as everything else —
+   stacking saves 3 in-graph nodes per attention but doesn't reduce
+   dispatch count. Only worth doing if Metal frame capture shows the
+   three per-attention `kernel_mul_mm` launches are individually
+   expensive after consolidation.
+4. **(Was f16 weights for Metal.)** Closed: f16 GGUF is *slower* than
+   q8_0 on both CPU and Metal (see "f16 GGUF experiment (2026-05-11)"
+   below). q8_0's weight-bandwidth win beats f16's no-dequant on this
+   graph shape.
+5. **Custom Metal depthwise kernel.** Standby — only revisit if item 1
+   leaves ConvNeXt depthwise as the residual hotspot. The `im2col +
+   mul_mat` fallback would be replaceable with a single
+   `kernel_depthwise_conv_1d` per call; `test/test_metal_ops.cpp` is
+   the parity harness.
+6. **Metal `rsets` keep-alive tuning** for long-running daemons.
+   Cosmetic for benchmarks; investigate if a hosted-service user
+   reports memory growth.
+
+### Plan for item 1 — per-step graph consolidation
+
+Architecture: introduce a `vector_step_full_cache` (per-shape
+thread_local) that owns ONE `ggml_context`, ONE `ggml_cgraph`, ONE
+`ggml_gallocr`. Build the entire per-step computation (proj_in →
+4 × (ConvNeXt blocks + time-add + ConvNeXt + Q/K/V projection + RoPE +
+flash-attention + out_fc + residual + layer-norm + style Q/K/V
+projection + flash-attention + out_fc + residual + layer-norm) +
+last_convnext × 4 + proj_out + mask + noise add) as one graph. ONE
+`ggml_backend_graph_compute` per step.
+
+The existing `build_text_attention_cache`, `build_group_graph_cache`,
+`build_res_style_qkv_cache`, and `build_tail_graph_cache` get refactored
+into **graph-builder helpers** that accept `(ggml_context*, ggml_cgraph*,
+...input ggml_tensor*...)` and return output `ggml_tensor*`, instead of
+owning their own contexts. The CPU path keeps the cache-of-subgraphs
+architecture (parity, trace mode); only Metal routes through the
+consolidated path. Detection via `!ggml_backend_is_cpu(model.backend)`
+at the top of `supertonic_vector_step_ggml`.
+
+**Critical sub-tasks** (the order matters for parity validation):
+
+1. **In-graph RoPE.** Replace the CPU `apply_rope` call with
+   `ggml_rope_ext` configured for Supertonic's `(t/L) * theta[d]`
+   formula: `freq_base = 1.0`, `freq_scale = 1.0`, `freq_factors[d] =
+   L / theta[d]`, `mode = GGML_ROPE_TYPE_NEOX` (split-pairs layout
+   matches `apply_rope`'s `(i1, i2) = (offset+d, offset+D/2+d)` pattern
+   per `supertonic_vector_estimator.cpp:1416`). Positions are an
+   int32 `arange(L_q)` for Q and `arange(L_kv)` for K, set once at
+   build time. ggml-metal's `kernel_rope_norm`/`kernel_rope_neox`
+   already compile.
+
+2. **In-graph layout conversion.** Replace
+   `tensor_to_time_channel`/`pack_time_channel_for_ggml` host calls
+   with `ggml_cont(ctx, ggml_transpose(ctx, x))` at the inter-stage
+   boundaries.
+
+3. **Compose the orchestrator** so all stages share one ctx/gf. Walk
+   the existing `supertonic_vector_trace_proj_ggml` flow (lines
+   2050–2585) and inline each `run_*_cache` call as graph-builder
+   helper invocations.
+
+4. **Parity test.** Add a `test_supertonic_vector_metal_consolidated`
+   CTest target that compares the consolidated Metal path to the CPU
+   reference for one step at a representative L (137-ish). Tolerance
+   ~1e-2 (loose because of float-order effects across the merged
+   graph).
+
+5. **Bench.** Re-run `supertonic-bench --n-gpu-layers 1` with
+   `SUPERTONIC_COUNT_DISPATCHES=1` set to verify total dispatches drop
+   from 120 to ~10 and total wall to ~46 ms.
+
+**Size estimate.** ~600–1000 new lines (mostly the consolidated build
+function); the existing trace path stays untouched. Trace-mode tests
+keep using the old multi-cache orchestrator.
+
+**Risk.** The two non-trivial pieces are (a) `ggml_rope_ext` parameter
+mapping matching CPU `apply_rope` to within 1e-3 — verify before
+inlining everything else — and (b) memory budget for one big graph
+across all groups (`MAX_NODES=2048` may not be enough; estimate ~3500
+nodes for the full per-step graph).
+
+All commits on the consolidation branch should land together in a
+single PR; the work is too coupled to split cleanly.
+
+Backlog items 2–6 above stay as separate per-PR follow-ups in their
+listed priority. Do not bundle.
+
+### Dispatch-count profile (2026-05-11)
+
+Instrumented `supertonic_graph_compute` with a wall-time + node-count
+printout gated on the `SUPERTONIC_COUNT_DISPATCHES` env var. Re-running
+`supertonic-cli --n-gpu-layers 1 --text "Hello."` on the same M2:
+
+- **120 graph_compute dispatches per single synth** (entire pipeline,
+  vector estimator + vocoder + text encoder + duration).
+- **Cumulative graph_compute wall: 222.8 ms** out of the ~250 ms total
+  Metal synth — i.e. graph_compute IS the cost; CPU-side data marshalling
+  is the residual ~30 ms.
+- **Mean per-dispatch wall: 1.86 ms.** Even 17-node tiny dispatches cost
+  ~770 µs each; 170-node mid graphs cost 1.1–1.7 ms. The fixed
+  per-dispatch Metal overhead (command-buffer setup + pipeline lookup +
+  encode + commit + wait) dominates.
+
+Dispatch distribution (counts × node-size, sorted by frequency):
+
+  40 × 18 nodes (the 5×8 text-attention sub-graphs per step)
+  20 × 12 nodes
+  20 × 90 nodes
+  15 × 262 nodes (the 5×3 group-prep graphs)
+  ~25 misc
+
+The 80 small (≤90 nodes) dispatches account for an estimated ~120 ms of
+Metal time. Consolidating them into the larger per-step graphs would
+likely halve the gap to the CPU baseline.
+
+### f16 GGUF experiment (2026-05-11)
+
+Hypothesis: q8_0 dequant in the per-`mul_mat` path was the Metal
+bottleneck. Tested by converting the bundle with `--ftype f16` (252 MB
+GGUF vs 132 MB for q8_0) and re-benching:
+
+  Metal q8_0 total median: 249.9 ms
+  Metal  f16 total median: 286.5 ms (+15 %, worse)
+  CPU   q8_0 total median: 153.5 ms
+  CPU    f16 total median: 168.7 ms (+10 %, worse)
+
+f16 is uniformly *slower* than q8_0, on both CPU and Metal. q8_0
+dequant is not the bottleneck — ggml-metal's q8_0 `mul_mat` kernel is
+well-tuned for these tensor shapes and the smaller weight bandwidth
+helps. Phase E.3 closed; do not pursue an f16-on-Metal variant.
+
+### Dispatch profiling hook
+
+`SUPERTONIC_COUNT_DISPATCHES=1 ./build/supertonic-cli ...` prints one
+line per `ggml_backend_graph_compute` call:
+
+  supertonic_graph_compute #N nodes=K  wall=W us  cumul=C ms
+
+Near-zero overhead when the env var is unset (a single env var read
+plus a well-predicted branch that skips the instrumentation).
+
+## Per-step graph consolidation (landed 2026-05-11)
+
+Landed `supertonic_vector_step_one_graph_ggml` at the end of
+`src/supertonic_vector_estimator.cpp` plus the helpers
+`apply_supertonic_rope_ggml`, `append_text_attention_subgraph`, and
+the `vector_step_one_graph_cache` struct.  Routing in
+`supertonic_vector_step_ggml` enables this path **by default on
+any non-CPU backend** (Metal, CUDA, Vulkan, OpenCL).  CPU keeps
+the multi-cache trace_proj path — its CPU fast-paths and
+`thread_local` sub-graph caches stay competitive on CPU and trace
+mode for parity tests still uses the per-stage outputs.  Override
+via `SUPERTONIC_DISABLE_ONE_GRAPH=1` if needed.
+
+### Dispatch + bench numbers (Apple M2, q8_0, 4 threads, 5-step CFM)
+
+`SUPERTONIC_COUNT_DISPATCHES=1 ./build/supertonic-cli --n-gpu-layers 1`
+shows the dispatch profile collapsing from **120 → 20 total
+dispatches** per synth (5 of which are 1886-node consolidated
+per-step graphs).  Mean per-dispatch wall climbs from 1.86 ms to
+7.9 ms — more real work per kernel batch, less time burned on
+command-buffer setup — and total `graph_compute` wall drops from
+222.8 ms to 157.7 ms (-29 %).
+
+`supertonic-bench` on Metal, 5 runs + 1 warmup, identical flags to
+`supertonic-cpu.json` / `supertonic-onnx-cpu.json`:
+
+  | Stage                       | trace_proj (B) | one-graph (E.cons) |
+  |-----------------------------|---------------:|-------------------:|
+  | preprocess                  |          0.01ms |             0.02ms |
+  | duration                    |          2.50ms |             3.87ms |
+  | text_encoder                |         13.83ms |            16.58ms |
+  | vector_estimator (5 steps)  |        173.08ms |           147.83ms |
+  | vocoder                     |         59.74ms |            60.51ms |
+  | **total**                   |     **249.92ms**|        **229.06ms**|
+  | RTF                         |           0.078 |              0.071 |
+  | real-time multiplier        |          12.82× |             13.99× |
+
+Net: **-15 % on the dominant vector_estimator stage, -8 % on the
+total**.  Correctness validated: `cpu-ref` vs `metal-one-graph` for
+the same text+seed gives correlation **1.0000**, max abs diff 101
+LSB (CPU peak amplitude 6639, so ~1.5 % — normal Metal-vs-CPU
+floating-order noise).  No regression vs the Phase B port.
+
+### Why the win is smaller than projected
+
+Pre-implementation projection was ~46 ms total (saving the full
+~204 ms of dispatch overhead at 1.86 ms × ~110 saved dispatches).
+Reality: the per-dispatch overhead estimate (1.86 ms) was an
+*average*, not a constant.  The new 1886-node consolidated graphs
+are big enough that the GPU is actually doing real compute work
+during the dispatch — kernel-launch overhead is no longer the
+bottleneck; the compute work itself now dominates.
+
+The bench tells the story: per-step wall time dropped from
+~35 ms (= 173/5) to ~30 ms (= 148/5).  The Metal device now spends
+most of its time actually computing matmuls rather than waiting
+on command-buffer plumbing.  Further wins now require *less work*,
+not *fewer dispatches* — that's items 2-5 of the remaining
+backlog (QKV stacking, op fusion, custom depthwise kernel).
+
+### Implementation notes
+
+- **`apply_supertonic_rope_ggml`** translates Supertonic's
+  `angle = (t/L) * theta[d]` formula to `ggml_rope_ext` with
+  `freq_base=1.0, freq_scale=1.0, freq_factors[d] = L / theta[d]`,
+  `mode=GGML_ROPE_TYPE_NEOX` (split-pairs rotation matches
+  `apply_rope`'s `(i1=offset+d, i2=offset+D/2+d)` layout at
+  `supertonic_vector_estimator.cpp:1416`).  Positions are int32
+  `arange(q_len)` for Q and `arange(text_len)` for K, set per
+  call when L or text_len change.  ggml-metal's
+  `kernel_rope_norm`/`kernel_rope_neox` already compile.
+
+- **Layout invariant: the GGML tensors take channel-major buffers
+  raw.**  The trace_proj_ggml path at lines 2143/2151 sets `x_in`
+  directly from `noisy_latent` (no host transpose) and `text_in`
+  directly from `text_emb`; the ne=[L, Cin] / ne=[text_len, 256]
+  tensors interpret that channel-major buffer as their natural
+  layout (innermost dim = time = fast-in-memory).  My initial
+  consolidation tried to "helpfully" transpose the inputs into
+  (t, c) layout, which corrupted the tensor data and produced
+  correlation 0.0034 garbage on every backend.  Fix: direct
+  `ggml_backend_tensor_set` from raw caller buffers, matching the
+  existing path exactly.  Same fix on the output path
+  (`ggml_backend_tensor_get` straight into `next_latent_out`).
+
+- **Cache invalidation:** keyed on `(model.generation_id, L,
+  text_len, total_steps)`.  Rebuilt when any of them changes.  The
+  `vector_step_one_graph_cache` is a single `thread_local`
+  instance — different Engines / synths share it via the
+  generation_id key.
+
+### Remaining Phase E backlog
+
+**Tier 1 status (2026-05-11):**
+
+- ✅ **Per-step vector_estimator consolidation** (this PR) — biggest
+  Tier 1 win, -8 % on total Metal, parity 1.0000.
+- ✅ **Vocoder already a single dispatch** (461-node graph) —
+  no consolidation needed.
+- ⏸ **text_encoder + duration consolidation** — measured
+  contribution: ~22 ms cold-start dispatch wall across the 14
+  small dispatches that come before the vector_estimator graphs.
+  Post-warmup the bench shows text_encoder ≈ 17 ms and
+  duration ≈ 4 ms — most of which is the dispatches themselves;
+  consolidating to 1 dispatch each would save ~5-10 ms
+  steady-state.  Deferred because relpos_attention has 9
+  per-shape mask tensors + intricate
+  `ggml_view_3d`/`ggml_permute`/`ggml_sum_rows` plumbing that's
+  not a straight copy of the vector_step pattern — needs its
+  own focused 2-3 hour session with parity validation harness
+  before re-enabling on the GPU dispatcher.
+- ⏸ **QKV stacking** — once `vector_estimator` is already in
+  one graph, stacking the three `dense_matmul_time_ggml` calls
+  saves in-graph nodes but no dispatch count.  Metal-frame-
+  capture didn't show the QKV matmuls as the hot path, so the
+  expected win is tiny.  Pursue only if Tier 2 hits diminishing
+  returns.
+- ⏸ **`ggml_cont` elimination** — the consolidated path does
+  `ggml_cont(ggml_transpose(...))` for Q/K/V before rope, and
+  again inside `apply_supertonic_rope_ggml`.  These could be
+  avoided by views with custom strides, but ggml's `view_3d`
+  doesn't expose `nb0` (only `nb1`/`nb2`), so the cont copies
+  are required for the rope kernel's expected layout.  Could
+  use `ggml_permute` + careful 4D views to remove some, but
+  the win is small and the layout-bug risk is high.
+
+## Tier 2 progress (2026-05-11) — op-level reductions before custom kernels
+
+Before sinking time into custom .metal kernels via the QVAC
+ggml-speech port patches (the original Tier 2 plan), there are
+op-level reductions inside the consolidated per-step graph that
+trim dispatch count without touching ggml's kernel set.  Each
+landed as its own commit in PR #15.
+
+### Diagnostic: `SUPERTONIC_DUMP_OP_HISTOGRAM=1`
+
+Added an env-var-gated dump of per-graph op-type histograms to
+`supertonic_graph_compute`.  Zero overhead when unset.  Lets us see
+exactly which ggml ops dominate the consolidated graph and which
+are pure-metadata (RESHAPE/VIEW/PERMUTE/TRANSPOSE — confirmed
+no-op in ggml-metal-ops.cpp:186-195).
+
+**Consolidated per-step graph at HEAD (post-Tier-2 commits):**
+
+  | op                | count | dispatch on Metal? |
+  |-------------------|------:|--------------------|
+  | RESHAPE           |   580 | no (metadata only) |
+  | ADD               |   197 | yes (often fused)  |
+  | CONT              |   148 | yes (memcpy)       |
+  | MUL_MAT           |   122 | yes (matmul)       |
+  | IM2COL            |   118 | yes (memrearrange) |
+  | VIEW              |    88 | no                 |
+  | PERMUTE           |    72 | no                 |
+  | MUL               |    70 | yes (often fused)  |
+  | TRANSPOSE         |    68 | no                 |
+  | REPEAT            |    56 | yes                |
+  | CONCAT            |    56 | yes                |
+  | NORM              |    36 | yes                |
+  | UNARY             |    32 | yes (GELU/SiLU)    |
+  | ROPE              |     8 | yes                |
+  | FLASH_ATTN_EXT    |     8 | yes                |
+  | SCALE             |     1 | yes                |
+  | **total**         | **1660** | **852 dispatched** |
+
+808 of 1660 nodes are metadata-only no-ops — what looks like a
+large graph is really ~852 real Metal dispatches per per-step
+graph (down from ~1078 dispatched ops in the pre-Tier-2 layout).
+
+### Landed wins
+
+1. **`repeat_like` returns the broadcast-compatible reshape
+   without `ggml_repeat`** — ggml_add/ggml_mul broadcast natively
+   when one operand has dim==1 in a position the other has dim==N,
+   so the explicit ggml_repeat was redundant work.  All four
+   supertonic files (vector_estimator, vocoder, text_encoder,
+   duration) had the same pattern; same fix applied to each.
+   **-226 REPEAT ops** per step graph.  Override via
+   `SUPERTONIC_FORCE_EXPLICIT_REPEAT=1`.
+
+2. **`apply_supertonic_rope_ggml` drops the defensive
+   `ggml_cont`** — the [D, H, q_len] view onto a contiguous
+   [H*D, q_len] tensor is itself contiguous (nb[0]=elem_size,
+   nb[1]=D*elem_size, nb[2]=H*D*elem_size = ne[0]*ne[1]*elem_size),
+   so `ggml_rope_ext` accepts the view directly.  **8 fewer
+   kernel_cpy dispatches per per-step graph** × 5 = 40 saved per
+   synth.
+
+### Bench delta
+
+Apple M2, q8_0, 4 threads, 5-step CFM, 3.20 s of audio, 5 runs +
+1 warmup, identical flags to the existing JSON artifacts:
+
+  | Stage                       | Phase B | post-cons | post-repeat | post-rope-cont |
+  |-----------------------------|--------:|----------:|------------:|---------------:|
+  | preprocess                  |   0.01 ms |   0.02 ms |     0.01 ms |        0.02 ms |
+  | duration                    |   2.50 ms |   3.87 ms |     4.15 ms |        4.44 ms |
+  | text_encoder                |  13.83 ms |  16.58 ms |    15.80 ms |       14.97 ms |
+  | vector_estimator (5 steps)  | 173.08 ms | 147.83 ms |   129.23 ms |      123.94 ms |
+  | vocoder                     |  59.74 ms |  60.51 ms |    53.91 ms |       53.99 ms |
+  | **total**                   | **249.92ms** | **229.06ms** | **203.04ms** | **199.90ms** |
+  | RTF                         |   0.078 |   0.071  |     0.063   |       0.062    |
+  | real-time multiplier        |  12.82× |  13.99×  |    15.78×   |      16.03×    |
+
+**Cumulative Tier 1 + early-Tier-2: -50 ms total (-20 %) vs the
+Phase B Metal baseline.**  Parity vs CPU reference preserved at
+correlation 0.9999, max abs diff 249 LSB (~3.7 % of peak
+amplitude 6639 — within the float-order tolerance the
+consolidation already trades for one-graph-per-step).  Still ~50
+ms behind CPU q8_0 (153 ms) and ONNX CPU (145 ms), but the gap
+is closing.
+
+### Remaining op-level reductions
+
+- **118 IM2COL ops** are almost all K=1 1×1 convs (called from
+  `dense_matmul_time_ggml` via the existing `conv1d_f32` graph
+  fallback).  For K=1 the im2col is a transpose; could be
+  replaced with a direct `ggml_mul_mat` on the transposed
+  weight/input.  Projected ~3-6 ms saved.  Tricky to get right
+  without breaking layout assumptions of consumers.
+- **148 CONT ops** — 32 are weight-transpose conts in
+  `dense_matmul_time_ggml` (per call, but the weight is constant
+  per shape; could cache the transposed copy at engine
+  construction).  Projected ~5-8 ms saved.
+- **56 CONCAT + 56 REPEAT (remaining)** come from
+  `edge_clamp_pad_1d` materialising the replicate padding.  A
+  custom Metal `kernel_supertonic_pad_edge` would collapse these
+  into one dispatch per padding call.
+
+### Tier 2 custom Metal kernels + load-time weight prep — landed (2026-05-11)
+
+Four fused Metal kernels shipped through the local
+`tts-cpp/cmake/vcpkg-overlay-ports/ggml/` overlay (chained on top
+of the QVAC ggml port via `VCPKG_OVERLAY_PORTS`).  Each adds a
+new `GGML_OP_SUPERTONIC_*` op with a CPU forward as parity
+backstop and a Metal kernel as the production path.  Override
+each individually with the listed env var.
+
+1. **`kernel_supertonic_depthwise_1d`** (commit aa4f65c3) —
+   fuses edge-clamp pad + im2col + mul_mat + add into one Metal
+   dispatch for K ∈ {3, 5}.  Used by every ConvNeXt block in
+   vector_estimator, vocoder, text_encoder, duration.  Override:
+   `SUPERTONIC_DISABLE_FUSED_DEPTHWISE=1`.
+2. **`kernel_supertonic_layer_norm_channel`** (commit 55adf87b)
+   — fuses permute + cont + ggml_norm + mul + add + permute +
+   cont into one dispatch.  Per time-step, one threadgroup with
+   simd_sum reductions for mean/var.  Override:
+   `SUPERTONIC_DISABLE_FUSED_LAYER_NORM=1`.
+3. **`kernel_supertonic_pw2_residual`** (commit 7a5c0393) —
+   fuses `add(bias) + mul(gamma) + add(residual)` (3 ops) into
+   one dispatch at the tail of each vector ConvNeXt block.
+   Override: `SUPERTONIC_DISABLE_FUSED_PW2_RESIDUAL=1`.
+4. **`kernel_supertonic_bias_gelu`** (commit df20115d) — fuses
+   `add(bias) + gelu_erf` between pw1 and pw2 of every vector
+   ConvNeXt block.  Uses the same `erf_approx<float>` template
+   as the stock `kernel_gelu_erf_f32` so the fused output is
+   bit-identical to the unfused chain.  Override:
+   `SUPERTONIC_DISABLE_FUSED_BIAS_GELU=1`.
+
+Plus a load-time optimization (5) and a follow-up vocoder fusion (6):
+
+5. **Pre-transposed matmul weights** (commits e935ffb7,
+   da9553e3) — materialize transposed copies of every
+   `:onnx::MatMul_*` source weight at engine load time on
+   non-CPU backends.  Eliminates the runtime
+   `cont(transpose(w))` dispatch that `dense_matmul_time_ggml`
+   (and the direct `ggml_mul_mat` time-projection sites) used
+   to emit on every graph compute — ~24 cont sites × 5 CFM
+   steps = 120 dispatches saved per synth.  Override:
+   `SUPERTONIC_DISABLE_WEIGHT_PRETRANSPOSE=1`.
+
+6. **Vocoder pw1 fused bias_gelu** (commit 64efe99a) — extends
+   the bias_gelu fusion to the vocoder's ConvNeXt blocks.
+   `conv1d_causal_ggml(..., b=nullptr, ...)` skips the internal
+   bias-add and feeds the matmul output to the fused op
+   directly.  CPU keeps its existing cblas-inside path.  ~10
+   dispatches saved per vocoder pass.
+
+Also investigated but **not landed**:
+
+- **Vocoder pw2_residual fusion** (commit 53a58f5b explains
+  why) — the vocoder stores its block scale as
+  `gamma.ne[0] == 1` (a single learnable scalar), while
+  `pw2_residual_ggml` requires `gamma.ne[0] == C`.  Shapes
+  incompatible, would need a new vocoder-specific scalar-gamma
+  variant op for a ~0.4 ms projected gain — below the noise
+  floor of the current bench.  Skipped.
+
+### Final Tier 2 bench
+
+Apple M2, q8_0, 4 threads, 5-step CFM, 3.20 s of audio, 10
+runs + 2 warmup, `--n-gpu-layers 1` (numbers from
+`artifacts/bench/supertonic-cpp-metal-final.json`):
+
+  | Stage                       | Phase B Metal | Tier 2 final | CPU q8_0 ref |
+  |-----------------------------|--------------:|-------------:|-------------:|
+  | preprocess                  |       0.01 ms |      0.02 ms |     0.01 ms  |
+  | duration                    |       2.50 ms |      6.03 ms |     1.76 ms  |
+  | text_encoder                |      13.83 ms |     18.47 ms |    13.44 ms  |
+  | vector_estimator (5 steps)  |     173.08 ms |     97.76 ms |    94.86 ms  |
+  | vocoder                     |      59.74 ms |     52.02 ms |    43.44 ms  |
+  | **total**                   |  **249.92ms** |  **174.49ms**| **153.52ms** |
+  | RTF                         |        0.078  |       0.054  |       0.048  |
+  | real-time multiplier        |       12.82×  |       18.4×  |       20.8×  |
+
+**Cumulative Tier 1 + Tier 2 wins: -75 ms total (-30%) vs the
+Phase B Metal baseline.**  Parity vs CPU q8_0 reference holds
+at correlation 0.9999 / L∞ ≈ 1.7e-3 across the whole sequence
+— bit-identical pipeline output before/after the optimizations
+on Metal.
+
+The pretranspose A/B (env-var off vs on, same machine state)
+is the cleanest single-knob signal: total 182.75 → 174.38 ms
+(-8.37 ms), vec_est 108.61 → 100.45 ms (-8.16 ms).
+
+### Where the remaining 21 ms gap-to-CPU lives
+
+  | Stage                       | Metal Tier 2 | CPU q8_0 | Gap          |
+  |-----------------------------|-------------:|---------:|-------------:|
+  | vector_estimator (5 steps)  |      97.76 ms |   94.86 ms |     2.90 ms |
+  | vocoder                     |      52.02 ms |   43.44 ms |     8.58 ms |
+  | text_encoder                |      18.47 ms |   13.44 ms |     5.03 ms |
+  | duration / other            |        ~6 ms  |     ~1.7 ms |    ~4 ms    |
+  | **total**                   |  **174.49ms** | **153.52ms** | **20.97 ms** |
+
+Vector estimator is now Metal's strongest stage in absolute
+terms (within 3 ms of CPU on its 100-ms budget); vocoder is at
+parity with ONNX-CPU (52.0 vs 51.3 ms) and is now the dominant
+remaining gap-to-CPU.  Vocoder uses `conv1d_causal_ggml` not
+`dense_matmul_time_ggml`, so neither the pretranspose
+optimization nor (until 64efe99a) the fused bias_gelu applied
+there — the weights are already in conv1d-kernel `[K, IC, OC]`
+layout from the GGUF.
+
+### What's still pursuable post-Tier-2 (not in this round)
+
+1. **KV stacking on cross-attention** — concat W_key and
+   W_value along out-dim at load time so the two text-side
+   matmuls become one (Q stays separate, different input).
+   ~30 invocations per synth × ~0.1-0.2 ms each ≈ 3-6 ms
+   projected, but the small matmul size means this might be
+   noise-bound.  Could combine with pretranspose: stack the
+   pretransposed K+V into one wider weight.
+2. **Vocoder `pw2_residual_scalar_gamma` op** — new
+   vocoder-specific fused op handling `gamma.ne[0]==1`.  ~10
+   dispatches saved per vocoder pass ≈ 0.4 ms.  Below noise
+   floor; skip unless other wins are found first.
+3. **Full ConvNeXt block fusion** (the original T2.3 plan) —
+   deferred because pw1/pw2 weights are 4C×C ≈ 1MB each,
+   vastly exceeding M2's 32KB threadgroup memory budget.  Would
+   need to call out to `ggml_mul_mat` for the matmuls, which
+   defeats most of the fusion benefit.
+4. **Activation layout change** — eliminate the 32 remaining
+   `cont(transpose(activation))` calls on Q/K/V activations per
+   per-step graph.  Would require touching the whole attention
+   pipeline (rope, flash_attn, output projection) — too
+   invasive for the projected ~3-5 ms win.
+5. **CFM step batching (B=2)** — N/A for Supertonic.  The CFM
+   loop in `supertonic_engine.cpp` is a sequential ODE solver
+   (each step depends on the previous output), unlike
+   chatterbox's CFG cond+uncond pairs which fit naturally into
+   `ne[2]` batching.
+
+### Tier 2 closing the loop
+
+The Tier 2 PR (`feat/metal-optimization-supertonic` on
+tetherto/qvac-ext-lib-whisper.cpp) lands as:
+- 4 custom Metal kernels behind individual env-var gates
+- Load-time pretranspose mechanism + helper APIs
+  (`try_pretransposed_weight`, `dense_matmul_time_pretransposed_ggml`)
+- All under a local `tts-cpp/cmake/vcpkg-overlay-ports/ggml/`
+  port that chains on top of the QVAC ggml port via
+  `VCPKG_OVERLAY_PORTS`.
+- CPU q8_0 perf unchanged (the fused-kernel + pretranspose
+  paths are all gated on `!use_cpu_fastpath`).
+- Parity vs CPU reference: corr 0.9999 / L∞ 1.7e-3 throughout.
+
+## Phase A + B follow-up (2026-05-11)
+
+### Landed on this PR after Tier 2 closed
+
+| Commit     | Change | Bench delta (M2, 10 runs) |
+|------------|--------|---------------------------|
+| `bfb44092` | Phase 0: `--precision {f32,f16,q8_0}` flag + parity harness | 0 ms (infra) |
+| `8f0be955` | A1+A2: single command buffer per synth + on-GPU latent through 5-step CFM loop | –1.37 ms total |
+| `1b7496f6` | A3 step 1: enable `--precision q8_0` storage on Metal (asymmetric load) | –6.17 ms total |
+
+Cumulative on top of Tier 2: total **174.49 ms → 166.39 ms** (–4.6%).
+Real-time multiplier 18.4× → 19.3×.
+
+### Why the wins are smaller than the original Phase A+B projection
+
+The Phase A roadmap projected 30+ ms of cumulative gains.  Reality on M2
+delivered ~8 ms.  Three things drove the gap:
+
+1. **Metal command-buffer submission on M2 is much cheaper than I
+   estimated.** I cited "~1-2 ms fixed overhead per dispatch" based on
+   an earlier diagnostic; actual cost is closer to 0.1-0.3 ms.  A1+A2's
+   "single command buffer per synth" win (eliminating 4 inter-step
+   dispatches) was projected –15 to –20 ms, landed at –1.4 ms.
+2. **Unified memory makes `tensor_get`/`tensor_set` between stages
+   nearly free.** There's no PCIe transfer cost to amortize.  The
+   "on-GPU latent" win that's a big deal on discrete-GPU x86 doesn't
+   apply on Apple silicon.
+3. **`kernel_mul_mm_q8_0_f32` never fires.** A3's projected –20 to –30 ms
+   was the matmul-bandwidth win from running ggml's optimized quantized
+   matmul kernel.  But the kernel only dispatches when the quantized
+   weight is `src0` (a) of `ggml_mul_mat`.  Supertonic's `[T, IC]`
+   activation layout forces the weight into `src1` (b) via the
+   `conv1d_f32` im2col wrapper, and ggml-metal falls back to a path
+   that dequantizes to f32 first.  **The full A3 win is unlocked by
+   B2 (activation layout permutation) — and only by it.**
+
+### A4 (text_encoder + duration consolidation) — deferred
+
+Analyzed but not implemented: text_encoder currently fires ~10 separate
+`ggml_backend_graph_compute` calls (1 ConvNeXt front + 4 relpos attn
++ 4 ffn + 2 speech_prompted_attn × 2-graph pattern).  Duration adds
+~4 small dispatches.
+
+Full consolidation into 1-2 graphs would require:
+- Extracting each sub-builder (`relpos_attention_ggml`, `ffn_block_ggml`,
+  `speech_prompted_attention_ggml`) into append-to-graph helpers (the
+  same shape of refactor that A1+A2 did for the per-CFM-step subgraph).
+- Converting the host-side residual + layer_norm + tanh-key-packing
+  work between sub-graphs into ggml ops.
+- Engineering: 4-8 focused hours.
+- Realistic return based on A1+A2's measured ratio: **–2 to –4 ms total**.
+
+Deferred because: (a) ROI per hour is now smaller than B1/B2, (b) the
+text_encoder + duration combined budget is only ~21 ms — even a perfect
+collapse to 1 dispatch each saves ~5-7 ms maximum, with no compounding
+effect on the other stages, (c) it doesn't unlock anything else
+downstream (unlike B2 which unlocks A3 step 2).
+
+Re-evaluate after B2 lands.  If the team needs every ms (e.g. for a
+constrained-device target), this is the next item to revisit.
+
+### Next levers on the table
+
+| Phase | Projected (post-A1+A2 calibration) | Unblocks | Cost |
+|-------|-----------------------------------:|----------|------|
+| B1 — f16 activations end-to-end | –5 to –10 ms | nothing | medium |
+| **B2 — activation layout permutation** | –3 to –5 ms direct, **+ unlocks A3 step 2 (–15 to –25 ms)** | A3 step 2 | high (invasive, touches rope + flash_attn + every attention site) |
+| A3 step 2 — q8_0 matmul kernel firing (after B2) | –15 to –25 ms (theoretical) | — | medium-low (B2 does the heavy lifting) |
+| B3 — argument buffer reuse | –2 to –5 ms | nothing | high (Metal backend internals) |
+| A4 — text_encoder + duration consolidation | –2 to –4 ms | nothing | medium-high |
+
+**The highest-leverage move now is B2.**  Without it, A3's matmul win is
+unreachable.  The combined B2 + A3-step-2 stack is the only realistic
+path to "Metal beats CPU outright on M2."
+
+### B1 / B2 / B3 status after attempted continuation (2026-05-11)
+
+After deferring A4, I attempted B1 (f16 end-to-end) and scoped B2.  Both
+proved too large to fit in a single follow-up session; they are
+documented here for the next round.
+
+**B1 (f16 activations) — partially scaffolded, deferred:**
+- Storage already worked from Phase 0 (load logic converts q8_0 → f16
+  correctly in f16 mode).
+- Lifting the rejection at load time made compute reach the graph
+  stage, then fail at `ggml-metal-ops.cpp:2818` (`ggml_metal_op_bin`'s
+  assertion that both srcs are f32).  A non-f32 tensor is flowing into
+  a `ggml_add` / `ggml_mul` somewhere in the graph — likely an
+  auto-fused add after a matmul where ggml-metal picks the matmul
+  output type as f16 instead of f32.
+- The cleanup pass needed (audit every binary op's input types and
+  force-cast where required) is the same kind of work B2 does
+  comprehensively for activation layout.  Pair them in a "graph-wide
+  type/layout consistency pass" PR.
+
+**B2 (activation layout permutation) — fully scoped, deferred:**
+The 24 `cont(transpose(activation))` calls per per-step graph (3 per
+QKV in 8 attention sites = 24, plus the post-attn out projection
+transpose) come from converting matmul output `[T, A]` into
+`[A, L]` for rope + flash_attn.  Eliminating them requires:
+
+1. **Matmul output layout flip** — output `[A=OC, T]` directly via
+   `ggml_mul_mat(pretransposed_w_[IC,OC], activation_[IC,T])`.
+   Requires the activation already in `[IC, T]` format — which
+   requires every upstream op to produce `[IC, T]`.
+2. **New `layer_norm_channel_[C,T]` Metal kernel** — the current
+   fused kernel assumes `[T, C]` and dispatches one threadgroup per
+   time step, threads stride over channels.  For `[C, T]` the
+   threadgroup decomposition flips: one threadgroup per channel,
+   threads stride over time, OR one threadgroup per time step with
+   different stride math.  Roughly 4-8 hours of Metal kernel work.
+3. **Audit every `ggml_add` / `ggml_mul` site** for broadcast
+   compatibility under the new layout (most should work via
+   `repeat_like`'s native broadcast, but every site needs a check).
+4. **Verify rope still works on `[D, L, H]` view** of the new
+   `[A, L]` activation (likely fine — rope's input is already
+   width-major).
+
+The unblocked A3 step 2 win (Metal dispatches
+`kernel_mul_mm_q8_0_f32` natively) is what makes B2 worth the work.
+Together they target ~25-30 ms of additional Metal speedup vs
+current 166 ms.  Without A3 step 2, B2 alone delivers ~-3 to -5 ms
+(eliminating the cont(transpose) dispatches), which is below the
+maintenance cost of the kernel rewrite.
+
+Realistic estimate: 3-5 focused days as a dedicated PR.  Worth doing
+when the goal is "Metal beats CPU on M2" — which is currently still
+13 ms away (Metal 166.39 / CPU 153.52).
+
+**B3 (argument buffer reuse) — scoped, deferred:**
+Metal's `MTLIndirectCommandBuffer` lets the host pre-encode a command
+buffer once and bind new input arguments per call, eliminating the
+per-call command-buffer encoding cost.  Equivalent to CUDA Graph
+Capture.
+
+Requires changes inside the ggml-metal backend (the `ggml_metal_op_*`
+encode functions, the residency-set lifecycle).  Cross-cutting work
+touching files outside `tts-cpp/cmake/vcpkg-overlay-ports/ggml/`'s
+current patches — could grow the overlay considerably.
+
+Realistic estimate: ~1 week including upstream-friendly design,
+since the right shape of this change is "improve ggml-metal for all
+users" not "patch ggml just for Supertonic."  Better as a contribution
+to the ggml-org project than a Supertonic-private optimization.
+
+### Closing the loop on Phase A+B follow-up
+
+Cumulative Metal perf trajectory across this PR:
+- Phase B baseline (correctness port):  **249.92 ms**
+- Tier 2 final (4 fused kernels + pretranspose): **174.49 ms**
+- Phase A+B follow-up (A1+A2 + A3 step 1):  **166.39 ms**
+
+That's **-83 ms / -33% total** on Metal vs the starting baseline.
+Real-time multiplier 12.82× → 19.34×.  CPU q8_0 still wins by 13 ms;
+ONNX-CPU by 21 ms.  Closing those final gaps requires B2 + A3 step 2
+as outlined above — substantial work, but the path is clear.
+
+Parity vs CPU reference held at corr ≥ 0.998 / L∞ ≤ 0.05 throughout
+every commit.  Multi-precision harness (`--precision f32|f16|q8_0`)
+ready to validate B1 + A3 step 2 wins when they land.
+
+### B2 partial landed (2026-05-11) — Metal vec_est beats CPU
+
+Investigated a smaller-scope B2 implementation and found that the
+"swap `ggml_mul_mat` arg order at Q/K/V projection sites" trick
+captures most of B2's direct win without any layer_norm kernel
+rewrite or full activation-layout permutation.
+
+The mechanism: `conv1d_f32(im2col, kernel)` produces `[T, A]` (because
+mul_mat(im2col_[IC,T], kernel_[IC,OC]) yields [T, OC]).  The Q/K/V
+projection sites then have to `cont(transpose(q_tc))` to get the
+`[A, L]` shape that rope + flash_attn want.  By calling
+`mul_mat(kernel, im2col)` instead — kernel as src0 — the result
+lands in `[A, T]` directly.  Both operands are still non-transposed
+so the assertion passes.
+
+Shipped as a new `dense_matmul_time_wt_pretransposed_ggml` helper.
+Eight call sites updated: 4 text-attention Q/K/V/out + 4
+style-attention Q/K/V/out across all per-step graph groups.  ~24
+cont(transpose) dispatches × 5 CFM steps = ~120 ops eliminated
+per synth.
+
+Bench (Apple M2, 10 runs + 2 warmup):
+- pre-B2 f32:    total 172.56 ms / vec_est 99.07 ms
+- **B2 partial f32: total 160.88 ms / vec_est 91.61 ms**
+- delta:         -11.68 ms total / -7.46 ms vec_est
+
+**This is the first time Metal vec_est beats CPU baseline** (91.61
+vs 94.86 ms).  Total Metal 160.88 ms is now within 7.4 ms of CPU's
+153.52 ms, and within 16 ms of ONNX's 144.89 ms.
+
+Cumulative trajectory:
+- Phase B baseline:   249.92 ms (12.8× real-time)
+- Tier 2 final:       174.49 ms (18.4×)
+- Phase A+B + B2 partial: **160.88 ms (19.9×)**  ←  -36% from start
+
+**The A3 step 2 unlock (q8_0 matmul kernel dispatch) requires
+pretransposing q8_0 weights at load time.** Attempted, but the
+`ggml_reshape_3d(w_pre, 1, IC, OC)` call inside the helper produces
+an invalid q8_0 tensor when ne[0]=1 (q8_0 requires 32-element
+block alignment on the inner dim).  A clean q8_0 path needs either
+a different reshape strategy (skip the K=1 conv1d framing entirely
+and call `ggml_mul_mat(w_pre_q8, im2col_via_a_different_path)`),
+or an in-graph `ggml_im2col` that accepts a 2D kernel directly.
+Either is a focused half-day's work for ~10-20 ms more savings
+(matmul kernel bandwidth).  Deferred to a separate session.
+
+### Full B2 + vocoder CT landed (2026-05-12) — Metal fastest end-to-end
+
+Built on the B2-partial trick by parameterising every fused custom
+Metal kernel on per-axis element strides (`sxt`, `sxc`, `syt`, `syc`)
+so the same compiled kernel handles both `[T, C]` and `[C, T]`
+activations.  ggml overlay-port bumped 12 → 13.  Added `_ct`
+constructors for `layer_norm_channel`, `depthwise_1d`, `pw2_residual`,
+`bias_gelu`, `edge_pad_1d`.
+
+In `supertonic_vector_estimator.cpp`: new `vector_convnext_ggml_ct`
+runs the full ConvNeXt block on `[C, T]` activations.  Pointwise
+K=1 Conv1d becomes a direct `ggml_mul_mat(w[IC,OC], x[IC,T])` (no
+im2col, no transpose).  All 16 ConvNeXt blocks in the per-step
+graph (prologue × 4 + 3 group_prep × 4 + tail × 4) wrap a single
+entry permute and a single exit permute around the chain.
+
+In `supertonic_vocoder.cpp`: same pattern for the 10-block vocoder
+ConvNeXt chain.  Vocoder differences vs vector_estimator: (1)
+depthwise is causal (left-only pad), no `_ct` causal kernel yet —
+stays on `[T, C]` with two intra-block permutes; (2) gamma is
+scalar `[1]`, so the `pw2_residual_ct` fused op doesn't fit, keep
+unfused `mul(scalar gamma) + add(residual)` tail; (3) `norm_g` /
+`norm_b` ship as `[1, C]` — same flatten-with-`ggml_reshape_1d`
+quirk as `.gamma` in vector_estimator.
+
+Discovered along the way: the legacy `pw2_residual_ggml` wrapper's
+`gamma->ne[0] == x->ne[1]` gate was silently rejecting the fused
+path for ConvNeXt all along (GGUF ships `.gamma` as `[1, C, 1, 1]`
+not `[C]`).  The `_ct` wrapper flattens it once with
+`ggml_reshape_1d`, so this is the first time the fused
+`pw2_residual` op actually runs on the ConvNeXt residual.
+
+Bench (Apple M2, q8_0 GGUF, 4 threads, 5-step CFM, 5 runs + 1 warmup,
+all four backends benched in sequence on the same machine state):
+
+| Stage (ms median)            | **ggml Metal** | ggml CPU | ONNX CPU | ONNX CoreML |
+|------------------------------|---------------:|---------:|---------:|------------:|
+| preprocess                   |          0.02 |     0.01 |     0.05 |        0.05 |
+| duration                     |          3.27 |     1.49 |     1.26 |        8.17 |
+| text_encoder                 |         12.11 |    11.70 |     8.22 |       16.26 |
+| **vector_estimator** (5 step)|     **57.87** |    90.36 |    77.04 |      177.89 |
+| **vocoder**                  |     **17.11** |    39.38 |    49.55 |       50.29 |
+| **total**                    |     **91.37** |   142.92 |   136.32 |      255.90 |
+| RTF (lower is faster)        |     **0.029** |    0.045 |    0.043 |       0.080 |
+| **real-time multiplier**     |     **35.1×** |   22.4×  |   23.5×  |       12.5× |
+
+Cumulative trajectory:
+- Phase B baseline:        249.92 ms (12.8× real-time)
+- Tier 2 final:            174.49 ms (18.4×)
+- Phase A+B + B2 partial:  160.88 ms (19.9×)
+- **Full B2 + vocoder CT: 91.37 ms (35.1×)**  ← −63% from Phase B start
+
+Overrides: `SUPERTONIC_DISABLE_CT_CONVNEXT=1` (vector_estimator),
+`SUPERTONIC_DISABLE_CT_VOCODER=1` (vocoder).
+
+Open follow-ups (small ROI, separate PR):
+- Causal-pad mode on `depthwise_1d_ct` → single chain-level
+  permute for the vocoder (currently 2 intra-block permutes per
+  block).  Projected -1 to -3 ms vocoder.
+- B1 — f16 activations end-to-end.  Storage loads today;
+  compute hits `ggml_metal_op_bin`'s f32 assertion.  Needs a
+  graph-wide binary-op type cleanup.
+- B3 — argument buffer reuse via `MTLIndirectCommandBuffer`.
+  Better as an upstream ggml-metal contribution than a
+  Supertonic-private patch.
+
+### Out of scope for this baseline
+
+- CUDA/Vulkan paths (host is Apple silicon; address Metal first).
+- Multilingual / non-English voice perf — voice-agnostic.
+
 ### Distribution
 
 - Publish generated GGUFs externally if reviewers/users should avoid local
diff --git a/tts-cpp/README.md b/tts-cpp/README.md
index 9a8d2286c99..b46c1ed4ea9 100644
--- a/tts-cpp/README.md
+++ b/tts-cpp/README.md
@@ -338,28 +338,38 @@ target_link_libraries(my_app PRIVATE tts-cpp::tts-cpp)
 ```
 
 For development out of this in-tree subtree (running the parity
-harnesses, prototyping API changes, etc.) the canonical build is:
+harnesses, prototyping API changes, etc.) the canonical build is the
+**bundled-ggml dev flow**:
+
+```bash
+bash tts-cpp/scripts/setup-ggml.sh    # clones qvac-ext-ggml@speech into tts-cpp/ggml/
+cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \
+  -DTTS_CPP_USE_SYSTEM_GGML=OFF
+cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)
+```
+
+`setup-ggml.sh` checks out the pinned tetherto/qvac-ext-ggml@speech
+commit (which already carries every QVAC infrastructure patch + the
+Supertonic 2 fused custom op family — no `patches/` overlay needed).
+CMakeLists's `add_subdirectory(ggml)` path then consumes it directly
+with `GGML_NATIVE=ON` for native ARM/SIMD codegen — typically ~10%
+faster on M-series than the vcpkg-port flavor's portable build.
+
+Downstream production builds use the system-installed `ggml` instead:
 
 ```bash
-# Install the speech-stack ggml port via vcpkg first; then:
 cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_TOOLCHAIN_FILE=<vcpkg_root>/scripts/buildsystems/vcpkg.cmake
 cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)
 ```
 
-`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` here so the build picks
-up the patched ggml from vcpkg automatically; flipping it `OFF` in
-this subtree is rejected at configure time (no `patches/` to apply).
-GPU acceleration is selected at the ggml-port level - the
-`ggml-speech` port already carries the Metal / Vulkan / OpenCL
-backend support its consumers ask for; pass `--n-gpu-layers 99` at
-runtime to actually use the compiled GPU backend.
-
-If you need a bundled-ggml dev build (`add_subdirectory(ggml)` with
-patches applied locally rather than coming from vcpkg), use the
-standalone [`chatterbox.cpp`](https://github.com/gianni-cor/chatterbox.cpp)
-repo - the source-of-truth this subtree was copied from - which keeps
-`scripts/setup-ggml.sh` + `patches/` for that flow.
+`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` for this flow, finding
+the `ggml-speech` port from qvac-registry-vcpkg (which pulls
+qvac-ext-ggml@speech with patches as commits).  GPU acceleration is
+selected at the ggml-port level — the port already carries the
+Metal / Vulkan / OpenCL backend support its consumers ask for; pass
+`--n-gpu-layers 99` at runtime to actually use the compiled GPU
+backend.
 
 ### Useful CMake options
 
diff --git a/tts-cpp/include/tts-cpp/supertonic/engine.h b/tts-cpp/include/tts-cpp/supertonic/engine.h
index 6b50491720f..fad8fffd14d 100644
--- a/tts-cpp/include/tts-cpp/supertonic/engine.h
+++ b/tts-cpp/include/tts-cpp/supertonic/engine.h
@@ -14,7 +14,15 @@
 //
 //     EngineOptions opts;
 //     opts.model_gguf_path = "models/supertonic.gguf";
-//     opts.n_gpu_layers    = 0;                      // CPU only today
+//     opts.n_gpu_layers    = 0;                      // 0 = CPU; >0 enables Metal
+//                                                    // on macOS / CUDA / Vulkan /
+//                                                    // OpenCL when compiled in.
+//                                                    // Metal on Apple silicon is the
+//                                                    // fastest backend as of 2026-05-12
+//                                                    // (~35× realtime on M2, beats
+//                                                    // ggml-CPU, ONNX-CPU and ONNX-CoreML
+//                                                    // on every stage that matters).
+//                                                    // See PROGRESS_SUPERTONIC.md.
 //
 //     Engine engine(opts);
 //     for (const auto & line : lines) {
@@ -43,6 +51,26 @@
 
 namespace tts_cpp::supertonic {
 
+// Precision of the matmul weights held in the resident model buffer.  The
+// GGUF stores these weights as q8_0; this enum selects how they are loaded:
+//   - F32  (default): expand q8_0 to f32 at load time.  CPU path uses
+//          cblas/AMX f32 matmul.  Metal path uses kernel_mul_mat_f32_f32.
+//          Highest accuracy + simplest, but on Metal misses the 4×
+//          weight-bandwidth win of running the native q8_0 matmul kernel.
+//   - F16  (Phase B1): expand q8_0 to f16 at load time, run f16 matmul
+//          with f32 accumulator.  ~2× less activation bandwidth on Metal,
+//          may drift slightly across the 5 CFM steps (the parity harness
+//          in scripts/validate-precision-parity.sh accepts L_inf <= 0.10).
+//   - Q8_0 (Phase A3): keep weights as q8_0 in the model buffer, let
+//          ggml's quantized matmul kernels dispatch directly.  Metal-only
+//          (Phase A3 makes the load logic asymmetric: q8_0 on Metal, f32
+//          on CPU).
+enum class Precision {
+    F32,   // default: q8_0 expanded to f32 at load time
+    F16,   // NOTE(review): PROGRESS_SUPERTONIC.md (B1) says Metal compute still asserts — confirm before relying on it
+    Q8_0,  // weights stay q8_0 on Metal; CPU still loads f32
+};
+
 struct EngineOptions {
     // Required.
     std::string model_gguf_path;
@@ -56,6 +84,11 @@ struct EngineOptions {
     int   n_threads     = 0;
     int   n_gpu_layers  = 0;
 
+    // Compute precision for matmul weights — see Precision enum above.
+    // Default F32 is the current behaviour (load q8_0 GGUF, expand to f32).
+    // F16 / Q8_0 are opt-in GPU paths (Metal only so far; F16 compute is still partial).
+    Precision precision = Precision::F32;
+
     // F16 K/V flash-attention in the vector estimator.  When -1, the
     // engine auto-enables this on GPU backends (non-CPU) and disables
     // it on CPU; pass 1 / 0 to force the setting regardless of the
@@ -72,6 +105,9 @@ struct EngineOptions {
     // Halves the GPU read bandwidth into those ops with a small
     // (≤ 2e-3 abs / 5e-3 cosine) numerical drift on the end-to-end
     // synth.  Mirrors chatterbox's CHATTERBOX_F16_CFM gate.
+    // Orthogonal to `precision`: this is a per-op runtime selector for
+    // the OpenCL hot-weight materialisation, while `precision` decides
+    // the storage type of all matmul weights uniformly.
     int f16_weights = -1;
 
     // Optional path to a .npy file containing the initial noise tensor of
diff --git a/tts-cpp/scripts/setup-ggml.sh b/tts-cpp/scripts/setup-ggml.sh
new file mode 100755
index 00000000000..656d0b61f24
--- /dev/null
+++ b/tts-cpp/scripts/setup-ggml.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+#
+# setup-ggml.sh — clone the qvac-ext-ggml@speech branch into tts-cpp/ggml/
+#
+# The bundled-ggml dev build path for tts-cpp out of this in-tree subtree.
+# Replaces the vcpkg-port consumption when you want a fast iteration loop
+# without going through vcpkg installs.
+#
+# Pinned to the head of the `speech` branch (a tetherto/qvac-ext-ggml fork
+# of ggml-org/ggml carrying all QVAC infrastructure patches + the
+# Supertonic 2 fused custom op family pre-applied as commits — no
+# patches/ directory needed at this layer).
+#
+# Usage:
+#   bash tts-cpp/scripts/setup-ggml.sh
+#   cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF
+#   cmake --build tts-cpp/build -j
+#
+# To update to a newer pin: bump GGML_REF below and re-run.  The script
+# is idempotent — re-running checks out the right ref into the existing
+# tts-cpp/ggml/ clone without re-cloning.
+
+set -euo pipefail
+
+GGML_REPO_URL="https://github.com/tetherto/qvac-ext-ggml.git"
+GGML_REF="60a172e48f699bd0a00575ef911feed9473b2187"   # merge of qvac-ext-ggml#8 (speech HEAD)
+
+# Resolve tts-cpp/ggml from this script's own location so the caller's cwd does not matter.
+GGML_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/ggml"
+
+if [ ! -d "${GGML_DIR}/.git" ]; then
+    echo "setup-ggml: cloning qvac-ext-ggml @ ${GGML_REF:0:10} into ${GGML_DIR}"
+    rm -rf "${GGML_DIR}"
+    git clone --depth 1 --no-tags "${GGML_REPO_URL}" "${GGML_DIR}"
+else
+    echo "setup-ggml: existing clone at ${GGML_DIR} — fetching + checking out pin ${GGML_REF:0:10}"
+fi
+
+# Fresh and existing clones converge here: fetch the pinned commit by SHA
+# (a depth-1 clone is not guaranteed to contain it) and detach HEAD onto it.
+git -C "${GGML_DIR}" fetch --depth 1 origin "${GGML_REF}"
+git -C "${GGML_DIR}" checkout --detach "${GGML_REF}"
+
+echo "setup-ggml: tts-cpp/ggml/ ready at $(git -C "${GGML_DIR}" rev-parse --short HEAD)"
+echo "setup-ggml: next: cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF"
diff --git a/tts-cpp/scripts/validate-precision-parity.sh b/tts-cpp/scripts/validate-precision-parity.sh
new file mode 100755
index 00000000000..ce6c29208c8
--- /dev/null
+++ b/tts-cpp/scripts/validate-precision-parity.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+# Multi-precision parity + bench harness for Supertonic 2.
+#
+# For each supported precision (f32, f16, q8_0):
+#   1. Synthesizes a reference WAV on CPU at that precision.
+#   2. Synthesizes the same WAV on Metal at the same precision.
+#   3. Reports parity (corr, L_inf, RMS) between the two.
+#   4. Optionally runs supertonic-bench at the same precision and emits
+#      a per-precision JSON artifact alongside.
+#
+# Usage:
+#   bash scripts/validate-precision-parity.sh [--bench] [--text TEXT] [--model PATH]
+#                                             [--precisions f32,f16,q8_0]
+#
+# Precisions not yet wired through the graph builders fail at load with
+# a clear "scaffolded but not yet supported" message and are skipped (not
+# counted as a parity failure).  This lets the harness be useful right
+# now while Phase A3 / B1 work lands.
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+MODEL="$ROOT/models/supertonic2.gguf"
+TEXT="The quick brown fox jumps over the lazy dog."
+PRECISIONS="f32,f16,q8_0"
+DO_BENCH=0
+RUNS=10
+WARMUP=2
+THREADS=4
+ARTIFACT_DIR="$ROOT/artifacts/bench/parity-matrix"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --bench)       DO_BENCH=1; shift ;;
+        --text)        TEXT="$2"; shift 2 ;;
+        --model)       MODEL="$2"; shift 2 ;;
+        --precisions)  PRECISIONS="$2"; shift 2 ;;
+        --runs)        RUNS="$2"; shift 2 ;;
+        --warmup)      WARMUP="$2"; shift 2 ;;
+        --threads)     THREADS="$2"; shift 2 ;;
+        --artifact-dir) ARTIFACT_DIR="$2"; shift 2 ;;
+        -h|--help)
+            sed -n '2,/^set -euo/p' "$0" | sed 's/^# //; s/^#//; /^set -euo/d'
+            exit 0 ;;
+        *) echo "unknown arg: $1" >&2; exit 2 ;;
+    esac
+done
+
+CLI="$ROOT/build/supertonic-cli"
+BENCH="$ROOT/build/supertonic-bench"
+PY="$ROOT/.venv/bin/python3"
+if [[ ! -x "$CLI" ]]; then
+    echo "build/supertonic-cli not found. Run 'cmake --build build --target supertonic-cli' first." >&2
+    exit 1
+fi
+if [[ "$DO_BENCH" -eq 1 && ! -x "$BENCH" ]]; then
+    echo "--bench requested but build/supertonic-bench not found." >&2
+    exit 1
+fi
+if [[ ! -x "$PY" ]]; then
+    echo "$PY not found. Create a venv at $ROOT/.venv with numpy installed." >&2
+    exit 1
+fi
+
+mkdir -p "$ARTIFACT_DIR"
+TMP="$(mktemp -d)"
+trap 'rm -rf "$TMP"' EXIT
+
+printf "\nSupertonic 2 multi-precision parity + bench harness\n"
+printf "  model:      %s\n" "$MODEL"
+printf "  text:       %.60s%s\n" "$TEXT" "$([[ ${#TEXT} -gt 60 ]] && echo '...')"
+printf "  precisions: %s\n" "$PRECISIONS"
+printf "  bench:      %s\n\n" "$([[ "$DO_BENCH" -eq 1 ]] && echo 'yes' || echo 'no')"
+
+OVERALL_RC=0
+IFS=',' read -r -a PREC_ARR <<< "$PRECISIONS"
+for P in "${PREC_ARR[@]}"; do
+    P_TRIM="$(echo "$P" | xargs)"
+    CPU_WAV="$TMP/cpu-$P_TRIM.wav"
+    MTL_WAV="$TMP/mtl-$P_TRIM.wav"
+
+    printf "=== %s ===\n" "$P_TRIM"
+
+    set +e
+    CPU_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 0 \
+                       --precision "$P_TRIM" --out "$CPU_WAV" 2>&1)"
+    CPU_RC=$?
+    MTL_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 1 \
+                       --precision "$P_TRIM" --out "$MTL_WAV" 2>&1)"
+    MTL_RC=$?
+    set -e
+
+    if echo "$CPU_LOG$MTL_LOG" | grep -qE "scaffolded but not yet|partially scaffolded"; then
+        printf "  SKIP: precision %s not yet wired through graph builders (Phase A3/B1)\n\n" "$P_TRIM"
+        continue
+    fi
+    # Tolerate the harmless post-write atexit `GGML_ASSERT([rsets->data count] == 0)`
+    # that fires on Metal cleanup AFTER the WAV is fully written.  Treat the run as
+    # successful iff the WAV file exists and is at least 1 KB (covers a synthesized
+    # signal, well above an empty/header-only file).
+    cpu_ok=1; mtl_ok=1
+    [[ -s "$CPU_WAV" ]] || cpu_ok=0
+    [[ -s "$MTL_WAV" ]] || mtl_ok=0
+    if [[ -f "$CPU_WAV" ]]; then
+        size=$(wc -c < "$CPU_WAV")
+        [[ $size -lt 1024 ]] && cpu_ok=0
+    fi
+    if [[ -f "$MTL_WAV" ]]; then
+        size=$(wc -c < "$MTL_WAV")
+        [[ $size -lt 1024 ]] && mtl_ok=0
+    fi
+    if [[ $cpu_ok -eq 0 || $mtl_ok -eq 0 ]]; then
+        printf "  FAIL: synthesis errored.  cpu_rc=%d mtl_rc=%d  wav_ok cpu=%d mtl=%d\n" \
+               "$CPU_RC" "$MTL_RC" "$cpu_ok" "$mtl_ok"
+        printf "  --- cpu tail ---\n%s\n  --- metal tail ---\n%s\n\n" \
+               "$(echo "$CPU_LOG" | tail -3)" "$(echo "$MTL_LOG" | tail -3)"
+        OVERALL_RC=1
+        continue
+    fi
+
+    # Capture the status with `||`: a bare failing command would trip `set -e` and abort the matrix.
+    PY_RC=0; "$PY" - <<PY || PY_RC=$?
+import wave, numpy as np, sys
+def load(p):
+    with wave.open(p, 'rb') as w:
+        return np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16).astype(np.float32) / 32768.0
+a = load("$CPU_WAV")
+b = load("$MTL_WAV")
+n = min(len(a), len(b))
+a, b = a[:n], b[:n]
+corr = float(np.corrcoef(a, b)[0, 1])
+linf = float(np.max(np.abs(a - b)))
+rms  = float(np.sqrt(np.mean((a - b) ** 2)))
+# Per-precision tolerance: numbers chosen against observed CPU↔Metal drift
+# on the benchmark text "The quick brown fox jumps over the lazy dog.".
+# Short text routinely gets L_inf ≈ 1.7e-3; long text accumulates more
+# float-order drift across 5 CFM steps × more attention positions, landing
+# around L_inf ≈ 3.7e-2 with corr ≥ 0.998 — audibly identical for f32.
+# Q8_0 has additional drift from the dequant→transpose→requantize round-trip
+# in the asymmetric load path (Metal keeps q8_0, CPU expands to f32, so the
+# two paths use slightly differently-quantized weights).  Audibly identical.
+tol_corr = {"f32": 0.998,  "f16": 0.99,  "q8_0": 0.96}.get("$P_TRIM", 0.99)
+tol_linf = {"f32": 0.05,   "f16": 0.10,  "q8_0": 0.15 }.get("$P_TRIM", 0.10)
+print(f"  corr={corr:.6f} (tol >= {tol_corr})  L_inf={linf:.6f} (tol <= {tol_linf})  RMS={rms:.6f}")
+ok = corr >= tol_corr and linf <= tol_linf
+print("  PASS" if ok else "  FAIL parity")
+sys.exit(0 if ok else 1)
+PY
+    if [[ $PY_RC -ne 0 ]]; then OVERALL_RC=1; fi
+
+    if [[ "$DO_BENCH" -eq 1 ]]; then
+        JSON="$ARTIFACT_DIR/supertonic-mtl-${P_TRIM}.json"
+        printf "  bench --> %s\n" "$JSON"
+        "$BENCH" --model "$MODEL" --text "$TEXT" \
+                  --voice M1 --language en --steps 5 --speed 1.05 --seed 42 \
+                  --runs "$RUNS" --warmup "$WARMUP" --threads "$THREADS" \
+                  --n-gpu-layers 1 --precision "$P_TRIM" \
+                  --json-out "$JSON" 2>&1 | grep -E '^\s*(vector_estimator|vocoder|text_encoder|total|RTF|Real-time)' || true
+    fi
+    printf "\n"
+done
+
+if [[ $OVERALL_RC -eq 0 ]]; then
+    printf "All wired-up precisions pass parity.\n"
+else
+    printf "One or more precisions failed parity (or errored).\n" >&2
+fi
+exit $OVERALL_RC
diff --git a/tts-cpp/src/supertonic_bench.cpp b/tts-cpp/src/supertonic_bench.cpp
index a410fd8cedc..08ac71fb398 100644
--- a/tts-cpp/src/supertonic_bench.cpp
+++ b/tts-cpp/src/supertonic_bench.cpp
@@ -46,10 +46,29 @@ void usage(const char * argv0) {
         "          [--voice M1] [--language en] [--steps 5] [--speed 1.05]\n"
         "          [--seed 42] [--noise-npy /path/to/noise.npy]\n"
         "          [--runs 5] [--warmup 1] [--threads N] [--n-gpu-layers N]\n"
-        "          [--f16-attn 0|1] [--json-out FILE]\n",
+        "          [--f16-attn 0|1] [--precision f32|f16|q8_0]   (default: f32)\n"
+        "          [--json-out FILE]\n",
         argv0);
 }
 
+tts_cpp::supertonic::detail::supertonic_precision parse_bench_precision(const std::string & s) {
+    namespace stn = tts_cpp::supertonic::detail;  // --precision text -> internal enum; throws otherwise
+    if (s == "q8_0" || s == "Q8_0" || s == "q8") { return stn::supertonic_precision::Q8_0; }
+    if (s == "f16" || s == "F16") { return stn::supertonic_precision::F16; }
+    if (s == "f32" || s == "F32") { return stn::supertonic_precision::F32; }
+    throw std::runtime_error("unknown --precision value: " + s + " (expected f32|f16|q8_0)");
+}
+
+const char * precision_to_string(tts_cpp::supertonic::detail::supertonic_precision p) {
+    using Prec = tts_cpp::supertonic::detail::supertonic_precision;  // shorthand for the cases below
+    switch (p) {  // deliberately no `default:` label: each handled enumerator is listed explicitly
+        case Prec::Q8_0: { return "q8_0"; }
+        case Prec::F16:  { return "f16"; }
+        case Prec::F32:  { return "f32"; }
+    }
+    return "f32";  // only reached when p holds a value outside the enumerators listed above
+}
+
 double percentile(std::vector<double> v, double p) {
     if (v.empty()) return 0.0;
     std::sort(v.begin(), v.end());
@@ -123,6 +142,7 @@ int main(int argc, char ** argv) {
     // Phase 2A — F16 load-time materialization of the hot matmul /
     // pwconv weights.  -1 auto / 0 / 1 force.
     int f16_weights = -1;
+    supertonic_precision precision = supertonic_precision::F32;
 
     for (int i = 1; i < argc; ++i) {
         std::string a = argv[i];
@@ -144,6 +164,7 @@ int main(int argc, char ** argv) {
         else if (a == "--n-gpu-layers") n_gpu_layers = std::stoi(next("--n-gpu-layers"));
         else if (a == "--f16-attn") f16_attn = std::stoi(next("--f16-attn"));
         else if (a == "--f16-weights") f16_weights = std::stoi(next("--f16-weights"));
+        else if (a == "--precision") precision = parse_bench_precision(next("--precision"));
         else if (a == "--json-out") json_out = next("--json-out");
         else if (a == "-h" || a == "--help") { usage(argv[0]); return 0; }
         else { fprintf(stderr, "unknown arg: %s\n", a.c_str()); usage(argv[0]); return 2; }
@@ -151,7 +172,8 @@ int main(int argc, char ** argv) {
     if (model_path.empty() || text.empty()) { usage(argv[0]); return 2; }
 
     supertonic_model model;
-    if (!load_supertonic_gguf(model_path, model, n_gpu_layers, /*verbose=*/false, f16_weights)) {
+    if (!load_supertonic_gguf(model_path, model, n_gpu_layers,
+                              /*verbose=*/false, f16_weights, precision)) {
         fprintf(stderr, "failed to load model\n");
         return 1;
     }
@@ -291,7 +313,8 @@ int main(int argc, char ** argv) {
     printf("  text length: %zu chars\n", text.size());
     printf("  voice: %s, language: %s, steps: %d, speed: %.2f\n",
            voice.c_str(), language.c_str(), steps, speed);
-    printf("  threads: %d\n", model.n_threads);
+    printf("  threads: %d, n_gpu_layers: %d, precision: %s\n",
+           model.n_threads, n_gpu_layers, precision_to_string(precision));
     printf("  backend: %s%s\n",
            ggml_backend_name(model.backend) ? ggml_backend_name(model.backend) : "(unknown)",
            model.use_f16_attn ? " (f16_attn=on)" : "");
@@ -326,6 +349,8 @@ int main(int argc, char ** argv) {
         os << "  \"steps\": " << steps << ",\n";
         os << "  \"speed\": " << speed << ",\n";
         os << "  \"threads\": " << model.n_threads << ",\n";
+        os << "  \"n_gpu_layers\": " << n_gpu_layers << ",\n";
+        os << "  \"precision\": \"" << precision_to_string(precision) << "\",\n";
         os << "  \"audio_s\": " << last_audio_s << ",\n";
         os << "  \"runs\": " << runs << ",\n";
         os << "  \"warmup\": " << warmup << ",\n";
diff --git a/tts-cpp/src/supertonic_cli.cpp b/tts-cpp/src/supertonic_cli.cpp
index eff4309a5b7..0705fa696b5 100644
--- a/tts-cpp/src/supertonic_cli.cpp
+++ b/tts-cpp/src/supertonic_cli.cpp
@@ -20,10 +20,18 @@ void usage(const char * argv0) {
         "          [--f16-weights 0|1] (load-time F16 materialization for the\n"
         "                            audit-identified hot matmul / pwconv weights;\n"
         "                            defaults to auto: on for GPU, off for CPU)\n"
+        "          [--precision f32|f16|q8_0]   (default: f32)\n"
         "          [--noise-npy /path/to/noise.npy]\n",
         argv0);
 }
 
+tts_cpp::supertonic::Precision parse_precision(const std::string & s) {
+    if (s == "q8_0" || s == "Q8_0" || s == "q8") { return tts_cpp::supertonic::Precision::Q8_0; }  // "q8" is a shorthand
+    if (s == "f16" || s == "F16") { return tts_cpp::supertonic::Precision::F16; }
+    if (s == "f32" || s == "F32") { return tts_cpp::supertonic::Precision::F32; }
+    throw std::runtime_error("unknown --precision value: " + s + " (expected f32|f16|q8_0)");  // any other spelling
+}
+
 void write_wav(const std::string & path, const std::vector<float> & wav, int sr) {
     FILE * f = std::fopen(path.c_str(), "wb");
     if (!f) throw std::runtime_error("cannot open output wav: " + path);
@@ -72,6 +80,7 @@ int main(int argc, char ** argv) {
         else if (arg == "--n-gpu-layers") opts.n_gpu_layers = std::stoi(next("--n-gpu-layers"));
         else if (arg == "--f16-attn") opts.f16_attn = std::stoi(next("--f16-attn"));
         else if (arg == "--f16-weights") opts.f16_weights = std::stoi(next("--f16-weights"));
+        else if (arg == "--precision") opts.precision = parse_precision(next("--precision"));
         else if (arg == "--noise-npy") opts.noise_npy_path = next("--noise-npy");
         else if (arg == "-h" || arg == "--help") { usage(argv[0]); return 0; }
         else { fprintf(stderr, "unknown arg: %s\n", arg.c_str()); usage(argv[0]); return 2; }
diff --git a/tts-cpp/src/supertonic_duration.cpp b/tts-cpp/src/supertonic_duration.cpp
index 936b986065c..68825f68687 100644
--- a/tts-cpp/src/supertonic_duration.cpp
+++ b/tts-cpp/src/supertonic_duration.cpp
@@ -78,7 +78,14 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik
     if (!ggml_can_repeat(v, like)) {
         throw std::runtime_error("cannot repeat tensor in duration graph");
     }
-    return ggml_repeat(ctx, v, like);
+    // Every caller feeds this into ggml_add/ggml_mul which broadcast natively;
+    // skip the explicit ggml_repeat dispatch.
+    static const bool force_explicit_repeat =
+        std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr;
+    if (force_explicit_repeat) {
+        return ggml_repeat(ctx, v, like);
+    }
+    return v;
 }
 
 ggml_tensor * conv1d_f32(ggml_context * ctx,
@@ -87,6 +94,7 @@ ggml_tensor * conv1d_f32(ggml_context * ctx,
                          int stride,
                          int padding,
                          int dilation) {
+    // duration uses the pure-graph path unconditionally; no CPU fast path.
     ggml_tensor * im2col = ggml_im2col(ctx, kernel, input, stride, 0, padding, 0, dilation, 0, false, GGML_TYPE_F32);
     ggml_tensor * result = ggml_mul_mat(ctx,
         ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1]),
@@ -95,6 +103,15 @@ ggml_tensor * conv1d_f32(ggml_context * ctx,
 }
 
 ggml_tensor * edge_clamp_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left, int pad_right) {
+    if (pad_left == 0 && pad_right == 0) return x;
+    static const bool disable_fused_edge_pad =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr;
+    if (!disable_fused_edge_pad &&
+        x->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        ggml_is_contiguous(x)) {
+        return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, pad_right);
+    }
     const int64_t L = x->ne[0];
     const int64_t C = x->ne[1];
     ggml_tensor * out = x;
@@ -117,6 +134,16 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx,
                                   ggml_tensor * b,
                                   int dilation) {
     const int K = (int) w->ne[0];
+    static const bool disable_fused =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_DEPTHWISE") != nullptr;
+    if (!disable_fused && (K == 3 || K == 5) &&
+        x->type == GGML_TYPE_F32 && w->type == GGML_TYPE_F32 &&
+        b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 && w->ne[1] == 1 && w->ne[3] == 1 &&
+        w->ne[2] == x->ne[1] && b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(w) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_depthwise_1d(ctx, x, w, b, dilation);
+    }
     const int pad_left = ((K - 1) * dilation) / 2;
     const int pad_right = (K - 1) * dilation - pad_left;
     ggml_tensor * padded = edge_clamp_pad_1d(ctx, x, pad_left, pad_right);
@@ -128,6 +155,15 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx,
 }
 
 ggml_tensor * layer_norm_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * g, ggml_tensor * b) {
+    static const bool disable_fused_layer_norm =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr;
+    if (!disable_fused_layer_norm &&
+        x->type == GGML_TYPE_F32 && g->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        g->ne[0] == x->ne[1] && b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(g) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_layer_norm_channel(ctx, x, g, b, 1e-6f);
+    }
     ggml_tensor * xt = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
     xt = ggml_norm(ctx, xt, 1e-6f);
     xt = ggml_mul(ctx, xt, repeat_like(ctx, g, xt));
diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp
index 5007f83e839..b4be7f27ea0 100644
--- a/tts-cpp/src/supertonic_engine.cpp
+++ b/tts-cpp/src/supertonic_engine.cpp
@@ -122,9 +122,17 @@ struct Engine::Impl {
         if (!std::filesystem::exists(opts.model_gguf_path)) {
             throw std::runtime_error(supertonic_setup_hint(opts.model_gguf_path));
         }
+        // Map the public Precision enum onto the internal one (separate
+        // declaration so the engine header doesn't pull in internal.h).
+        supertonic_precision internal_precision = supertonic_precision::F32;
+        switch (opts.precision) {
+            case Precision::F32:  internal_precision = supertonic_precision::F32;  break;
+            case Precision::F16:  internal_precision = supertonic_precision::F16;  break;
+            case Precision::Q8_0: internal_precision = supertonic_precision::Q8_0; break;
+        }
         if (!load_supertonic_gguf(opts.model_gguf_path, model,
                                   opts.n_gpu_layers, /*verbose=*/false,
-                                  opts.f16_weights)) {
+                                  opts.f16_weights, internal_precision)) {
             throw std::runtime_error("Supertonic Engine: failed to load GGUF: " +
                                      opts.model_gguf_path);
         }
@@ -238,20 +246,24 @@ struct Engine::Impl {
 
         std::vector<float> latent_mask((size_t) latent_len, 1.0f);
 
-        std::vector<float> next;
-        for (int step = 0; step < steps; ++step) {
-            if (cancel_flag.load(std::memory_order_acquire)) {
-                throw std::runtime_error("Supertonic Engine: cancelled at vector step "
-                                         + std::to_string(step));
-            }
-            if (!supertonic_vector_step_ggml(model, latent.data(), latent_len,
-                                             text_emb.data(), (int) text_ids.size(),
-                                             style_ttl.data(), latent_mask.data(),
-                                             step, steps, next, &error)) {
-                throw std::runtime_error("Supertonic Engine: vector estimator failed: " + error);
-            }
-            latent.swap(next);
+        if (cancel_flag.load(std::memory_order_acquire)) {
+            throw std::runtime_error("Supertonic Engine: cancelled before vector estimator");
+        }
+        // Phase A1+A2: run all CFM steps as ONE ggml graph on non-CPU
+        // backends.  Latent flows step-to-step in GPU memory; on CPU this
+        // falls back to a per-step loop over `supertonic_vector_step_ggml`.
+        // Override via SUPERTONIC_DISABLE_LOOP_GRAPH=1.
+        // NOTE: cancellation granularity is now per-synth on the GPU path
+        // (worst-case cancel latency = whole CFM loop).  CPU is intended to keep per-step
+        // cancellation via the fallback — NOTE(review): cancel_flag is not passed below; confirm.
+        std::vector<float> final_latent;
+        if (!supertonic_vector_loop_ggml(model, latent.data(), latent_len,
+                                          text_emb.data(), (int) text_ids.size(),
+                                          style_ttl.data(), latent_mask.data(),
+                                          steps, final_latent, &error)) {
+            throw std::runtime_error("Supertonic Engine: vector estimator failed: " + error);
         }
+        latent = std::move(final_latent);
 
         if (cancel_flag.load(std::memory_order_acquire)) {
             throw std::runtime_error("Supertonic Engine: cancelled before vocoder");
diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp
index feec5ab7ff7..4f6dd4d5513 100644
--- a/tts-cpp/src/supertonic_gguf.cpp
+++ b/tts-cpp/src/supertonic_gguf.cpp
@@ -21,6 +21,7 @@
 #include <chrono>
 #include <cmath>
 #include <cstdio>
+#include <map>
 #include <cstdlib>
 #include <cstring>
 #include <mutex>
@@ -73,6 +74,89 @@ ggml_tensor * get_tensor_or_null(const supertonic_model & model, const std::stri
     return it == model.tensors.end() ? nullptr : it->second;
 }
 
+// Compute the storage type for a model tensor given the source type from
+// the GGUF and the engine's compute-precision selector.  Non-matmul tensors
+// (biases, norms, embeddings — stored as f32 in the GGUF) are unaffected;
+// only quantized matmul weights actually change destination type.
+//
+// Truth table for `target_supertonic_storage_type`, as applied to matmul-weight
+// names on a non-CPU backend (elsewhere every F16 / Q8_0 source becomes F32):
+//   precision \ src_type      | F32  | F16  | Q8_0
+//   --------------------------+------+------+------
+//   F32 (default)             | F32  | F32  | F32    <-- historical behaviour, unchanged
+//   F16  (Phase B1)           | F32  | F16  | F16
+//   Q8_0 (Phase A3)           | F32  | F32  | Q8_0   <-- key win: Metal keeps q8_0
+//
+// Name predicate: does `name` identify a matmul weight that is consumed as
+// src0 of `ggml_mul_mat(weight, activation)`, i.e. a tensor Metal can feed
+// straight into `kernel_mul_mm_q8_0_f32`?
+//
+// Currently only the vector_estimator's per-step matmul weights qualify:
+// they are routed through `dense_matmul_time_wt_pretransposed_ggml` (the
+// B2-partial helper), which passes the pretransposed weight as src0.
+//
+// The remaining q8_0 sources in the GGUF (text_encoder, duration,
+// speech-prompted attention) still use `dense_matmul_time_ggml`, whose
+// compute-time `ggml_cont(ggml_transpose(w))` has no q8_0 CONT kernel on
+// Metal and would crash.  Phase A3 follow-up: teach those call sites the
+// pretranspose-aware helper, then widen this predicate.
+bool is_supertonic_matmul_weight_name(const std::string & name) {
+    // Substring match, so the marker may appear anywhere inside the name.
+    const std::string::size_type marker_at = name.find("vector_estimator:onnx::MatMul_");
+    return marker_at != std::string::npos;
+}
+
+ggml_type target_supertonic_storage_type(const std::string & name,
+                                         enum ggml_type src_type,
+                                         supertonic_precision precision,
+                                         bool backend_is_cpu) {
+    // Sources that are neither Q8_0 nor F16 (biases, norms, scales, the
+    // unicode indexer i32 lookup, ...) bypass the precision selector and keep
+    // their GGUF type, so no dequant is attempted on types lacking a
+    // to_float trait.
+    const bool narrow_source =
+        (src_type == GGML_TYPE_F16) || (src_type == GGML_TYPE_Q8_0);
+    if (!narrow_source) {
+        return src_type;
+    }
+
+    // F32 precision: every narrow source is widened to f32.
+    if (precision == supertonic_precision::F32) {
+        return GGML_TYPE_F32;
+    }
+
+    // F16 and Q8_0 are asymmetric.  On CPU everything is dequantised to f32
+    // so cblas/AMX takes the weights (the CPU q8_0 path is NEON-only and not
+    // worth the parity drift).  On non-CPU backends the narrow type survives
+    // ONLY for true matmul-weight tensors, i.e. the ones consumed weight-as-
+    // src0 by `dense_matmul_time_wt_pretransposed_ggml`, where ggml-metal
+    // dispatches `kernel_mul_mm_f16_f32` / `kernel_mul_mm_q8_0_f32` directly.
+    // Other narrow GGUF tensors (relpos embeddings, conv1d kernels,
+    // per-channel scales used in plain ggml_mul) hit op patterns such as
+    // ggml_metal_op_bin that assert f32 on both srcs, so they are dequantised
+    // at load.
+    const bool keep_narrow =
+        !backend_is_cpu && is_supertonic_matmul_weight_name(name);
+    if (!keep_narrow) {
+        return GGML_TYPE_F32;
+    }
+
+    if (precision == supertonic_precision::F16) {
+        // Both F16 and Q8_0 sources end up stored as f16.
+        return GGML_TYPE_F16;
+    }
+    if (precision == supertonic_precision::Q8_0 && src_type == GGML_TYPE_Q8_0) {
+        return GGML_TYPE_Q8_0;
+    }
+    // Q8_0 precision with an F16 source, or an unrecognised selector value.
+    return GGML_TYPE_F32;
+}
+
+bool needs_supertonic_tensor_conversion(enum ggml_type src_type, enum ggml_type dst_type) {
+    // Identical storage types need no conversion pass; any mismatch does.
+    return !(src_type == dst_type);
+}
+
 bool should_expand_supertonic_tensor(enum ggml_type type) {
     return type == GGML_TYPE_F16 || type == GGML_TYPE_Q8_0;
 }
@@ -97,6 +181,54 @@ std::vector<float> expand_supertonic_tensor_to_f32(const ggml_tensor * src) {
     return out;
 }
 
+// Convert a GGUF tensor's payload into `out_buf`, encoded as `dst_type`.
+// `out_buf` is resized here, so callers may hand in an empty vector.
+// Handles F32 / F16 / Q8_0 in either direction, pivoting through f32 because
+// that is the only conversion surface ggml exports publicly.  Throws
+// std::runtime_error when ggml offers no conversion trait for a type.
+void convert_supertonic_tensor_data(const ggml_tensor * src,
+                                    enum ggml_type dst_type,
+                                    std::vector<uint8_t> & out_buf) {
+    const int64_t n = ggml_nelements(src);
+    const void * src_data = ggml_get_data(src);
+
+    if (src->type == dst_type) {  // same type on both sides: plain byte copy, no conversion
+        const size_t bytes = ggml_nbytes(src);
+        out_buf.resize(bytes);
+        std::memcpy(out_buf.data(), src_data, bytes);
+        return;
+    }
+
+    // Stage 1: source -> f32 pivot.
+    std::vector<float> f32_pivot((size_t) n);
+    if (src->type == GGML_TYPE_F32) {
+        // Already the pivot format: copy instead of requiring a to_float trait
+        // (ggml's F32 entry is believed to have none — confirm on the pinned fork).
+        std::memcpy(f32_pivot.data(), src_data, f32_pivot.size() * sizeof(float));
+    } else {
+        const ggml_type_traits * src_tr = ggml_get_type_traits(src->type);
+        if (!src_tr || !src_tr->to_float) {
+            throw std::runtime_error(std::string("Supertonic load: missing to_float for ") +
+                                     ggml_type_name(src->type));
+        }
+        src_tr->to_float(src_data, f32_pivot.data(), n);
+    }
+    if (dst_type == GGML_TYPE_F32) {  // the pivot already is the requested encoding
+        out_buf.resize(f32_pivot.size() * sizeof(float));
+        std::memcpy(out_buf.data(), f32_pivot.data(), out_buf.size());
+        return;
+    }
+
+    // Stage 2: f32 pivot -> dst_type through the CPU traits' from_float.
+    out_buf.resize(ggml_row_size(dst_type, n));
+    const ggml_type_traits_cpu * dst_tr = ggml_get_type_traits_cpu(dst_type);
+    if (!dst_tr || !dst_tr->from_float) {
+        throw std::runtime_error(std::string("Supertonic load: missing from_float for ") +
+                                 ggml_type_name(dst_type));
+    }
+    dst_tr->from_float(f32_pivot.data(), out_buf.data(), n);
+}
+
 ggml_backend_t init_supertonic_backend(int n_gpu_layers, bool verbose) {
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
@@ -497,6 +629,19 @@ ggml_tensor * require_source_tensor(const supertonic_model & model, const std::s
     return it->second;
 }
 
+ggml_tensor * try_source_tensor(const supertonic_model & model, const std::string & source_name) {
+    // Lookup that reports a miss as nullptr: the name is absent from model.source_tensors.
+    const auto hit = model.source_tensors.find(source_name);
+    return hit != model.source_tensors.end() ? hit->second : nullptr;
+}
+
+ggml_tensor * try_pretransposed_weight(const supertonic_model & model, const ggml_tensor * w) {
+    // A null key, or a key with no entry in model.pretransposed_weights, yields nullptr.
+    if (w == nullptr) { return nullptr; }
+    const auto hit = model.pretransposed_weights.find(w);
+    return hit == model.pretransposed_weights.end() ? nullptr : hit->second;
+}
+
 void supertonic_set_n_threads(supertonic_model & model, int n_threads) {
     configure_supertonic_blas_threads_once();
     if (n_threads <= 0) {
@@ -510,6 +655,38 @@ void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * grap
     if (ggml_backend_is_cpu(model.backend) && model.n_threads > 0) {
         ggml_backend_cpu_set_n_threads(model.backend, model.n_threads);
     }
+    static const bool count_dispatches = std::getenv("SUPERTONIC_COUNT_DISPATCHES") != nullptr;
+    static const bool dump_op_histogram = std::getenv("SUPERTONIC_DUMP_OP_HISTOGRAM") != nullptr;
+    if (dump_op_histogram) {
+        static thread_local int hist_call = 0;
+        ++hist_call;
+        const int n = ggml_graph_n_nodes(graph);
+        std::map<std::string, int> hist;
+        for (int i = 0; i < n; ++i) {
+            ggml_tensor * t = ggml_graph_node(graph, i);
+            hist[ggml_op_name(t->op)] += 1;
+        }
+        fprintf(stderr, "=== supertonic_graph_compute #%d op histogram (n_nodes=%d) ===\n", hist_call, n);
+        std::vector<std::pair<int, std::string>> sorted;
+        for (auto & kv : hist) sorted.emplace_back(kv.second, kv.first);
+        std::sort(sorted.rbegin(), sorted.rend());
+        for (auto & p : sorted) {
+            fprintf(stderr, "  %4d  %s\n", p.first, p.second.c_str());
+        }
+    }
+    if (count_dispatches) {
+        static thread_local int n_calls = 0;
+        static thread_local double total_us = 0.0;
+        ++n_calls;
+        const auto t0 = std::chrono::steady_clock::now();
+        ggml_backend_graph_compute(model.backend, graph);
+        const auto t1 = std::chrono::steady_clock::now();
+        const double us = std::chrono::duration<double, std::micro>(t1 - t0).count();
+        total_us += us;
+        fprintf(stderr, "supertonic_graph_compute #%d nodes=%d  wall=%.1fus  cumul=%.2fms\n",
+                n_calls, ggml_graph_n_nodes(graph), us, total_us / 1000.0);
+        return;
+    }
     ggml_backend_graph_compute(model.backend, graph);
 }
 
@@ -547,8 +724,22 @@ bool load_supertonic_gguf(const std::string & path,
                           supertonic_model & model,
                           int n_gpu_layers,
                           bool verbose,
-                          int f16_weights) {
+                          int f16_weights,
+                          supertonic_precision precision) {
     model.generation_id = next_supertonic_generation_id();
+    model.precision_id = static_cast<int>(precision);
+    // The load path supports F32 / F16 / Q8_0 destination types.
+    // - F32: fully wired.
+    // - Q8_0: storage on Metal only for `:onnx::MatMul_*` weights (the
+    //   optimised `kernel_mul_mm_q8_0_f32` dispatches via the swapped-
+    //   args `dense_matmul_time_wt_pretransposed_ggml` helper).  Other
+    //   tensors expand to f32.  On CPU everything expands to f32 so
+    //   cblas/AMX keeps the lead.
+    // - F16: same asymmetric scheme as Q8_0 — `:onnx::MatMul_*` weights
+    //   stay f16 on Metal (dispatches `kernel_mul_mm_f16_f32`), other
+    //   GGUF-f16 tensors (relpos embeddings, per-channel scales used in
+    //   plain `ggml_mul`) expand to f32 so they don't trip `ggml_metal_op_bin`'s
+    //   f32-only assertion.  Pretranspose pass covers f16 alongside f32/q8_0.
     ggml_context * tmp_ctx = nullptr;
     gguf_init_params gp = { /*.no_alloc=*/ false, /*.ctx=*/ &tmp_ctx };
     gguf_context * gguf_ctx = gguf_init_from_file(path.c_str(), gp);
@@ -651,58 +842,76 @@ bool load_supertonic_gguf(const std::string & path,
         // we use `uint16_t` storage to avoid a public-header dep on
         // ggml's f16 typedef.
         std::unordered_map<std::string, std::vector<uint16_t>>   f16_materialised_tensors;
+        // Tensors that need a Metal-specific type conversion (e.g.
+        // f32 → q8_0 for `--precision q8_0`) keep their converted
+        // bytes here, held alive until the backend upload loop runs.
+        std::unordered_map<std::string, std::vector<uint8_t>>    converted_tensors;
+
+        // Ensure the source-alias map is populated even when the
+        // Phase 2A `use_f16_weights` path didn't already build it —
+        // the precision-driven decision below also needs it to
+        // recognise `:onnx::MatMul_` sources for Metal asymmetric load.
+        if (tensor_to_source_for_alloc.empty()) {
+            int64_t id_tn = gguf_find_key(gguf_ctx, "supertonic.tensor_names");
+            int64_t id_sn = gguf_find_key(gguf_ctx, "supertonic.source_names");
+            if (id_tn >= 0 && id_sn >= 0) {
+                const size_t n_tn = gguf_get_arr_n(gguf_ctx, id_tn);
+                const size_t n_sn = gguf_get_arr_n(gguf_ctx, id_sn);
+                if (n_tn == n_sn) {
+                    for (size_t i = 0; i < n_tn; ++i) {
+                        tensor_to_source_for_alloc[gguf_get_arr_str(gguf_ctx, id_tn, i)] =
+                            gguf_get_arr_str(gguf_ctx, id_sn, i);
+                    }
+                }
+            }
+        }
 
         // Decide per-tensor destination type:
-        //  - F16 / Q8_0 sources: expand to F32 (legacy behaviour;
-        //    `should_expand_supertonic_tensor`).
-        //  - F32 sources on the F16-weights hot-path roster:
-        //    materialise as F16 (Phase 2A).
-        //  - Everything else: preserve the source type via dup.
+        //  1. F32 sources on the F16-weights hot-path roster +
+        //     `use_f16_weights` on → materialise as F16 (Phase 2A).
+        //  2. Else fall through to the precision-driven path:
+        //     `target_supertonic_storage_type` returns F32 / F16 / Q8_0
+        //     depending on `--precision` and whether the source name is
+        //     a `:onnx::MatMul_` weight on a non-CPU backend.
+        //  3. Anything else preserves the source type via dup.
         for (int64_t i = 0; i < num_tensors; ++i) {
             const char * name = gguf_get_tensor_name(gguf_ctx, i);
             ggml_tensor * src = ggml_get_tensor(tmp_ctx, name);
             if (!src) throw std::runtime_error(std::string("missing tmp tensor: ") + name);
 
-            // Phase 2A predicate check.  Only fires when
-            // `use_f16_weights` was on and the source resolved to
-            // a hot-roster name AND its current GGML type is
-            // either F32 or one of the expand-to-F32 types
-            // (otherwise the source already carries narrower
-            // precision than F16 and we don't widen).
+            auto src_it = tensor_to_source_for_alloc.find(name);
+            const std::string & decision_name =
+                (src_it != tensor_to_source_for_alloc.end()) ? src_it->second : std::string(name);
+
+            // Phase 2A predicate check (master).
             bool f16_materialise = false;
-            if (model.use_f16_weights) {
-                auto sit = tensor_to_source_for_alloc.find(name);
-                if (sit != tensor_to_source_for_alloc.end() &&
-                    should_materialise_f16_weight(sit->second) &&
-                    (src->type == GGML_TYPE_F32 ||
-                     should_expand_supertonic_tensor(src->type))) {
-                    f16_materialise = true;
-                }
+            if (model.use_f16_weights &&
+                should_materialise_f16_weight(decision_name) &&
+                (src->type == GGML_TYPE_F32 ||
+                 should_expand_supertonic_tensor(src->type))) {
+                f16_materialise = true;
             }
 
             ggml_type dst_type;
             if (f16_materialise) {
                 dst_type = GGML_TYPE_F16;
-            } else if (should_expand_supertonic_tensor(src->type)) {
-                dst_type = GGML_TYPE_F32;
             } else {
-                dst_type = src->type;
+                // Precision-driven path (ours): F32 / F16 / Q8_0 per
+                // the `--precision` flag.  Returns src->type unchanged
+                // for tensors that don't need conversion.
+                dst_type = target_supertonic_storage_type(
+                    decision_name, src->type, precision,
+                    /*backend_is_cpu=*/ ggml_backend_is_cpu(model.backend));
             }
 
-            ggml_tensor * dst = ggml_new_tensor(model.ctx_w, dst_type,
-                                                 ggml_n_dims(src), src->ne);
+            ggml_tensor * dst = (dst_type == src->type)
+                ? ggml_dup_tensor(model.ctx_w, src)
+                : ggml_new_tensor(model.ctx_w, dst_type, ggml_n_dims(src), src->ne);
             ggml_set_name(dst, name);
             model.tensors[name] = dst;
 
             if (f16_materialise) {
-                // Materialise F32 → F16 host-side.  When src was
-                // originally F16/Q8_0 we expand to F32 first via
-                // the existing helper, then convert back to F16
-                // — round-trip is lossless for the F16 case (the
-                // original 16-bit pattern is preserved) and a
-                // one-shot rounding loss for the Q8_0 case
-                // (acceptable; matches what Q4_0 + F16 down-quant
-                // does in chatterbox).
+                // Phase 2A F16 materialise path.
                 std::vector<float> src_f32;
                 if (should_expand_supertonic_tensor(src->type)) {
                     src_f32 = expand_supertonic_tensor_to_f32(src);
@@ -716,7 +925,13 @@ bool load_supertonic_gguf(const std::string & path,
                 ggml_fp32_to_fp16_row(src_f32.data(),
                                       reinterpret_cast<ggml_fp16_t *>(f16.data()),
                                       (int64_t) src_f32.size());
+            } else if (needs_supertonic_tensor_conversion(src->type, dst_type)) {
+                // Precision-driven conversion (ours).  Covers f32 → q8_0,
+                // q8_0 → f32, f16 → f32 etc.  Buffered here, uploaded later.
+                convert_supertonic_tensor_data(src, dst_type, converted_tensors[name]);
             } else if (should_expand_supertonic_tensor(src->type)) {
+                // Legacy fallback: f16/q8_0 src with f32 dst that
+                // didn't go through the conversion helper above.
                 expanded_f32_tensors[name] = expand_supertonic_tensor_to_f32(src);
             }
         }
@@ -779,16 +994,24 @@ bool load_supertonic_gguf(const std::string & path,
                 continue;
             }
             // Phase 2A: F16-materialised tensors take precedence over
-            // the F32 expansion path (they may have been promoted
-            // from either F32 or F16/Q8_0 sources).
+            // the precision-converted / F32-expanded paths (they may
+            // have been promoted from either F32 or F16/Q8_0 sources).
             auto f16_mat = f16_materialised_tensors.find(ggml_get_name(cur));
             if (f16_mat != f16_materialised_tensors.end()) {
                 ggml_backend_tensor_set(cur, f16_mat->second.data(), 0,
                                         f16_mat->second.size() * sizeof(uint16_t));
                 continue;
             }
-            auto expanded = expanded_f32_tensors.find(ggml_get_name(cur));
-            if (expanded != expanded_f32_tensors.end()) {
+            // Precision-driven conversion (`--precision q8_0`/f16 etc.) —
+            // bytes are already in dst-type representation.
+            auto converted = converted_tensors.find(ggml_get_name(cur));
+            if (converted != converted_tensors.end()) {
+                ggml_backend_tensor_set(cur, converted->second.data(), 0,
+                                        converted->second.size());
+            } else if (auto expanded = expanded_f32_tensors.find(ggml_get_name(cur));
+                       expanded != expanded_f32_tensors.end()) {
+                // Legacy f16/q8_0 → f32 expansion (used when the
+                // conversion helper didn't run).
                 ggml_backend_tensor_set(cur, expanded->second.data(), 0,
                                         expanded->second.size() * sizeof(float));
             } else {
@@ -802,14 +1025,21 @@ bool load_supertonic_gguf(const std::string & path,
             ggml_backend_tensor_get(unicode, model.unicode_indexer.data(), 0, ggml_nbytes(unicode));
         }
 
-        std::vector<std::string> tensor_names = get_string_array(gguf_ctx, "supertonic.tensor_names");
-        std::vector<std::string> source_names = get_string_array(gguf_ctx, "supertonic.source_names");
-        if (tensor_names.size() != source_names.size()) {
-            throw std::runtime_error("supertonic tensor/source metadata length mismatch");
-        }
-        for (size_t i = 0; i < tensor_names.size(); ++i) {
-            ggml_tensor * t = require_tensor(model, tensor_names[i]);
-            model.source_tensors[source_names[i]] = t;
+        // Populate the model's source_tensors lookup from the
+        // GGUF's `supertonic.tensor_names` / `supertonic.source_names`
+        // pair (the `tensor_to_source_for_alloc` map above only carries
+        // the same data for the pre-alloc decision; we re-read here so
+        // we don't have to widen its scope).
+        {
+            std::vector<std::string> tensor_names = get_string_array(gguf_ctx, "supertonic.tensor_names");
+            std::vector<std::string> source_names = get_string_array(gguf_ctx, "supertonic.source_names");
+            if (tensor_names.size() != source_names.size()) {
+                throw std::runtime_error("supertonic.tensor_names / source_names length mismatch");
+            }
+            for (size_t i = 0; i < tensor_names.size(); ++i) {
+                ggml_tensor * t = require_tensor(model, tensor_names[i]);
+                model.source_tensors[source_names[i]] = t;
+            }
         }
 
         for (const std::string & voice_name : get_string_array(gguf_ctx, "supertonic.voice_names")) {
@@ -983,6 +1213,128 @@ bool load_supertonic_gguf(const std::string & path,
                                         0, ggml_nbytes(it->second));
             }
         }
+
+        // Materialize pre-transposed copies of matmul weights to drop the
+        // runtime `cont(transpose(w))` dispatch that `dense_matmul_time_ggml`
+        // emits on every graph compute (~32 sites × 5 CFM steps per synth).
+        // CPU's `cblas_sgemm` already handles the transpose via its `Trans`
+        // flag, so this is a Metal-perf-only optimization — skip the extra
+        // memory + load-time cost on CPU.  Override via
+        // `SUPERTONIC_DISABLE_WEIGHT_PRETRANSPOSE=1` to debug the unpacked
+        // path.
+        //
+        // Coexists with the F6 pre-transposed t_proj pass above: that one
+        // handles 4 specific `[512, 64]` `t_proj` weights and registers
+        // them under the `__T` suffix; this one handles every other
+        // `:onnx::MatMul_` weight under the `:T` suffix.  No collisions.
+        static const bool disable_pretranspose =
+            std::getenv("SUPERTONIC_DISABLE_WEIGHT_PRETRANSPOSE") != nullptr;
+        if (!disable_pretranspose && model.backend &&
+            !ggml_backend_is_cpu(model.backend)) {
+            std::vector<std::pair<std::string, ggml_tensor *>> to_pretranspose;
+            for (const auto & [src_name, t] : model.source_tensors) {
+                if (!t) continue;
+                if (src_name.find(":onnx::MatMul_") == std::string::npos) continue;
+                if (ggml_n_dims(t) != 2) continue;
+                // Pretranspose f32 weights (default precision) AND q8_0 / f16
+                // weights (asymmetric load modes).  For q8_0 / f16 we
+                // dequant→transpose→requantize through f32; the round-trip
+                // introduces tiny rounding within the type's existing noise
+                // tolerance.  This is what unlocks A3 step 2
+                // (kernel_mul_mm_q8_0_f32 / kernel_mul_mm_f16_f32 dispatches
+                // when both (a) the pretransposed weight is available as
+                // src0 and (b) the new dense_matmul_time_wt_pretransposed_ggml
+                // swaps the mul_mat args so the weight is src0).
+                if (t->type != GGML_TYPE_F32 &&
+                    t->type != GGML_TYPE_F16  &&
+                    t->type != GGML_TYPE_Q8_0) continue;
+                to_pretranspose.push_back({src_name, t});
+            }
+            if (!to_pretranspose.empty()) {
+                ggml_init_params extra_params = {
+                    /*.mem_size=*/ ggml_tensor_overhead() * to_pretranspose.size(),
+                    /*.mem_buffer=*/ nullptr,
+                    /*.no_alloc=*/ true,
+                };
+                model.ctx_w_extra = ggml_init(extra_params);
+                if (!model.ctx_w_extra) {
+                    throw std::runtime_error("ggml_init ctx_w_extra failed");
+                }
+                std::vector<std::pair<ggml_tensor *, ggml_tensor *>> orig_to_pre;
+                orig_to_pre.reserve(to_pretranspose.size());
+                for (const auto & [src_name, t] : to_pretranspose) {
+                    // Pre tensor has same type as orig (f32 stays f32,
+                    // q8_0 stays q8_0); only the shape swaps.
+                    ggml_tensor * tt = ggml_new_tensor_2d(model.ctx_w_extra,
+                        t->type, t->ne[1], t->ne[0]);
+                    const std::string tt_name = std::string(ggml_get_name(t)) + ":T";
+                    ggml_set_name(tt, tt_name.c_str());
+                    model.source_tensors[src_name + ":T"] = tt;
+                    orig_to_pre.push_back({t, tt});
+                }
+                model.buffer_w_extra =
+                    ggml_backend_alloc_ctx_tensors(model.ctx_w_extra, model.backend);
+                if (!model.buffer_w_extra) {
+                    throw std::runtime_error(
+                        "ggml_backend_alloc_ctx_tensors ctx_w_extra failed");
+                }
+                // Upload the transposed data.  For f32 weights this is a
+                // straight host-side reorder.  For q8_0 / f16 weights we
+                // convert to f32 via to_float, transpose in f32, then convert
+                // back via from_float into the pretransposed tensor of the
+                // same type.  Both directions use the public ggml type-traits APIs.
+                for (const auto & [orig, pre] : orig_to_pre) {
+                    const int OC = (int) orig->ne[0];
+                    const int IC = (int) orig->ne[1];
+                    const size_t n = (size_t) OC * IC;
+
+                    // Step 1: download `orig` data, dequantize to f32 if needed.
+                    std::vector<float> host_orig_f32(n);
+                    if (orig->type == GGML_TYPE_F32) {
+                        ggml_backend_tensor_get(orig, host_orig_f32.data(), 0,
+                                                n * sizeof(float));
+                    } else {
+                        std::vector<uint8_t> raw(ggml_nbytes(orig));
+                        ggml_backend_tensor_get(orig, raw.data(), 0, raw.size());
+                        const ggml_type_traits * tr = ggml_get_type_traits(orig->type);
+                        if (!tr || !tr->to_float) {
+                            throw std::runtime_error(
+                                std::string("pretranspose: missing to_float for ") +
+                                ggml_type_name(orig->type));
+                        }
+                        tr->to_float(raw.data(), host_orig_f32.data(), (int64_t) n);
+                    }
+
+                    // Step 2: transpose in f32.
+                    std::vector<float> host_pre_f32(n);
+                    for (int oc = 0; oc < OC; ++oc) {
+                        for (int ic = 0; ic < IC; ++ic) {
+                            host_pre_f32[(size_t) ic + (size_t) oc * IC] =
+                                host_orig_f32[(size_t) oc + (size_t) ic * OC];
+                        }
+                    }
+
+                    // Step 3: upload (requantizing if needed).
+                    if (pre->type == GGML_TYPE_F32) {
+                        ggml_backend_tensor_set(pre, host_pre_f32.data(), 0,
+                                                n * sizeof(float));
+                    } else {
+                        const size_t dst_bytes = ggml_row_size(pre->type, n);
+                        std::vector<uint8_t> raw(dst_bytes);
+                        const ggml_type_traits_cpu * dtr =
+                            ggml_get_type_traits_cpu(pre->type);
+                        if (!dtr || !dtr->from_float) {
+                            throw std::runtime_error(
+                                std::string("pretranspose: missing from_float for ") +
+                                ggml_type_name(pre->type));
+                        }
+                        dtr->from_float(host_pre_f32.data(), raw.data(), (int64_t) n);
+                        ggml_backend_tensor_set(pre, raw.data(), 0, raw.size());
+                    }
+                    model.pretransposed_weights[orig] = pre;
+                }
+            }
+        }
     } catch (const std::exception & e) {
         fprintf(stderr, "load_supertonic_gguf: %s\n", e.what());
         gguf_free(gguf_ctx);
@@ -1009,6 +1361,10 @@ void free_supertonic_model(supertonic_model & model) {
     if (model.generation_id != 0) {
         unregister_supertonic_alive(model.generation_id);
     }
+    if (model.buffer_w_extra) {
+        ggml_backend_buffer_free(model.buffer_w_extra);
+        model.buffer_w_extra = nullptr;
+    }
     if (model.buffer_w) {
         ggml_backend_buffer_free(model.buffer_w);
         model.buffer_w = nullptr;
@@ -1017,10 +1373,15 @@ void free_supertonic_model(supertonic_model & model) {
         ggml_backend_free(model.backend);
         model.backend = nullptr;
     }
+    if (model.ctx_w_extra) {
+        ggml_free(model.ctx_w_extra);
+        model.ctx_w_extra = nullptr;
+    }
     if (model.ctx_w) {
         ggml_free(model.ctx_w);
         model.ctx_w = nullptr;
     }
+    model.pretransposed_weights.clear();
     model.tensors.clear();
     model.source_tensors.clear();
     model.vocoder = {};
diff --git a/tts-cpp/src/supertonic_internal.h b/tts-cpp/src/supertonic_internal.h
index 97ae58a3813..d18e84ec131 100644
--- a/tts-cpp/src/supertonic_internal.h
+++ b/tts-cpp/src/supertonic_internal.h
@@ -9,6 +9,7 @@
 #include <vector>
 
 #include "ggml-backend.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 namespace tts_cpp::supertonic::detail {
@@ -131,10 +132,29 @@ struct supertonic_model {
     // Override via `EngineOptions::f16_weights` / `--f16-weights`.
     bool use_f16_weights = false;
 
+    // The compute precision the model was loaded with — set by
+    // `load_supertonic_gguf`.  Lets graph builders dispatch precision-
+    // specific code paths (e.g. asymmetric q8_0 load on Metal).
+    // Orthogonal to `use_f16_weights` above (that's a per-op runtime
+    // selector for the OpenCL hot-weight materialisation; this is the
+    // global storage-type selector).
+    int precision_id = 0; // supertonic_precision::F32
+
     std::map<std::string, ggml_tensor *> tensors;
     std::unordered_map<std::string, ggml_tensor *> source_tensors;
     std::unordered_map<std::string, supertonic_voice_style> voices;
 
+    // Pre-transposed copies of matmul weights, materialized at load time
+    // to eliminate the per-call `cont(transpose(w))` dispatch that
+    // `dense_matmul_time_ggml` issues on every graph compute.  Keyed by
+    // the ORIGINAL weight tensor pointer (i.e. the value in
+    // `source_tensors[<MatMul_*>]`); the mapped value is the transposed
+    // copy (same type as the original: f32 / f16 / q8_0), `ne = [IC, OC]`,
+    // in `ctx_w_extra` / `buffer_w_extra`.  Lookup via `try_pretransposed_weight(model, w)`.
+    ggml_context * ctx_w_extra = nullptr;
+    ggml_backend_buffer_t buffer_w_extra = nullptr;
+    std::unordered_map<const ggml_tensor *, ggml_tensor *> pretransposed_weights;
+
     std::vector<int32_t> unicode_indexer;
     std::vector<std::string> languages;
     std::string tts_json;
@@ -217,17 +237,48 @@ struct supertonic_model {
 //        regardless of backend).
 // See Phase 2A in `aiDocs/PLAN_SUPERTONIC_OPENCL.md` for the
 // roster + auto-policy rationale.
+//
+// `precision` (separate concern): selects the storage type for
+// matmul weights at GGUF load time.  Mirrors the public
+// `tts_cpp::supertonic::Precision` enum.  F32 is the historical
+// default; Q8_0 / F16 trigger asymmetric loads on Metal.
+enum class supertonic_precision {
+    F32 = 0,
+    F16 = 1,
+    Q8_0 = 2,
+};
+
 bool load_supertonic_gguf(const std::string & path,
                           supertonic_model & model,
                           int n_gpu_layers = 0,
                           bool verbose = false,
-                          int f16_weights = -1);
+                          int f16_weights = -1,
+                          supertonic_precision precision = supertonic_precision::F32);
 void free_supertonic_model(supertonic_model & model);
 void supertonic_set_n_threads(supertonic_model & model, int n_threads);
 void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph);
 
+// True when the model's compute backend supports the per-stage CPU fast paths
+// (the `ggml_custom_4d` callbacks in conv1d_f32 / depthwise_same_ggml /
+// layer_norm_ggml etc.).  ggml custom ops are CPU-only by design; on Metal /
+// CUDA / Vulkan the helpers must fall through to their stock-ggml-op paths.
+// Mirrors the `!ggml_backend_is_cpu(backend)` idiom Chatterbox uses to gate
+// its Metal-only batched-CFG path.
+inline bool model_prefers_cpu_kernels(const supertonic_model & model) {
+    return model.backend == nullptr || ggml_backend_is_cpu(model.backend);
+}
+
 ggml_tensor * require_tensor(const supertonic_model & model, const std::string & name);
 ggml_tensor * require_source_tensor(const supertonic_model & model, const std::string & source_name);
+ggml_tensor * try_source_tensor(const supertonic_model & model, const std::string & source_name);
+
+// Look up a pre-transposed copy of a matmul weight.  Returns nullptr if no
+// pre-transposed copy was materialized for `w` at load time (e.g. CPU backend
+// — pre-transposition is a Metal-perf-only optimization).  When non-null, the
+// returned tensor has `ne = [IC, OC]` (the swapped layout of `w`), keeps
+// `w`'s type, and is contiguous in `model.buffer_w_extra`.  Reshape it as the
+// conv1d kernel `[K=1, IC, OC]` directly and skip the cont(transpose(w)).
+ggml_tensor * try_pretransposed_weight(const supertonic_model & model, const ggml_tensor * w);
 
 std::string supertonic_preprocess_text(const std::string & text,
                                        const std::string & language,
@@ -401,6 +452,24 @@ void supertonic_profile_csv_record(const char * stage, const char * island,
 void supertonic_profile_csv_flush();
 void supertonic_profile_csv_set_path(const char * path);
 
+// Phase A1+A2 (Metal): run ALL `total_steps` CFM denoising steps inside
+// ONE ggml_cgraph, dispatched with a single ggml_backend_graph_compute
+// call.  On non-CPU backends this replaces the engine's per-step loop
+// entirely (latent stays in GPU memory step-to-step, no host round-trip).
+// On CPU it falls back to a per-step loop over `supertonic_vector_step_ggml`
+// so the cblas fastpaths still apply.  Override the GPU path with
+// SUPERTONIC_DISABLE_LOOP_GRAPH=1 to A/B against the per-step path.
+bool supertonic_vector_loop_ggml(const supertonic_model & model,
+                                  const float * initial_noisy_latent,
+                                  int latent_len,
+                                  const float * text_emb,
+                                  int text_len,
+                                  const float * style_ttl,
+                                  const float * latent_mask,
+                                  int total_steps,
+                                  std::vector<float> & final_latent_out,
+                                  std::string * error = nullptr);
+
 bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
                                        const float * noisy_latent,
                                        const float * text_emb,
diff --git a/tts-cpp/src/supertonic_text_encoder.cpp b/tts-cpp/src/supertonic_text_encoder.cpp
index 80ee1f44f87..2ea17f4bd93 100644
--- a/tts-cpp/src/supertonic_text_encoder.cpp
+++ b/tts-cpp/src/supertonic_text_encoder.cpp
@@ -116,7 +116,14 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik
         else if (like->ne[1] == v->ne[0]) v = ggml_reshape_2d(ctx, v, 1, v->ne[0]);
     }
     if (!ggml_can_repeat(v, like)) throw std::runtime_error("cannot repeat tensor in text encoder graph");
-    return ggml_repeat(ctx, v, like);
+    // Every caller feeds this into ggml_add/ggml_mul which broadcast natively;
+    // skip the explicit ggml_repeat dispatch.
+    static const bool force_explicit_repeat =
+        std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr;
+    if (force_explicit_repeat) {
+        return ggml_repeat(ctx, v, like);
+    }
+    return v;
 }
 
 ggml_tensor * conv1d_f32(ggml_context * ctx,
@@ -125,6 +132,8 @@ ggml_tensor * conv1d_f32(ggml_context * ctx,
                          int stride,
                          int padding,
                          int dilation) {
+    // text_encoder uses the pure-graph path unconditionally; no CPU fast path
+    // here so no use_cpu_fastpath plumbing.
     ggml_tensor * im2col = ggml_im2col(ctx, kernel, input, stride, 0, padding, 0, dilation, 0, false, GGML_TYPE_F32);
     ggml_tensor * result = ggml_mul_mat(ctx,
         ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1]),
@@ -133,6 +142,15 @@ ggml_tensor * conv1d_f32(ggml_context * ctx,
 }
 
 ggml_tensor * edge_clamp_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left, int pad_right) {
+    if (pad_left == 0 && pad_right == 0) return x;
+    static const bool disable_fused_edge_pad =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr;
+    if (!disable_fused_edge_pad &&
+        x->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        ggml_is_contiguous(x)) {
+        return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, pad_right);
+    }
     const int64_t L = x->ne[0], C = x->ne[1];
     ggml_tensor * out = x;
     if (pad_left > 0) {
@@ -151,6 +169,16 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx,
                                   ggml_tensor * w,
                                   ggml_tensor * b) {
     const int K = (int)w->ne[0];
+    static const bool disable_fused =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_DEPTHWISE") != nullptr;
+    if (!disable_fused && (K == 3 || K == 5) &&
+        x->type == GGML_TYPE_F32 && w->type == GGML_TYPE_F32 &&
+        b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 && w->ne[1] == 1 && w->ne[3] == 1 &&
+        w->ne[2] == x->ne[1] && b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(w) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_depthwise_1d(ctx, x, w, b, 1);
+    }
     const int pad_left = (K - 1) / 2;
     const int pad_right = (K - 1) - pad_left;
     ggml_tensor * padded = edge_clamp_pad_1d(ctx, x, pad_left, pad_right);
@@ -162,6 +190,15 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx,
 }
 
 ggml_tensor * layer_norm_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * g, ggml_tensor * b) {
+    static const bool disable_fused_layer_norm =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr;
+    if (!disable_fused_layer_norm &&
+        x->type == GGML_TYPE_F32 && g->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        g->ne[0] == x->ne[1] && b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(g) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_layer_norm_channel(ctx, x, g, b, 1e-6f);
+    }
     ggml_tensor * xt = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
     xt = ggml_norm(ctx, xt, 1e-6f);
     xt = ggml_mul(ctx, xt, repeat_like(ctx, g, xt));
@@ -683,6 +720,9 @@ void speech_prompted_attention(const supertonic_model & m, int idx,
     dense_time_matmul(merged, L, C, out_w, out_b, C, out_lc);
 }
 
+// `speech_attention_cache` + `build_speech_attention_cache` own the
+// second of the two cached graphs `speech_prompted_attention_ggml` runs
+// (flash-attn + out-proj after host-side q/k/v_pack work).
 struct speech_attention_cache {
     const supertonic_model * model = nullptr;
     uint64_t generation_id = 0;
@@ -700,19 +740,19 @@ struct speech_attention_cache {
     ggml_tensor * v = nullptr;
 };
 
-void free_speech_attention_cache(speech_attention_cache & cache) {
+inline void free_speech_attention_cache(speech_attention_cache & cache) {
     supertonic_safe_gallocr_free(cache.allocr, cache.generation_id);
     if (cache.ctx) ggml_free(cache.ctx);
     cache = {};
 }
 
-void build_speech_attention_cache(speech_attention_cache & cache,
-                                  const supertonic_model & m,
-                                  int idx,
-                                  int L,
-                                  int Lctx,
-                                  const std::string & out_w_source,
-                                  const std::string & out_b_source) {
+inline void build_speech_attention_cache(speech_attention_cache & cache,
+                                         const supertonic_model & m,
+                                         int idx,
+                                         int L,
+                                         int Lctx,
+                                         const std::string & out_w_source,
+                                         const std::string & out_b_source) {
     free_speech_attention_cache(cache);
     cache.model = &m;
     cache.generation_id = m.generation_id;
@@ -748,6 +788,123 @@ void build_speech_attention_cache(speech_attention_cache & cache,
     ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
 }
 
+// Phase A4: speech_prompted_attention as ONE merged ggml graph.
+//
+// Pre-A4 this function built two separate graphs (QKV proj, then
+// flash-attn+out-proj) with host-side q_pack/v_pack/k_pack head-split
+// work between them.  The merged version does the head-split in-graph
+// via reshape + permute + cont (or relies on ggml's view semantics
+// where it's free), feeds straight into flash_attn, and runs the out
+// projection — all in one `ggml_backend_graph_compute` call.
+//
+// Per call savings: 1 graph dispatch (one fewer command buffer) +
+// host-side pack work (3 round-trips of q/v/k_pack data eliminated).
+// Two calls per synth = 2 dispatches saved.
+struct speech_prompted_merged_cache {
+    const supertonic_model * model = nullptr;
+    uint64_t generation_id = 0;
+    int idx = -1;
+    int L = 0;
+    int Lctx = 0;
+    std::string out_w_source;
+    std::string out_b_source;
+    std::vector<uint8_t> buf;
+    ggml_context * ctx = nullptr;
+    ggml_cgraph * gf = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    ggml_tensor * x_in = nullptr;       // [L, C]
+    ggml_tensor * style_in = nullptr;   // [Lctx, C]
+    ggml_tensor * out = nullptr;        // [L, C] result
+};
+
+void free_speech_prompted_merged_cache(speech_prompted_merged_cache & cache) {
+    supertonic_safe_gallocr_free(cache.allocr, cache.generation_id);
+    if (cache.ctx) ggml_free(cache.ctx);
+    cache = {};
+}
+
+void build_speech_prompted_merged_cache(speech_prompted_merged_cache & cache,
+                                        const supertonic_model & m,
+                                        int idx,
+                                        int L,
+                                        int Lctx,
+                                        const std::string & q_w_source,
+                                        const std::string & v_w_source,
+                                        const std::string & out_w_source,
+                                        const std::string & out_b_source,
+                                        const std::string & tanh_k_source,
+                                        const std::string & q_b_source,
+                                        const std::string & v_b_source) {
+    const int C = 256;
+    const int half = 128;
+    const int H = 2;
+    (void)H;
+    free_speech_prompted_merged_cache(cache);
+    cache.model = &m;
+    cache.generation_id = m.generation_id;
+    cache.idx = idx;
+    cache.L = L;
+    cache.Lctx = Lctx;
+    cache.out_w_source = out_w_source;
+    cache.out_b_source = out_b_source;
+
+    constexpr int NODES = 512;
+    const size_t buf_size = ggml_tensor_overhead() * NODES + ggml_graph_overhead_custom(NODES, false);
+    cache.buf.assign(buf_size, 0);
+    ggml_init_params gp = { buf_size, cache.buf.data(), true };
+    cache.ctx = ggml_init(gp);
+    cache.gf = ggml_new_graph_custom(cache.ctx, NODES, false);
+
+    cache.x_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, C);
+    ggml_set_name(cache.x_in, "spm_x_in"); ggml_set_input(cache.x_in);
+    cache.style_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, Lctx, C);
+    ggml_set_name(cache.style_in, "spm_style_in"); ggml_set_input(cache.style_in);
+
+    // Q proj.  Output ne=[L, C].  Head-split: reshape to [L, half, H]
+    // then permute(1, 0, 2, 3) → cont gives [half, L, H] — the layout
+    // flash_attn views as [head_dim, q_len, n_heads].
+    ggml_tensor * q_tc = dense_matmul_time_ggml(cache.ctx, cache.x_in,
+        require_source_tensor(m, q_w_source),
+        require_source_tensor(m, q_b_source));
+    ggml_tensor * q_3d = ggml_reshape_3d(cache.ctx, q_tc, L, half, 2);
+    ggml_tensor * q_dlh = ggml_cont(cache.ctx, ggml_permute(cache.ctx, q_3d, 1, 0, 2, 3));
+
+    // V proj on style.  Same head-split into [half, Lctx, H].
+    ggml_tensor * v_tc = dense_matmul_time_ggml(cache.ctx, cache.style_in,
+        require_source_tensor(m, v_w_source),
+        require_source_tensor(m, v_b_source));
+    ggml_tensor * v_3d = ggml_reshape_3d(cache.ctx, v_tc, Lctx, half, 2);
+    ggml_tensor * v_dlh = ggml_cont(cache.ctx, ggml_permute(cache.ctx, v_3d, 1, 0, 2, 3));
+
+    // K is the precomputed tanh_k model tensor.  Stored as ne=[Lctx, C].
+    // Same head-split: reshape to [Lctx, half, H] then permute to
+    // [half, Lctx, H] and cont.  No per-call host work needed since
+    // K is constant per model.
+    ggml_tensor * k_orig = require_source_tensor(m, tanh_k_source);
+    ggml_tensor * k_3d = ggml_reshape_3d(cache.ctx, k_orig, Lctx, half, 2);
+    ggml_tensor * k_dlh = ggml_cont(cache.ctx, ggml_permute(cache.ctx, k_3d, 1, 0, 2, 3));
+
+    // Flash attention.  Same call shape as the pre-A4 path.
+    ggml_tensor * attn = ggml_flash_attn_ext(cache.ctx, q_dlh, k_dlh, v_dlh,
+                                              nullptr, 1.0f / 16.0f, 0.0f, 0.0f);
+    attn = ggml_reshape_2d(cache.ctx, attn, C, L);
+    ggml_tensor * ctx_tc = ggml_cont(cache.ctx, ggml_transpose(cache.ctx, attn));
+
+    // Output projection.
+    cache.out = dense_matmul_time_ggml(cache.ctx, ctx_tc,
+        require_source_tensor(m, out_w_source),
+        require_source_tensor(m, out_b_source));
+    ggml_set_name(cache.out, "spm_out"); ggml_set_output(cache.out);
+    ggml_build_forward_expand(cache.gf, cache.out);
+
+    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m.backend));
+    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new speech_prompted_merged failed");
+    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
+        throw std::runtime_error("ggml_gallocr_reserve speech_prompted_merged failed");
+    }
+    ggml_gallocr_alloc_graph(cache.allocr, cache.gf);
+}
+
 // F14 — cached speech-prompted attention QKV graph.
 //
 // Pre-audit, `speech_prompted_attention_ggml` allocated a fresh
@@ -787,6 +944,8 @@ void speech_prompted_attention_ggml(const supertonic_model & m, int idx,
     const std::string q_w = "text_encoder:" + std::string(idx == 0 ? "onnx::MatMul_3678" : "onnx::MatMul_3682");
     const std::string v_w = "text_encoder:" + std::string(idx == 0 ? "onnx::MatMul_3680" : "onnx::MatMul_3684");
     const std::string o_w = "text_encoder:" + std::string(idx == 0 ? "onnx::MatMul_3681" : "onnx::MatMul_3685");
+    const std::string tanh_k_src = "text_encoder:/speech_prompted_text_encoder/attention" + std::to_string(attn_num) + "/tanh/Tanh_output_0";
+    (void) tanh_k_src; // master's path uses model.speech_tanh_k_cache; tanh_k_src kept for symbolic parity with read_f32 fallback below.
 
     // F14: per-(model, idx, L) cached QKV graph.  Two thread-local
     // slots so the two speech-prompted layers don't fight over a
@@ -879,8 +1038,9 @@ void speech_prompted_attention_ggml(const supertonic_model & m, int idx,
     speech_attention_cache & cache = caches[idx];
     if (cache.model != &m || cache.generation_id != m.generation_id ||
         cache.idx != idx || cache.L != L || cache.Lctx != Lctx ||
-        cache.out_w_source != o_w || cache.out_b_source != p + ".out_fc.linear.bias") {
-        build_speech_attention_cache(cache, m, idx, L, Lctx, o_w, p + ".out_fc.linear.bias");
+        cache.out_w_source != o_w) {
+        build_speech_attention_cache(cache, m, idx, L, Lctx, o_w,
+                                      p + ".out_fc.linear.bias");
     }
     ggml_backend_tensor_set(cache.q, q_pack.data(), 0, q_pack.size()*sizeof(float));
     ggml_backend_tensor_set(cache.k, k_pack.data(), 0, k_pack.size()*sizeof(float));
diff --git a/tts-cpp/src/supertonic_vector_estimator.cpp b/tts-cpp/src/supertonic_vector_estimator.cpp
index 597957a06f3..42e12f5ff01 100644
--- a/tts-cpp/src/supertonic_vector_estimator.cpp
+++ b/tts-cpp/src/supertonic_vector_estimator.cpp
@@ -155,7 +155,18 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik
             std::to_string(like->ne[0]) + "," + std::to_string(like->ne[1]) + "," +
             std::to_string(like->ne[2]) + "," + std::to_string(like->ne[3]) + "]");
     }
-    return ggml_repeat(ctx, v, like);
+    // Every call site in this file feeds the return value straight into
+    // ggml_add / ggml_mul, both of which broadcast natively in ggml.  Skip
+    // the explicit ggml_repeat node so the downstream op handles the
+    // broadcast — saves ~282 REPEAT ops per consolidated per-step graph.
+    // Override with SUPERTONIC_FORCE_EXPLICIT_REPEAT=1 if this regresses
+    // on a backend that doesn't broadcast (none observed today).
+    static const bool force_explicit_repeat =
+        std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr;
+    if (force_explicit_repeat) {
+        return ggml_repeat(ctx, v, like);
+    }
+    return v;
 }
 
 ggml_tensor * conv1d_f32(ggml_context * ctx,
@@ -217,6 +228,19 @@ ggml_tensor * conv1d_f32(ggml_context * ctx,
 }
 
 ggml_tensor * edge_clamp_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left, int pad_right) {
+    if (pad_left == 0 && pad_right == 0) return x;
+    // Fused fast path via supertonic_edge_pad_1d.  Same kernel handles
+    // both sides; the legacy view + repeat_4d + concat chain (2 ops
+    // per side) becomes 1 dispatch total.  Override:
+    // SUPERTONIC_DISABLE_FUSED_EDGE_PAD=1.
+    static const bool disable_fused_edge_pad =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr;
+    if (!disable_fused_edge_pad &&
+        x->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        ggml_is_contiguous(x)) {
+        return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, pad_right);
+    }
     const int64_t L = x->ne[0];
     const int64_t C = x->ne[1];
     ggml_tensor * out = x;
@@ -337,6 +361,23 @@ ggml_tensor * depthwise_same_ggml(ggml_context * ctx,
         return custom;
     }
     const int K = (int) w->ne[0];
+    // Fused-op fast path (any backend that registers GGML_OP_SUPERTONIC_DEPTHWISE_1D
+    // — Metal does via the local ggml port overlay; CPU's
+    // ggml_compute_forward_supertonic_depthwise_1d is the parity backstop).
+    // Replaces the edge_clamp_pad + im2col + mul_mat + add chain with one
+    // dispatch.  Currently supports K in {3, 5}; the existing graph path is
+    // the fallback for K outside that set.  Override with
+    // SUPERTONIC_DISABLE_FUSED_DEPTHWISE=1 to force the stock-op chain.
+    static const bool disable_fused =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_DEPTHWISE") != nullptr;
+    if (!disable_fused && (K == 3 || K == 5) &&
+        x->type == GGML_TYPE_F32 && w->type == GGML_TYPE_F32 &&
+        b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 && w->ne[1] == 1 && w->ne[3] == 1 &&
+        w->ne[2] == x->ne[1] && b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(w) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_depthwise_1d(ctx, x, w, b, dilation);
+    }
     const int pad_left = ((K - 1) * dilation) / 2;
     const int pad_right = (K - 1) * dilation - pad_left;
     ggml_tensor * padded = edge_clamp_pad_1d(ctx, x, pad_left, pad_right);
@@ -351,6 +392,19 @@ ggml_tensor * layer_norm_ggml(ggml_context * ctx,
                               ggml_tensor * x,
                               ggml_tensor * g,
                               ggml_tensor * b) {
+    // Fused-op fast path on non-CPU backends (Metal/Vulkan/CUDA/OpenCL):
+    // GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL collapses the
+    // permute + cont + ggml_norm + mul + add + permute + cont chain into
+    // a single dispatch.  Override with SUPERTONIC_DISABLE_FUSED_LAYER_NORM=1.
+    static const bool disable_fused_layer_norm =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr;
+    if (!supertonic_use_cpu_custom_ops() && !disable_fused_layer_norm &&
+        x->type == GGML_TYPE_F32 && g->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        g->ne[0] == x->ne[1] && b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(g) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_layer_norm_channel(ctx, x, g, b, 1e-6f);
+    }
     // CPU-only direct row-wise layer-norm; falls through to permute +
     // ggml_norm on non-CPU backends so the graph stays GPU-executable.
     if (supertonic_use_cpu_custom_ops() &&
@@ -465,6 +519,13 @@ ggml_tensor * dense_matmul_time_ggml(ggml_context * ctx,
     // tensors are loaded as ne=[OC, IC].  Make that transpose contiguous, then
     // view it as a Conv1d kernel [K=1, IC, OC] so it can consume the repo's
     // standard time-major activation layout [T, IC].
+    //
+    // Tried replacing this conv1d_f32 wrapper with a direct ggml_mul_mat on
+    // 2026-05-11 — it requires cont on BOTH operands to satisfy mul_mat's
+    // !ggml_is_transposed(A) assertion, which yields the SAME dispatch count
+    // (cont + cont + mul_mat + add) as the current conv1d path (cont +
+    // im2col + mul_mat + add).  Net wash; keeping conv1d_f32 because it's
+    // already battle-tested with the CPU fastpath.
     ggml_tensor * wt = ggml_cont(ctx, ggml_transpose(ctx, w));
     ggml_tensor * kernel = ggml_reshape_3d(ctx, wt, 1, w->ne[1], w->ne[0]);
     ggml_tensor * y = conv1d_f32(ctx, kernel, x, 1, 0, 1);
@@ -472,9 +533,146 @@ ggml_tensor * dense_matmul_time_ggml(ggml_context * ctx,
     return y;
 }
 
+// dense_matmul_time_pretransposed_ggml (defined after the forward
+// declaration below): same as dense_matmul_time_ggml, but `model` is
+// consulted for a pre-transposed copy of `w` (built at load time for
+// `:onnx::MatMul_*` weights on non-CPU backends).  When available, the
+// runtime `cont(transpose(w))` dispatch is skipped — the copy already has
+// the `[IC, OC]` layout that the conv1d_f32 K=1 kernel expects.  CPU callers
+// fall through to the original path (the cblas pointwise fast path takes the loaded `[OC, IC]` weight directly).
+// Forward declaration — defined below; called by the quantized branch of the next function.
+ggml_tensor * dense_matmul_time_wt_pretransposed_ggml(ggml_context * ctx,
+                                                      const supertonic_model & model,
+                                                      ggml_tensor * x,
+                                                      ggml_tensor * w,
+                                                      ggml_tensor * b);
+
+ggml_tensor * dense_matmul_time_pretransposed_ggml(ggml_context * ctx,
+                                                   const supertonic_model & model,
+                                                   ggml_tensor * x,
+                                                   ggml_tensor * w,
+                                                   ggml_tensor * b) {
+    if (!supertonic_use_cpu_custom_ops()) {
+        if (ggml_tensor * w_pre = try_pretransposed_weight(model, w)) {
+            if (w_pre->type == GGML_TYPE_F32) {
+                // f32 fast path: view w_pre (ne[0], ne[1]) as the conv1d
+                // kernel [K=1, IC, OC] and dispatch via the existing wrapper,
+                // so no runtime cont(transpose(w)) node is emitted.
+                // mul_mat(im2col_f32, kernel_f32) hits kernel_mul_mm_f32_f32.
+                ggml_tensor * kernel = ggml_reshape_3d(ctx, w_pre, 1, w_pre->ne[0], w_pre->ne[1]);
+                ggml_tensor * y = conv1d_f32(ctx, kernel, x, 1, 0, 1);
+                if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y));
+                return y;
+            }
+            // Non-f32 w_pre (q8_0): the f32 fast path's
+            // mul_mat(im2col_f32, kernel_quant) would need a
+            // kernel_mul_mm_f32_q8_0 variant which ggml-metal doesn't ship.
+            // Route through the wt helper instead (weight as src0 —
+            // dispatches kernel_mul_mm_q8_0_f32).  The helper takes the
+            // original `w`, re-resolves w_pre itself, adds `b`, and returns
+            // [OC, T]; the cont(transpose) below restores the caller's [T, OC].
+            ggml_tensor * y_wt = dense_matmul_time_wt_pretransposed_ggml(
+                ctx, model, x, w, b);
+            return ggml_cont(ctx, ggml_transpose(ctx, y_wt));
+        }
+    }
+    return dense_matmul_time_ggml(ctx, x, w, b);
+}
+
+// Phase B2 partial: like dense_matmul_time_pretransposed_ggml but emits
+// the result in *width-major* `[OC, T]` layout instead of `[T, OC]`.
+//
+// The trick is to swap the `ggml_mul_mat` operand order from
+// `mul_mat(im2col_[IC,T], kernel_[IC,OC]) -> [T, OC]` to
+// `mul_mat(kernel_[IC,OC], im2col_[IC,T]) -> [OC, T]`.  Both operands
+// stay non-transposed so the assertion on `a`/`b` is satisfied.  The
+// kernel-as-`src0` ordering is also what `kernel_mul_mm_q8_0_f32`
+// requires, so this single change *also* unlocks A3 step 2 (the
+// optimized quantized matmul kernel will dispatch when `w_pre` is
+// q8_0 — see the asymmetric load logic in supertonic_gguf.cpp).
+//
+// Used at the Q/K/V projection sites in the per-step graph: the
+// downstream rope + flash_attn expect `[A, L]` layout, so the cont
+// (transpose) that used to flip `[L, A]` -> `[A, L]` becomes dead
+// code.  Eliminates ~24 cont dispatches per per-step graph × 5
+// steps = ~120 ops per synth.
+//
+// Bias add: `b` (shape `[OC]`) broadcasts naturally against the
+// new `[OC, T]` output via `repeat_like`'s 1-d → 2-d reshape on the
+// `ne[0]` match.
+//
+// Falls through to the legacy path plus a runtime cont(transpose) of
+// its `[T, OC]` result on CPU backends, or when no pretransposed weight
+// is available (e.g. weight not on the `:onnx::MatMul_` allowlist).
+ggml_tensor * dense_matmul_time_wt_pretransposed_ggml(ggml_context * ctx,
+                                                      const supertonic_model & model,
+                                                      ggml_tensor * x,
+                                                      ggml_tensor * w,
+                                                      ggml_tensor * b) {
+    if (!supertonic_use_cpu_custom_ops()) {
+        if (ggml_tensor * w_pre = try_pretransposed_weight(model, w)) {
+            const int IC = (int) w_pre->ne[0];
+            const int OC = (int) w_pre->ne[1];
+
+            // ggml_im2col only reads the kernel's SHAPE (ne[0..3]); it never
+            // touches the kernel data — the output buffer holds the
+            // rearranged activation.  So for the SHAPE we can use:
+            //   - a reshape of w_pre when w_pre is f32 (cheap, just metadata)
+            //   - a phantom f32 tensor created in the graph context when
+            //     w_pre is quantized (because reshape_3d(q8_0, 1, IC, OC)
+            //     would set ne[0]=1 < q8_0's 32-element block size and break
+            //     the type's invariants).  The phantom's data is never read.
+            ggml_tensor * shape_kernel;
+            if (w_pre->type == GGML_TYPE_F32) {
+                shape_kernel = ggml_reshape_3d(ctx, w_pre, 1, IC, OC);
+            } else {
+                shape_kernel = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, IC, OC);
+                // NOTE(review): shape is 1*IC*OC f32 — confirm the allocator reserves no real buffer for it.
+            }
+
+            ggml_tensor * im2col = ggml_im2col(ctx, shape_kernel, x, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F32);
+            // im2col has ne=[IC, T, 1, 1].  Collapse ne[1]*ne[2] into one dim for mul_mat.
+            ggml_tensor * im2col_2d = ggml_reshape_2d(ctx, im2col,
+                                                      im2col->ne[0], im2col->ne[2] * im2col->ne[1]);
+            // Swapped order relative to conv1d_f32: w_pre first (src0 = the
+            // quantized/f32 weight), im2col second (src1 = f32 activation).
+            // Result is [M=OC, N=T].  For w_pre=q8_0 this dispatches
+            // kernel_mul_mm_q8_0_f32 — the bandwidth-optimised quantized
+            // matmul kernel — which is the A3 step 2 unlock.
+            ggml_tensor * w_2d = ggml_reshape_2d(ctx, w_pre, IC, OC);
+            ggml_tensor * y = ggml_mul_mat(ctx, w_2d, im2col_2d);
+            // y has ne=[OC, T] — already the wt layout; bias is optional.
+            if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y));
+            return y;
+        }
+    }
+    // Fallback (CPU, or no pre-transposed copy of `w`): legacy [T, OC]
+    // matmul + explicit cont(transpose) to produce [OC, T] for the caller.
+    // CPU gets the cblas fast path for free via dense_matmul_time_ggml.
+    ggml_tensor * y_tc = dense_matmul_time_ggml(ctx, x, w, b);
+    return ggml_cont(ctx, ggml_transpose(ctx, y_tc));
+}
+
 ggml_tensor * bias_gelu_ggml(ggml_context * ctx, ggml_tensor * x, ggml_tensor * b) {
+    const bool use_cpu_custom = supertonic_use_cpu_custom_ops();
+    // Fused-op fast path (any backend that registers
+    // GGML_OP_SUPERTONIC_BIAS_GELU — Metal does via the local ggml port
+    // overlay; CPU's ggml_compute_forward_supertonic_bias_gelu is the
+    // parity backstop).  Replaces the add(bias) + gelu_erf chain
+    // (2 dispatches on Metal) with one dispatch.  Override with
+    // SUPERTONIC_DISABLE_FUSED_BIAS_GELU=1 to force the stock-op chain.
+    // Skipped on CPU custom-op backends (cblas path below is faster).
+    static const bool disable_fused_bias_gelu =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_BIAS_GELU") != nullptr;
+    if (!use_cpu_custom && !disable_fused_bias_gelu &&
+        x->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        b->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(b)) {
+        return ggml_supertonic_bias_gelu(ctx, x, b);
+    }
     // CPU-only fused bias + GELU; falls back to gelu(add(x, b)) on GPU.
-    if (supertonic_use_cpu_custom_ops() &&
+    if (use_cpu_custom &&
         x->type == GGML_TYPE_F32 && b->type == GGML_TYPE_F32 && x->ne[2] == 1 && x->ne[3] == 1) {
         auto op = [](ggml_tensor * dst, int ith, int nth, void *) {
             const ggml_tensor * src = dst->src[0];
@@ -507,9 +705,29 @@ ggml_tensor * pw2_residual_ggml(ggml_context * ctx,
                                 ggml_tensor * x,
                                 ggml_tensor * b,
                                 ggml_tensor * gamma) {
+    const bool use_cpu_custom = supertonic_use_cpu_custom_ops();
+    // Fused-op fast path (any backend that registers
+    // GGML_OP_SUPERTONIC_PW2_RESIDUAL — Metal does via the local ggml port
+    // overlay; CPU's ggml_compute_forward_supertonic_pw2_residual is the
+    // parity backstop).  Replaces the add(bias) + mul(gamma) + add(residual)
+    // chain with one dispatch.  Override with
+    // SUPERTONIC_DISABLE_FUSED_PW2_RESIDUAL=1 to force the stock-op chain.
+    // Skipped on CPU custom-op backends (cblas fast path below is faster).
+    static const bool disable_fused_pw2_residual =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_PW2_RESIDUAL") != nullptr;
+    if (!use_cpu_custom && !disable_fused_pw2_residual &&
+        residual->type == GGML_TYPE_F32 && x->type == GGML_TYPE_F32 &&
+        b->type == GGML_TYPE_F32 && gamma->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        residual->ne[0] == x->ne[0] && residual->ne[1] == x->ne[1] &&
+        b->ne[0] == x->ne[1] && gamma->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(residual) && ggml_is_contiguous(x) &&
+        ggml_is_contiguous(b) && ggml_is_contiguous(gamma)) {
+        return ggml_supertonic_pw2_residual(ctx, residual, x, b, gamma);
+    }
     // CPU-only fused (bias + gamma + residual); falls back to the
     // 3-step add/mul/add chain on GPU.
-    if (supertonic_use_cpu_custom_ops() &&
+    if (use_cpu_custom &&
         residual->type == GGML_TYPE_F32 && x->type == GGML_TYPE_F32 &&
         b->type == GGML_TYPE_F32 && gamma->type == GGML_TYPE_F32 &&
         x->ne[2] == 1 && x->ne[3] == 1) {
@@ -568,6 +786,109 @@ ggml_tensor * vector_convnext_ggml(ggml_context * ctx,
         require_source_tensor(model, p + ".gamma"));
 }
 
+// Phase B2 full: [C, T]-layout pointwise (K=1) Conv1d as a direct matmul.
+//
+// pwconv1/pwconv2 weights load as Conv1d kernels with ne=[K=1, IC, OC, 1].
+// With activations already in [C, T] layout (IC inner-most), the K=1
+// dimension is degenerate and the convolution is just:
+//
+//   y[OC, T] = sum_IC w[IC, OC] * x[IC, T]
+//
+// which is exactly `ggml_mul_mat(w_2d=[IC, OC], x_2d=[IC, T])` — no
+// im2col, no transpose, no pretranspose-cache lookup needed.  Result is
+// f32 contiguous and directly consumable by the next [C, T] op.
+//
+// CPU is intentionally NOT routed here: AMX cblas_sgemm in the legacy
+// path is faster than the equivalent ggml_mul_mat dispatch on Apple
+// CPUs.  Caller's `vector_convnext_ggml_ct` already roundtrips on CPU.
+ggml_tensor * pointwise_matmul_ct(ggml_context * ctx,
+                                  ggml_tensor * x_ct,   // [IC, T, 1, 1]
+                                  ggml_tensor * w,      // [1, IC, OC, 1]  (Conv1d K=1)
+                                  ggml_tensor * b) {    // optional bias; nullptr skips the add
+    GGML_ASSERT(w->ne[0] == 1);            // K=1
+    GGML_ASSERT(w->ne[1] == x_ct->ne[0]);  // IC match
+    GGML_ASSERT(ggml_is_contiguous(w));    // w is reshaped to 2-D below
+    ggml_tensor * w_2d = ggml_reshape_2d(ctx, w, w->ne[1], w->ne[2]);  // drop K: [IC, OC]
+    ggml_tensor * x_2d = ggml_reshape_2d(ctx, x_ct, x_ct->ne[0], x_ct->ne[1]);  // [IC, T]
+    ggml_tensor * y = ggml_mul_mat(ctx, w_2d, x_2d);  // [OC, T]
+    if (b) y = ggml_add(ctx, y, repeat_like(ctx, b, y));
+    return y;
+}
+
+// Phase B2 full: ConvNeXt block operating on `[C, T]` activations end-to-end.
+// All five fused custom Metal kernels have layout-flag plumbing landed in
+// port-version 13; this block strings their `_ct` variants together so the
+// activation tensor never needs to flip layout mid-block.  Used by callers
+// that fuse a chain of N convnext blocks with a single entry permute
+// `[T, C] -> [C, T]` before the loop and a single exit permute after — net
+// savings = (N - 1) intra-block transposes per chain × 5 CFM steps.
+//
+// Input  x:   [C, T, 1, 1]  f32 contiguous
+// Output    : [C, T, 1, 1]  f32 contiguous
+//
+// CPU backends fall through to the legacy `[T, C]` path: the `_ct` ops have
+// CPU forward implementations but they would force AMX-cblas off, so on
+// CPU we permute in/out around the legacy block to keep AMX engaged.
+ggml_tensor * vector_convnext_ggml_ct(ggml_context * ctx,
+                                      const supertonic_model & model,
+                                      const std::string & p,
+                                      ggml_tensor * x_ct,
+                                      int dilation) {
+    if (model_prefers_cpu_kernels(model)) {
+        // CPU: roundtrip to [T, C], run legacy block (AMX cblas fast path),
+        // roundtrip back.  Each cont(permute) adds one copy of the activation.
+        ggml_tensor * x_tc = ggml_cont(ctx, ggml_permute(ctx, x_ct, 1, 0, 2, 3));
+        ggml_tensor * y_tc = vector_convnext_ggml(ctx, model, p, x_tc, dilation);
+        return ggml_cont(ctx, ggml_permute(ctx, y_tc, 1, 0, 2, 3));
+    }
+
+    // Helper: flatten leading-1 dims so per-channel tensors come out as [C].
+    // Supertonic GGUFs ship bias/gamma/norm parameters as [C, 1, 1, 1] or
+    // [1, C, 1, 1] depending on which PyTorch broadcast view they were
+    // exported from.  The `_ct` ctors all assert `param->ne[0] == C_dim`, so
+    // unflattened tensors break them.  This is the same shape mismatch that
+    // has been silently disabling the legacy `pw2_residual_ggml` fused path
+    // for ConvNeXt blocks all along.
+    auto flatten_1d = [&](ggml_tensor * t) -> ggml_tensor * {
+        const int64_t n = ggml_nelements(t);
+        // Skip the reshape only when ne is literally [n, 1, 1, 1].  The ne[]
+        // values are compared directly instead of calling `ggml_n_dims`
+        // (NOTE(review): confirm what it reports for a [1, C, 1, 1] tensor).
+        if (t->ne[0] == n && t->ne[1] == 1 && t->ne[2] == 1 && t->ne[3] == 1) {
+            return t;
+        }
+        return ggml_reshape_1d(ctx, t, n);
+    };
+
+    ggml_tensor * residual = x_ct;
+    // depthwise_1d_ct: [C, T] -> [C, T]  (weight passed as loaded, bias flattened)
+    ggml_tensor * y = ggml_supertonic_depthwise_1d_ct(ctx, x_ct,
+        require_source_tensor(model, p + ".dwconv.weight"),
+        flatten_1d(require_source_tensor(model, p + ".dwconv.bias")),
+        dilation);
+    // layer_norm_channel_ct: [C, T] -> [C, T], eps = 1e-6
+    y = ggml_supertonic_layer_norm_channel_ct(ctx, y,
+        flatten_1d(require_source_tensor(model, p + ".norm.norm.weight")),
+        flatten_1d(require_source_tensor(model, p + ".norm.norm.bias")),
+        1e-6f);
+    // pw1 matmul: [IC=C, T] -> [OC, T]  (no bias here; bias_gelu_ct adds it)
+    y = pointwise_matmul_ct(ctx, y,
+        require_source_tensor(model, p + ".pwconv1.weight"),
+        nullptr);
+    // bias_gelu_ct: [OC, T] -> [OC, T]
+    y = ggml_supertonic_bias_gelu_ct(ctx, y,
+        flatten_1d(require_source_tensor(model, p + ".pwconv1.bias")));
+    // pw2 matmul: [IC=OC, T] -> [C, T]   (restores channel count; bias added next)
+    y = pointwise_matmul_ct(ctx, y,
+        require_source_tensor(model, p + ".pwconv2.weight"),
+        nullptr);
+    // pw2_residual_ct: (y[C, T] + bias[C]) * gamma[C] + residual[C, T] -> [C, T]
+    return ggml_supertonic_pw2_residual_ct(ctx, y,
+        flatten_1d(require_source_tensor(model, p + ".pwconv2.bias")),
+        flatten_1d(require_source_tensor(model, p + ".gamma")),
+        residual);  // NOTE(review): residual is last here but first in ggml_supertonic_pw2_residual — confirm _ct signature
+}
+
 std::vector<float> tensor_to_time_channel(ggml_tensor * t) {
     const int L = (int) t->ne[0];
     const int C = (int) t->ne[1];
@@ -730,7 +1051,7 @@ void build_text_attention_cache(vector_text_attention_cache & cache,
     ggml_set_name(ctx_tc, "vector_attn_ctx"); ggml_set_output(ctx_tc);
     ggml_build_forward_expand(cache.gf, ctx_tc);
 
-    ggml_tensor * out = dense_matmul_time_ggml(cache.ctx, ctx_tc,
+    ggml_tensor * out = dense_matmul_time_pretransposed_ggml(cache.ctx, model, ctx_tc,
         require_source_tensor(model, out_w_source),
         require_source_tensor(model, out_b_source));
     ggml_set_name(out, "vector_attn_out"); ggml_set_output(out);
@@ -994,18 +1315,20 @@ void build_group_graph_cache(vector_group_graph_cache & cache,
     }
     // F6: pre-transposed companion lives in model.ctx_w under
     // `<matmul_source>__T` (populated at load).  Falls back to the
-    // in-graph `ggml_cont(ggml_transpose(W))` rewrite if the
-    // pre-transpose roster didn't cover this weight (e.g. when
-    // running against a model whose `matmul_source` shape doesn't
-    // match the audit's [512, 64] expectation; see the defensive
-    // check in supertonic_gguf.cpp's F6 hook).
+    // per-pointer `pretransposed_weights` map (Metal's broader Q/K/V
+    // pretranspose roster), and finally to an in-graph
+    // `ggml_cont(ggml_transpose(W))` rewrite if neither covers this
+    // weight.
     ggml_tensor * t_proj;
     {
         auto pretrans_it = model.source_tensors.find(matmul_source + "__T");
         ggml_tensor * w_t = (pretrans_it != model.source_tensors.end()) ? pretrans_it->second : nullptr;
         if (!w_t) {
-            w_t = ggml_cont(cache.ctx, ggml_transpose(cache.ctx,
-                require_source_tensor(model, matmul_source)));
+            ggml_tensor * t_proj_w_orig = require_source_tensor(model, matmul_source);
+            w_t = try_pretransposed_weight(model, t_proj_w_orig);
+            if (!w_t) {
+                w_t = ggml_cont(cache.ctx, ggml_transpose(cache.ctx, t_proj_w_orig));
+            }
         }
         t_proj = ggml_mul_mat(cache.ctx, w_t,
             ggml_reshape_2d(cache.ctx, cache.temb_in, 64, 1));
@@ -1029,13 +1352,13 @@ void build_group_graph_cache(vector_group_graph_cache & cache,
     ggml_build_forward_expand(cache.gf, cur);
 
     const std::string attn_prefix = vector_main_block(post_block + 1) + ".attn.";
-    ggml_tensor * q = dense_matmul_time_ggml(cache.ctx, cur,
+    ggml_tensor * q = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cur,
         require_source_tensor(model, q_matmul_source),
         require_source_tensor(model, attn_prefix + "W_query.linear.bias"));
-    ggml_tensor * k = dense_matmul_time_ggml(cache.ctx, cache.text_in,
+    ggml_tensor * k = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.text_in,
         require_source_tensor(model, k_matmul_source),
         require_source_tensor(model, attn_prefix + "W_key.linear.bias"));
-    ggml_tensor * v = dense_matmul_time_ggml(cache.ctx, cache.text_in,
+    ggml_tensor * v = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.text_in,
         require_source_tensor(model, v_matmul_source),
         require_source_tensor(model, attn_prefix + "W_value.linear.bias"));
     ggml_set_name(q, q_name.c_str()); ggml_set_output(q); ggml_build_forward_expand(cache.gf, q);
@@ -1361,14 +1684,14 @@ void build_res_style_qkv_cache(vector_res_style_qkv_cache & cache,
     ggml_build_forward_expand(cache.gf, post);
 
     const std::string style_prefix = vector_main_block(style_block) + ".attention.";
-    ggml_tensor * sq = dense_matmul_time_ggml(cache.ctx, post,
+    ggml_tensor * sq = dense_matmul_time_pretransposed_ggml(cache.ctx, model, post,
         require_source_tensor(model, q_matmul_source),
         require_source_tensor(model, style_prefix + "W_query.linear.bias"));
-    ggml_tensor * sk = dense_matmul_time_ggml(cache.ctx, cache.kctx_in,
+    ggml_tensor * sk = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.kctx_in,
         require_source_tensor(model, k_matmul_source),
         require_source_tensor(model, style_prefix + "W_key.linear.bias"));
     sk = ggml_tanh(cache.ctx, sk);
-    ggml_tensor * sv = dense_matmul_time_ggml(cache.ctx, cache.style_v_in,
+    ggml_tensor * sv = dense_matmul_time_pretransposed_ggml(cache.ctx, model, cache.style_v_in,
         require_source_tensor(model, v_matmul_source),
         require_source_tensor(model, style_prefix + "W_value.linear.bias"));
     ggml_set_name(sq, q_name.c_str()); ggml_set_output(sq); ggml_build_forward_expand(cache.gf, sq);
@@ -3150,6 +3473,912 @@ bool supertonic_vector_trace_proj_ggml(const supertonic_model & model,
     }
 }
 
+// Apply Supertonic's non-standard RoPE in-graph via ggml_rope_ext.
+// Supertonic uses angle = (t/L) * theta[d_half], where theta is loaded from
+// the GGUF and L is the per-call sequence length.  ggml_rope_ext's formula
+// expands to angle = (pos / freq_factors[d/2]) * freq_scale * freq_base^(-d/n_dims).
+// Setting freq_base=1, freq_scale=1, freq_factors[d_half] = L / theta[d_half],
+// positions = [0..L) reproduces the Supertonic formula exactly.  NEOX mode
+// matches the split-pairs layout (x[d] rotates with x[d+D/2]) used by the
+// scalar apply_rope reference in supertonic_vector_estimator.cpp.
+//
+// x_tc must be a contiguous F32 2D tensor of shape ne=[H*D, q_len] (width-major);
+// this is asserted because the view below hard-codes float byte strides.
+// `positions` is int32 [q_len], `freq_factors` is f32 [D/2]; both are caller-
+// owned input tensors set via ggml_backend_tensor_set before compute.
+// Returns the rotated tensor reshaped back to ne=[H*D, q_len].
+ggml_tensor * apply_supertonic_rope_ggml(ggml_context * ctx,
+                                          ggml_tensor * x_tc,
+                                          ggml_tensor * positions,
+                                          ggml_tensor * freq_factors,
+                                          int q_len,
+                                          int H,
+                                          int D) {
+    GGML_ASSERT(x_tc->type == GGML_TYPE_F32 && ggml_is_contiguous(x_tc));
+    GGML_ASSERT(x_tc->ne[0] == (int64_t) H * D && x_tc->ne[1] == (int64_t) q_len);
+    const size_t head_bytes = (size_t) D * sizeof(float);
+    const size_t row_bytes  = (size_t) H * head_bytes;
+    // View [H*D, q_len] as [D, H, q_len] so rope's outer dim (ne[2]) is time.
+    // Strides: nb1 = head step (D floats), nb2 = time step (H*D floats); the
+    // view is itself contiguous, so no ggml_cont copy is needed before rope.
+    ggml_tensor * x_view = ggml_view_3d(ctx, x_tc, D, H, q_len,
+                                         head_bytes, row_bytes, 0);
+    ggml_tensor * roped = ggml_rope_ext(ctx, x_view, positions, freq_factors,
+                                         D, GGML_ROPE_TYPE_NEOX, 0,
+                                         /*freq_base=*/1.0f,
+                                         /*freq_scale=*/1.0f,
+                                         /*ext_factor=*/0.0f,
+                                         /*attn_factor=*/1.0f,
+                                         /*beta_fast=*/0.0f,
+                                         /*beta_slow=*/0.0f);
+    return ggml_reshape_2d(ctx, roped, (int64_t) H * D, q_len);
+}
+
+// Append one attention block (per-head flash attention followed by the
+// output projection with bias) to the caller's ggml context.  Mirrors
+// build_text_attention_cache, but no context or graph is owned here.
+//
+// Inputs:
+//   q_tc, k_tc, v_tc: contiguous [H*D, *_len] tensors
+//   out_w_tensor / out_b_tensor: model weight and bias of the out projection
+//   scale: softmax scale handed to ggml_flash_attn_ext
+// Returns: out_tc tensor of shape [out_dim, q_len].
+ggml_tensor * append_text_attention_subgraph(ggml_context * ctx,
+                                              const supertonic_model & model,
+                                              ggml_tensor * q_tc,
+                                              ggml_tensor * k_tc,
+                                              ggml_tensor * v_tc,
+                                              int q_len, int kv_len,
+                                              int n_heads, int head_dim,
+                                              ggml_tensor * out_w_tensor,
+                                              ggml_tensor * out_b_tensor,
+                                              float scale) {
+    const size_t head_nb = sizeof(float) * (size_t) head_dim;
+    const size_t time_nb = head_nb * (size_t) n_heads;
+    // Expose one [H*D, len] buffer as ne=[head_dim, len, n_heads] without copying.
+    const auto per_head = [&](ggml_tensor * src, int len) {
+        return ggml_view_3d(ctx, src, head_dim, len, n_heads, time_nb, head_nb, 0);
+    };
+    ggml_tensor * q_heads = per_head(q_tc, q_len);
+    ggml_tensor * k_heads = per_head(k_tc, kv_len);
+    ggml_tensor * v_heads = per_head(v_tc, kv_len);
+    ggml_tensor * fused = ggml_flash_attn_ext(ctx, q_heads, k_heads, v_heads,
+                                               /*mask=*/nullptr, scale, 0.0f, 0.0f);
+    ggml_tensor * merged = ggml_reshape_2d(ctx, fused, (int64_t) n_heads * head_dim, q_len);
+    ggml_tensor * attn_t = ggml_cont(ctx, ggml_transpose(ctx, merged));
+    return dense_matmul_time_pretransposed_ggml(ctx, model, attn_t, out_w_tensor, out_b_tensor);
+}
+
+// Per-group MatMul tensor name suffixes (groups 0..3).  See per-group source
+// names in trace_proj_ggml; these tables centralise them for the consolidated
+// path.  Each value N names the tensor "vector_estimator:onnx::MatMul_N".
+struct vector_step_group_names {
+    int t_linear;    // time-linear (matmul for time embedding projection)
+    int attn_q;      // text attention: query projection weight
+    int attn_k;      // text attention: key projection weight
+    int attn_v;      // text attention: value projection weight
+    int attn_out;    // text attention: output projection weight
+    int style_q;     // style attention: query projection weight
+    int style_k;     // style attention: key projection weight
+    int style_v;     // style attention: value projection weight
+    int style_out;   // style attention: output projection weight
+};
+
+static const vector_step_group_names kGroupNames[4] = {  // positional: t_linear, attn_q/k/v/out, style_q/k/v/out
+    {3095, 3101, 3102, 3103, 3110, 3116, 3117, 3118, 3119},  // group 0
+    {3140, 3146, 3147, 3148, 3155, 3161, 3162, 3163, 3164},  // group 1 (group 0 + 45)
+    {3185, 3191, 3192, 3193, 3200, 3206, 3207, 3208, 3209},  // group 2 (group 0 + 90)
+    {3230, 3236, 3237, 3238, 3245, 3251, 3252, 3253, 3254},  // group 3 (group 0 + 135)
+};
+
+static std::string matmul_name(int suffix) {  // full tensor name for an onnx::MatMul_<suffix> weight
+    return std::string("vector_estimator:onnx::MatMul_").append(std::to_string(suffix));
+}
+
+// Bundle of input tensors a single CFM step subgraph needs.  Used both by
+// the per-step cache (one step per ggml_cgraph) and by the
+// 5-steps-unrolled-into-one-graph cache (Phase A1+A2).
+//
+// `x_in` is the latent fed to proj_in for this step; `noise_in` is the
+// tensor the scaled velocity is added to, i.e. the step computes
+// `next = noise_in + mask * velocity * (1 / total_steps)`.  They are kept
+// as separate slots so the two paths can wire them differently: the
+// per-step path uploads the same host latent into both, while the
+// unrolled-loop path wires them as graph edges between steps.
+// NOTE(review): which tensor the unrolled path uses for `noise_in` on
+// steps > 0 is decided by that caller — confirm there.
+//
+// `t_emb_in` varies per step (one time embedding per CFM step index).
+// All other inputs are constant across the 5 CFM steps and bind to a
+// single shared input tensor regardless of which path is used.
+struct vector_step_inputs {
+    ggml_tensor * x_in           = nullptr;  // ne=[L, Cin]    f32
+    ggml_tensor * mask_in        = nullptr;  // ne=[L]         f32
+    ggml_tensor * t_emb_in       = nullptr;  // ne=[64]        f32  (per-step)
+    ggml_tensor * text_in        = nullptr;  // ne=[text_len, 256] f32
+    ggml_tensor * style_v_raw_in = nullptr;  // ne=[50, 256]   f32
+    ggml_tensor * style_kctx_in  = nullptr;  // ne=[50, 256]   f32
+    ggml_tensor * noise_in       = nullptr;  // ne=[L, Cin]    f32  (per-step)
+    ggml_tensor * pos_q          = nullptr;  // ne=[L]         i32
+    ggml_tensor * pos_k          = nullptr;  // ne=[text_len]  i32
+    ggml_tensor * freq_factors_q = nullptr;  // ne=[D/2]       f32
+    ggml_tensor * freq_factors_k = nullptr;  // ne=[D/2]       f32
+};
+
+// Append one CFM step's subgraph (proj_in → 4 groups → tail → proj_out
+// → velocity → next = noise + velocity / total_steps) to the graph being
+// built in `gctx`.  All inputs are pre-bound by the caller; this function
+// only builds the dataflow and returns the `next` tensor (ne=[L, Cin]) so
+// the caller can either set it as a graph output or feed it as the next
+// step's `x_in`.  The function does NOT call `ggml_set_output` /
+// `ggml_build_forward_expand` on the result (`gf` is currently unused) —
+// that's the caller's decision.
+//
+// `L` is the latent length and `text_len` the text length; `total_steps`
+// scales the velocity by 1 / total_steps, so it must be non-zero.  CPU vs GPU
+// dispatch lives on the thread-local `supertonic_use_cpu_custom_ops()` flag
+// set by the outer `supertonic_op_dispatch_scope` at the public entry point.
+ggml_tensor * append_supertonic_vector_step_subgraph(
+        ggml_context * gctx,
+        ggml_cgraph * gf,
+        const supertonic_model & model,
+        const vector_step_inputs & inputs,
+        int L,
+        int text_len,
+        int total_steps);
+
+// Consolidated per-step cache: one ctx, one cgraph, one gallocr for the entire
+// per-step computation.  Replaces the ~17 sub-graph dispatches the trace_proj
+// orchestrator emits with a single ggml_backend_graph_compute call.
+struct vector_step_one_graph_cache {
+    const supertonic_model * model = nullptr;  // model the graph was built for (rebuild key)
+    uint64_t generation_id = 0;                // model.generation_id snapshot at build time (rebuild key)
+    int L = 0;                                 // latent length the graph was built for (rebuild key)
+    int text_len = 0;                          // text length the graph was built for (rebuild key)
+    int total_steps = 0;                       // baked into the velocity scale (rebuild key)
+
+    std::vector<uint8_t> buf;                  // backing memory handed to ggml_init for `ctx`
+    ggml_context * ctx = nullptr;              // no-alloc context holding tensor and graph metadata
+    ggml_cgraph * gf = nullptr;                // graph created inside `ctx`
+    ggml_gallocr_t allocr = nullptr;           // graph allocator on the model backend's default buffer type
+
+    // Per-call inputs
+    ggml_tensor * x_in = nullptr;          // noisy_latent (L, Cin) ggml-shape: ne=[L, Cin]
+    ggml_tensor * mask_in = nullptr;       // [L]
+    ggml_tensor * t_emb_in = nullptr;      // [64]
+    ggml_tensor * text_in = nullptr;       // [text_len, 256]
+    ggml_tensor * style_v_raw_in = nullptr; // [50, 256] (style_ttl repacked)
+    ggml_tensor * style_kctx_in = nullptr;  // [50, 256] (model's /Expand_output_0)
+    ggml_tensor * noise_in = nullptr;       // (L, Cin) (same data as x_in but indep slot for tail)
+
+    // Per-build (rope) inputs
+    ggml_tensor * pos_q = nullptr;          // int32 [L]
+    ggml_tensor * pos_k = nullptr;          // int32 [text_len]
+    ggml_tensor * freq_factors_q = nullptr; // f32 [32] (head_dim/2)
+    ggml_tensor * freq_factors_k = nullptr; // f32 [32]
+
+    // Output
+    ggml_tensor * next_latent_out = nullptr; // ne=[L, Cin] in (t, c) order
+};
+
+// Release the gallocr and ggml context owned by `cache` and reset every field.
+void free_vector_step_one_graph_cache(vector_step_one_graph_cache & cache) {
+    if (cache.allocr) {
+        // Pass the generation id snapshotted at build time instead of reading it
+        // through `cache.model`, which may already be destroyed or replaced.
+        supertonic_safe_gallocr_free(cache.allocr, cache.generation_id);
+        cache.allocr = nullptr;
+    }
+    if (cache.ctx) {
+        ggml_free(cache.ctx);
+        cache.ctx = nullptr;
+    }
+    cache.gf = nullptr;  // graph storage lived inside cache.ctx
+    cache.buf.clear();
+    cache.model = nullptr;
+    cache.generation_id = 0;
+    cache.L = cache.text_len = cache.total_steps = 0;
+    cache.x_in = cache.mask_in = cache.t_emb_in = cache.text_in = nullptr;
+    cache.style_v_raw_in = cache.style_kctx_in = cache.noise_in = cache.next_latent_out = nullptr;
+    cache.pos_q = cache.pos_k = cache.freq_factors_q = cache.freq_factors_k = nullptr;
+}
+
+// Definition of the single-CFM-step graph builder declared above: builds
+// proj_in → 4 attention groups → tail → proj_out and returns `next`.
+ggml_tensor * append_supertonic_vector_step_subgraph(
+        ggml_context * gctx,
+        ggml_cgraph * gf,
+        const supertonic_model & model,
+        const vector_step_inputs & inputs,
+        int L,
+        int text_len,
+        int total_steps) {
+    const bool use_cpu_custom = supertonic_use_cpu_custom_ops();
+    // Shape constants that aren't dependent on L / text_len.  Mirror the
+    // values from supertonic_vector_step_one_graph_ggml.
+    const int C = 512;
+    const int H = 4;        // text-attention heads
+    const int D = 64;       // text-attention head_dim
+    const int SH = 2;       // style-attention heads
+    const int SD = 128;     // style-attention head_dim
+    const int kv_style = 50; // fixed by /Expand_output_0
+    (void)H; (void)D; (void)SH; (void)SD; (void)kv_style;  // no-op casts; all five are used by run_group below
+
+    // ===== PHASE 0: proj_in + mask =====
+    ggml_tensor * cur = conv1d_f32(gctx,
+        require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.proj_in.net.weight"),
+        inputs.x_in, 1, 0, 1);
+    cur = ggml_mul(gctx, cur, repeat_like(gctx, inputs.mask_in, cur));
+
+    // ===== PHASE 1: Group 0 prologue — ConvNeXt × 4 on main_blocks.0 + time_add (1) + ConvNeXt (2) =====
+    int dils[4] = {1, 2, 4, 8};
+    // Phase B2 full: permute to [C, T] once before the 4-block chain, run the
+    // chain in [C, T] (each block's two pointwise convs become a direct
+    // ggml_mul_mat with no im2col), then permute back to [T, C] for the
+    // time-add.  Per chain: 2 im2col × 4 blocks × 5 steps − 2 permutes × 5 steps
+    // = 30 dispatches saved per synth.  Override: SUPERTONIC_DISABLE_CT_CONVNEXT=1.
+    static const bool disable_ct_convnext =
+        std::getenv("SUPERTONIC_DISABLE_CT_CONVNEXT") != nullptr;  // read once; any value (even empty) disables
+    const bool use_ct_convnext = !disable_ct_convnext && !use_cpu_custom;  // [C, T] path is skipped with CPU custom ops
+    if (use_ct_convnext) {
+        ggml_tensor * cur_ct = ggml_cont(gctx, ggml_permute(gctx, cur, 1, 0, 2, 3));
+        for (int j = 0; j < 4; ++j) {
+            cur_ct = vector_convnext_ggml_ct(gctx, model,
+                "vector_estimator:tts.ttl.vector_field.main_blocks.0.convnext." + std::to_string(j),
+                cur_ct, dils[j]);
+        }
+        cur = ggml_cont(gctx, ggml_permute(gctx, cur_ct, 1, 0, 2, 3));
+    } else {
+        for (int j = 0; j < 4; ++j) {
+            cur = vector_convnext_ggml(gctx, model,
+                "vector_estimator:tts.ttl.vector_field.main_blocks.0.convnext." + std::to_string(j),
+                cur, dils[j]);
+        }
+    }
+    // Time-add for group 0.
+    {
+        ggml_tensor * w = require_source_tensor(model, matmul_name(kGroupNames[0].t_linear));
+        ggml_tensor * b = require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks.1.linear.linear.bias");
+        ggml_tensor * w_t = try_pretransposed_weight(model, w);
+        if (!w_t) w_t = ggml_cont(gctx, ggml_transpose(gctx, w));  // fallback: transpose in-graph
+        ggml_tensor * t_proj = ggml_mul_mat(gctx, w_t, ggml_reshape_2d(gctx, inputs.t_emb_in, 64, 1));  // 64 = t_emb_in length
+        t_proj = ggml_add(gctx, t_proj, ggml_reshape_2d(gctx, b, C, 1));
+        cur = ggml_add(gctx, cur, repeat_like(gctx, t_proj, cur));
+    }
+    cur = vector_convnext_ggml(gctx, model,
+        "vector_estimator:tts.ttl.vector_field.main_blocks.2.convnext.0",
+        cur, 1);
+    ggml_tensor * block_pre_attn = cur;
+
+    // Per-group attention block.
+    auto run_group = [&](ggml_tensor * x, int group, ggml_tensor * x_pre_attn) -> ggml_tensor * {
+        const auto & names = kGroupNames[group];
+        const int attn_block = group * 6 + 3;
+        const int post_attn_block = group * 6 + 4;
+        const int style_block = group * 6 + 5;
+
+        // Text attention QKV — output directly in [A, T] (width-major)
+        // layout so the cont(transpose) before rope/flash_attn is gone.
+        // The kernel-as-src0 ordering also dispatches the optimized
+        // kernel_mul_mm_q8_0_f32 when weights are q8_0.
+        ggml_tensor * q_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, x_pre_attn,
+            require_source_tensor(model, matmul_name(names.attn_q)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(attn_block) + ".attn.W_query.linear.bias"));
+        ggml_tensor * k_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.text_in,
+            require_source_tensor(model, matmul_name(names.attn_k)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(attn_block) + ".attn.W_key.linear.bias"));
+        ggml_tensor * v_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.text_in,
+            require_source_tensor(model, matmul_name(names.attn_v)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(attn_block) + ".attn.W_value.linear.bias"));
+
+        q_wt = apply_supertonic_rope_ggml(gctx, q_wt, inputs.pos_q, inputs.freq_factors_q, L, H, D);
+        k_wt = apply_supertonic_rope_ggml(gctx, k_wt, inputs.pos_k, inputs.freq_factors_k, text_len, H, D);
+
+        ggml_tensor * attn_out = append_text_attention_subgraph(gctx, model,
+            q_wt, k_wt, v_wt, L, text_len, H, D,
+            require_source_tensor(model, matmul_name(names.attn_out)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(attn_block) + ".attn.out_fc.linear.bias"),
+            1.0f / 16.0f);  // 1/16 == 1/sqrt(H*D) with H*D = 256
+
+        ggml_tensor * residual = ggml_add(gctx, x_pre_attn, attn_out);
+        ggml_tensor * normed = layer_norm_ggml(gctx, residual,
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(attn_block) + ".norm.norm.weight"),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(attn_block) + ".norm.norm.bias"));
+
+        ggml_tensor * post = vector_convnext_ggml(gctx, model,
+            "vector_estimator:tts.ttl.vector_field.main_blocks." +
+            std::to_string(post_attn_block) + ".convnext.0",
+            normed, 1);
+
+        ggml_tensor * masked_post = ggml_mul(gctx, post, repeat_like(gctx, inputs.mask_in, post));
+
+        // Style attention QKV — output directly in [A, T] layout.
+        ggml_tensor * sq_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, masked_post,
+            require_source_tensor(model, matmul_name(names.style_q)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(style_block) + ".attention.W_query.linear.bias"));
+        ggml_tensor * sk_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.style_kctx_in,
+            require_source_tensor(model, matmul_name(names.style_k)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(style_block) + ".attention.W_key.linear.bias"));
+        sk_wt = ggml_tanh(gctx, sk_wt);  // style keys pass through tanh; no RoPE on the style attention
+        ggml_tensor * sv_wt = dense_matmul_time_wt_pretransposed_ggml(gctx, model, inputs.style_v_raw_in,
+            require_source_tensor(model, matmul_name(names.style_v)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(style_block) + ".attention.W_value.linear.bias"));
+
+        ggml_tensor * style_out = append_text_attention_subgraph(gctx, model,
+            sq_wt, sk_wt, sv_wt, L, kv_style, SH, SD,
+            require_source_tensor(model, matmul_name(names.style_out)),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(style_block) + ".attention.out_fc.linear.bias"),
+            1.0f / 16.0f);  // 1/16 == 1/sqrt(SH*SD) with SH*SD = 256
+
+        ggml_tensor * style_residual = ggml_add(gctx, post, style_out);  // NOTE(review): adds to unmasked `post`, not `masked_post` — confirm
+        ggml_tensor * style_normed = layer_norm_ggml(gctx, style_residual,
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(style_block) + ".norm.norm.weight"),
+            require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                                         std::to_string(style_block) + ".norm.norm.bias"));
+        (void)x;  // `x` is unused: the group's input arrives via `x_pre_attn`
+        return style_normed;
+    };
+
+    // Group prep for groups 1-3.
+    auto group_prep = [&](ggml_tensor * x, int group) -> ggml_tensor * {
+        const int conv_block = group * 6 + 0;
+        const int linear_block = group * 6 + 1;
+        const int post_block = group * 6 + 2;
+        int dils2[4] = {1, 2, 4, 8};
+        ggml_tensor * y = x;
+        if (use_ct_convnext) {
+            ggml_tensor * y_ct = ggml_cont(gctx, ggml_permute(gctx, y, 1, 0, 2, 3));
+            for (int j = 0; j < 4; ++j) {
+                y_ct = vector_convnext_ggml_ct(gctx, model,
+                    "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                    std::to_string(conv_block) + ".convnext." + std::to_string(j),
+                    y_ct, dils2[j]);
+            }
+            y = ggml_cont(gctx, ggml_permute(gctx, y_ct, 1, 0, 2, 3));
+        } else {
+            for (int j = 0; j < 4; ++j) {
+                y = vector_convnext_ggml(gctx, model,
+                    "vector_estimator:tts.ttl.vector_field.main_blocks." +
+                    std::to_string(conv_block) + ".convnext." + std::to_string(j),
+                    y, dils2[j]);
+            }
+        }
+        ggml_tensor * w = require_source_tensor(model, matmul_name(kGroupNames[group].t_linear));
+        ggml_tensor * b = require_source_tensor(model,
+            "vector_estimator:tts.ttl.vector_field.main_blocks." +
+            std::to_string(linear_block) + ".linear.linear.bias");
+        ggml_tensor * w_t = try_pretransposed_weight(model, w);
+        if (!w_t) w_t = ggml_cont(gctx, ggml_transpose(gctx, w));  // fallback: transpose in-graph
+        ggml_tensor * t_proj = ggml_mul_mat(gctx, w_t, ggml_reshape_2d(gctx, inputs.t_emb_in, 64, 1));  // 64 = t_emb_in length
+        t_proj = ggml_add(gctx, t_proj, ggml_reshape_2d(gctx, b, C, 1));
+        y = ggml_add(gctx, y, repeat_like(gctx, t_proj, y));
+        y = vector_convnext_ggml(gctx, model,
+            "vector_estimator:tts.ttl.vector_field.main_blocks." +
+            std::to_string(post_block) + ".convnext.0",
+            y, 1);
+        return y;
+    };
+
+    ggml_tensor * x_after_g0 = run_group(cur, 0, block_pre_attn);
+    ggml_tensor * x_pre_g1 = group_prep(x_after_g0, 1);
+    ggml_tensor * x_after_g1 = run_group(x_after_g0, 1, x_pre_g1);
+    ggml_tensor * x_pre_g2 = group_prep(x_after_g1, 2);
+    ggml_tensor * x_after_g2 = run_group(x_after_g1, 2, x_pre_g2);
+    ggml_tensor * x_pre_g3 = group_prep(x_after_g2, 3);
+    ggml_tensor * x_after_g3 = run_group(x_after_g2, 3, x_pre_g3);
+
+    // Tail: last_convnext × 4 + proj_out + mask + noise add.
+    ggml_tensor * tail = x_after_g3;
+    if (use_ct_convnext) {
+        ggml_tensor * tail_ct = ggml_cont(gctx, ggml_permute(gctx, tail, 1, 0, 2, 3));
+        for (int j = 0; j < 4; ++j) {
+            tail_ct = vector_convnext_ggml_ct(gctx, model,
+                "vector_estimator:tts.ttl.vector_field.last_convnext.convnext." + std::to_string(j),
+                tail_ct, 1);
+        }
+        tail = ggml_cont(gctx, ggml_permute(gctx, tail_ct, 1, 0, 2, 3));
+    } else {
+        for (int j = 0; j < 4; ++j) {
+            tail = vector_convnext_ggml(gctx, model,
+                "vector_estimator:tts.ttl.vector_field.last_convnext.convnext." + std::to_string(j),
+                tail, 1);
+        }
+    }
+    ggml_tensor * velocity = conv1d_f32(gctx,
+        require_source_tensor(model, "vector_estimator:tts.ttl.vector_field.proj_out.net.weight"),
+        tail, 1, 0, 1);
+    ggml_tensor * masked_velocity = ggml_mul(gctx, velocity, repeat_like(gctx, inputs.mask_in, velocity));
+    ggml_tensor * scaled = ggml_scale(gctx, masked_velocity, 1.0f / (float)total_steps);  // total_steps == 0 would give inf
+    ggml_tensor * next = ggml_add(gctx, inputs.noise_in, scaled);
+
+    // `gf` is intentionally unused here: nothing is added to a graph until the
+    // caller runs ggml_build_forward_expand on the returned tensor.
+    (void)gf;
+    return next;
+}
+
+
+// Compute one CFM denoising step as ONE ggml graph.  Used only when the
+// model's backend isn't CPU (Metal / CUDA / Vulkan / OpenCL).  Replaces the
+// ~21 sub-graph dispatches the trace_proj orchestrator emits with a single
+// ggml_backend_graph_compute call.
+bool supertonic_vector_step_one_graph_ggml(const supertonic_model & model,
+                                            const float * noisy_latent,
+                                            int latent_len,
+                                            const float * text_emb,
+                                            int text_len,
+                                            const float * style_ttl,
+                                            const float * latent_mask,
+                                            int current_step,
+                                            int total_steps,
+                                            std::vector<float> & next_latent_out,
+                                            std::string * error) {
+    // The outer entry point sets `supertonic_op_dispatch_scope`; this
+    // function is only called on non-CPU backends, so the thread-local
+    // `supertonic_use_cpu_custom_ops()` reads false inside the helpers.
+    try {
+        const int L = latent_len;
+        const int Cin = model.hparams.latent_channels;  // typically 16
+        const int C = 512;
+        const int text_C = 256;
+        const int H = 4;        // text-attention heads
+        const int D = 64;       // text-attention head_dim
+        const int A = H * D;    // 256 = attention width
+        const int SH = 2;       // style-attention heads
+        const int SD = 128;     // style-attention head_dim
+        const int kv_style = 50; // style attention kv length (fixed by /Expand_output_0)
+
+        thread_local vector_step_one_graph_cache cache;
+        const bool need_rebuild = cache.model != &model ||
+                                  cache.generation_id != model.generation_id ||
+                                  cache.L != L ||
+                                  cache.text_len != text_len ||
+                                  cache.total_steps != total_steps;
+        if (need_rebuild) {
+            free_vector_step_one_graph_cache(cache);
+            cache.model = &model;
+            cache.generation_id = model.generation_id;
+            cache.L = L;
+            cache.text_len = text_len;
+            cache.total_steps = total_steps;
+
+            // Memory budget for the consolidated graph.  The original
+            // sub-graphs each used 128-512 nodes; the full per-step graph is
+            // roughly the sum (4 groups x ~700 ops/group + tail + front).
+            // Round up generously.
+            constexpr int MAX_NODES = 8192;
+            const size_t buf_size = ggml_tensor_overhead() * MAX_NODES +
+                                     ggml_graph_overhead_custom(MAX_NODES, false);
+            cache.buf.assign(buf_size, 0);
+            ggml_init_params p = { buf_size, cache.buf.data(), true };
+            cache.ctx = ggml_init(p);
+            cache.gf = ggml_new_graph_custom(cache.ctx, MAX_NODES, false);
+
+            // --- Per-call inputs ---
+            cache.x_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, Cin);
+            ggml_set_name(cache.x_in, "step_x_in"); ggml_set_input(cache.x_in);
+            cache.mask_in = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, L);
+            ggml_set_name(cache.mask_in, "step_mask"); ggml_set_input(cache.mask_in);
+            cache.t_emb_in = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, 64);
+            ggml_set_name(cache.t_emb_in, "step_temb"); ggml_set_input(cache.t_emb_in);
+            cache.text_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, text_len, text_C);
+            ggml_set_name(cache.text_in, "step_text_in"); ggml_set_input(cache.text_in);
+            cache.style_v_raw_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C);
+            ggml_set_name(cache.style_v_raw_in, "step_style_v"); ggml_set_input(cache.style_v_raw_in);
+            cache.style_kctx_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C);
+            ggml_set_name(cache.style_kctx_in, "step_style_kctx"); ggml_set_input(cache.style_kctx_in);
+            cache.noise_in = ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, Cin);
+            ggml_set_name(cache.noise_in, "step_noise_in"); ggml_set_input(cache.noise_in);
+
+            // --- RoPE inputs ---
+            cache.pos_q = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, L);
+            ggml_set_name(cache.pos_q, "step_pos_q"); ggml_set_input(cache.pos_q);
+            cache.pos_k = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, text_len);
+            ggml_set_name(cache.pos_k, "step_pos_k"); ggml_set_input(cache.pos_k);
+            cache.freq_factors_q = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2);
+            ggml_set_name(cache.freq_factors_q, "step_ff_q"); ggml_set_input(cache.freq_factors_q);
+            cache.freq_factors_k = ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2);
+            ggml_set_name(cache.freq_factors_k, "step_ff_k"); ggml_set_input(cache.freq_factors_k);
+
+            ggml_context * gctx = cache.ctx;
+            ggml_cgraph * gf = cache.gf;
+
+            vector_step_inputs inputs;
+            inputs.x_in           = cache.x_in;
+            inputs.mask_in        = cache.mask_in;
+            inputs.t_emb_in       = cache.t_emb_in;
+            inputs.text_in        = cache.text_in;
+            inputs.style_v_raw_in = cache.style_v_raw_in;
+            inputs.style_kctx_in  = cache.style_kctx_in;
+            inputs.noise_in       = cache.noise_in;
+            inputs.pos_q          = cache.pos_q;
+            inputs.pos_k          = cache.pos_k;
+            inputs.freq_factors_q = cache.freq_factors_q;
+            inputs.freq_factors_k = cache.freq_factors_k;
+
+            ggml_tensor * next = append_supertonic_vector_step_subgraph(
+                gctx, gf, model, inputs, L, text_len, total_steps);
+
+            ggml_set_name(next, "step_next_latent");
+            ggml_set_output(next);
+            ggml_build_forward_expand(gf, next);
+            cache.next_latent_out = next;
+
+
+            // Allocate.
+            cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+            if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector step one-graph failed");
+            if (!ggml_gallocr_reserve(cache.allocr, gf)) {
+                throw std::runtime_error("ggml_gallocr_reserve vector step one-graph failed");
+            }
+            ggml_gallocr_alloc_graph(cache.allocr, gf);
+        }
+
+        // ===== Per-call inputs =====
+        // The existing trace_proj_ggml at lines 2143/2151 sets these tensors
+        // DIRECTLY from the caller-provided channel-major buffers (no host
+        // transpose), and the views downstream interpret memory accordingly.
+        // Copy that pattern exactly — my earlier transpose loops were a bug
+        // (correlation 0.003 vs CPU reference; root-caused 2026-05-11).
+        ggml_backend_tensor_set(cache.x_in, noisy_latent, 0, (size_t)L * Cin * sizeof(float));
+        ggml_backend_tensor_set(cache.noise_in, noisy_latent, 0, (size_t)L * Cin * sizeof(float));
+        ggml_backend_tensor_set(cache.mask_in, latent_mask, 0, (size_t)L * sizeof(float));
+
+        std::vector<float> te_host = time_embedding(model, current_step, total_steps);
+        ggml_backend_tensor_set(cache.t_emb_in, te_host.data(), 0, te_host.size() * sizeof(float));
+
+        // text_emb is in (C=256, text_len) channel-major; the tensor has
+        // ne=[text_len, 256] which puts t_len fast in memory.  Same raw layout,
+        // so direct memcpy (matches trace_proj_ggml).
+        ggml_backend_tensor_set(cache.text_in, text_emb, 0, (size_t)text_len * 256 * sizeof(float));
+
+        // Style inputs (cached host buffers from existing helper).
+        const std::vector<float> * style_v_raw_ptr = nullptr;
+        const std::vector<float> * kctx_raw_ptr = nullptr;
+        cached_style_layouts(model, style_ttl, style_v_raw_ptr, kctx_raw_ptr);
+        ggml_backend_tensor_set(cache.style_v_raw_in, style_v_raw_ptr->data(), 0, style_v_raw_ptr->size() * sizeof(float));
+        ggml_backend_tensor_set(cache.style_kctx_in, kctx_raw_ptr->data(), 0, kctx_raw_ptr->size() * sizeof(float));
+
+        // RoPE positions + freq_factors.  theta is loaded from the model and
+        // depends on L (sequence length); recompute per call.
+        {
+            std::vector<int32_t> pos_q_host(L);
+            for (int i = 0; i < L; ++i) pos_q_host[i] = i;
+            ggml_backend_tensor_set(cache.pos_q, pos_q_host.data(), 0, pos_q_host.size() * sizeof(int32_t));
+            std::vector<int32_t> pos_k_host(text_len);
+            for (int i = 0; i < text_len; ++i) pos_k_host[i] = i;
+            ggml_backend_tensor_set(cache.pos_k, pos_k_host.data(), 0, pos_k_host.size() * sizeof(int32_t));
+
+            const int half = 32;  // D/2 = 64/2
+            f32_tensor theta = read_f32(model, "vector_estimator:tts.ttl.vector_field.main_blocks.3.attn.theta");
+            if ((int)theta.data.size() < half) {
+                throw std::runtime_error("theta tensor has fewer than D/2 elements");
+            }
+            std::vector<float> ff_q(half), ff_k(half);
+            for (int d = 0; d < half; ++d) {
+                ff_q[d] = (float)L / theta.data[d];
+                ff_k[d] = (float)text_len / theta.data[d];
+            }
+            ggml_backend_tensor_set(cache.freq_factors_q, ff_q.data(), 0, ff_q.size() * sizeof(float));
+            ggml_backend_tensor_set(cache.freq_factors_k, ff_k.data(), 0, ff_k.size() * sizeof(float));
+        }
+
+        // ===== ONE compute call =====
+        supertonic_graph_compute(model, cache.gf);
+
+        // ===== Read output =====
+        // The output tensor has ne=[L, Cin] with element (i=t, j=c) at offset
+        // c*L+t — exactly the (c, t) channel-major layout the caller expects.
+        // Direct memcpy, no transpose.
+        next_latent_out.assign((size_t)Cin * L, 0.0f);
+        ggml_backend_tensor_get(cache.next_latent_out, next_latent_out.data(), 0,
+                                 (size_t)Cin * L * sizeof(float));
+        if (error) error->clear();
+        return true;
+    } catch (const std::exception & e) {
+        if (error) *error = e.what();
+        return false;
+    }
+}
+
+// =====================================================================
+// Phase A1+A2 — single-graph CFM loop
+// =====================================================================
+//
+// Unroll all `total_steps` CFM denoising steps into ONE ggml_cgraph and
+// dispatch with a single ggml_backend_graph_compute call.  Each step's
+// `x_in` and `noise_in` are both the previous step's output node (no host
+// round-trip), and only `t_emb_in` differs per step (N inputs, one
+// per CFM step).  Replaces the engine's `for (step ...) {
+// supertonic_vector_step_ggml(...) }` loop on non-CPU backends.
+//
+// CPU keeps the per-step path because its cblas fastpaths benefit from
+// the cache-per-shape boundary and the host-side rope/style helpers in
+// trace_proj_ggml expect to see per-step outputs.
+
+struct vector_loop_one_graph_cache {
+    // Cache key: the graph is rebuilt when any of these differs from the request.
+    const supertonic_model * model{nullptr};
+    uint64_t generation_id{0};
+    int L{0};
+    int text_len{0};
+    int total_steps{0};
+    // Graph storage: backing buffer handed to ggml_init, context, graph, allocator.
+    std::vector<uint8_t> buf;
+    ggml_context * ctx{nullptr};
+    ggml_cgraph * gf{nullptr};
+    ggml_gallocr_t allocr{nullptr};
+
+    // Inputs uploaded once per call and shared by every unrolled CFM step.
+    ggml_tensor * x0_in{nullptr};          // ne=[L, Cin]  initial noisy latent
+    ggml_tensor * mask_in{nullptr};        // ne=[L]
+    ggml_tensor * text_in{nullptr};        // ne=[text_len, 256]
+    ggml_tensor * style_v_raw_in{nullptr}; // ne=[50, 256]
+    ggml_tensor * style_kctx_in{nullptr};  // ne=[50, 256]
+
+    // RoPE positions and frequency factors (also constant across steps).
+    ggml_tensor * pos_q{nullptr};
+    ggml_tensor * pos_k{nullptr};
+    ggml_tensor * freq_factors_q{nullptr};
+    ggml_tensor * freq_factors_k{nullptr};
+
+    // One time-embedding input tensor per unrolled CFM step.
+    std::vector<ggml_tensor *> t_emb_in;
+    // Output node of the last unrolled step (its `next` tensor).
+    ggml_tensor * final_latent_out{nullptr};
+};
+
+// Releases the allocator and ggml context owned by `cache` and resets every field.
+// NOTE(review): the generation id is read through cache.model (not cache.generation_id),
+// which presumably requires the cached model to still be alive here — confirm.
+void free_vector_loop_one_graph_cache(vector_loop_one_graph_cache & cache) {
+    if (cache.allocr != nullptr) {
+        const auto owner_generation = cache.model ? cache.model->generation_id : 0;
+        supertonic_safe_gallocr_free(cache.allocr, owner_generation);
+    }
+    if (cache.ctx != nullptr) ggml_free(cache.ctx);
+    cache.allocr = nullptr;
+    cache.ctx = nullptr;
+    cache.gf = nullptr;
+    cache.model = nullptr;
+    cache.generation_id = 0;
+    cache.L = cache.text_len = cache.total_steps = 0;
+    cache.buf.clear();
+    cache.t_emb_in.clear();
+    cache.x0_in = cache.mask_in = cache.text_in = cache.style_v_raw_in = cache.style_kctx_in = nullptr;
+    cache.pos_q = cache.pos_k = nullptr;
+    cache.freq_factors_q = cache.freq_factors_k = nullptr;
+    cache.final_latent_out = nullptr;
+}
+
+// Builds the unrolled `total_steps`-step CFM graph into `cache` (expected to
+// be freed/empty) and allocates backend memory for it.  Throws
+// std::runtime_error on allocator failure; the caller frees a partial build.
+static void build_vector_loop_one_graph_cache(vector_loop_one_graph_cache & cache,
+                                              const supertonic_model & model,
+                                              int L, int text_len, int total_steps) {
+    const int Cin = model.hparams.latent_channels;
+    const int text_C = 256;
+    const int D = 64;
+    const int kv_style = 50;
+    cache.model = &model;
+    cache.generation_id = model.generation_id;
+    cache.L = L;
+    cache.text_len = text_len;
+    cache.total_steps = total_steps;
+    // Node budget: each per-step build registered ~1056 ggml nodes
+    // pre-Tier-2 (~928 post-Tier-2), so 8192 per step is generous; the extra
+    // 256 covers the shared inputs and the per-step temb input tensors.
+    const int MAX_NODES = 8192 * std::max(1, total_steps) + 256;
+    const size_t buf_size = ggml_tensor_overhead() * (size_t) MAX_NODES +
+                             ggml_graph_overhead_custom(MAX_NODES, false);
+    cache.buf.assign(buf_size, 0);
+    ggml_init_params p = { buf_size, cache.buf.data(), true };
+    cache.ctx = ggml_init(p);
+    cache.gf = ggml_new_graph_custom(cache.ctx, MAX_NODES, false);
+
+    // --- Shared inputs (tensor creation order is unchanged) ---
+    const auto as_input = [](ggml_tensor * t, const char * name) {
+        ggml_set_name(t, name);
+        ggml_set_input(t);
+        return t;
+    };
+    cache.x0_in = as_input(ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, L, Cin), "loop_x0_in");
+    cache.mask_in = as_input(ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, L), "loop_mask");
+    cache.text_in = as_input(ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, text_len, text_C), "loop_text_in");
+    cache.style_v_raw_in = as_input(ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C), "loop_style_v");
+    cache.style_kctx_in = as_input(ggml_new_tensor_2d(cache.ctx, GGML_TYPE_F32, kv_style, text_C), "loop_style_kctx");
+    cache.pos_q = as_input(ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, L), "loop_pos_q");
+    cache.pos_k = as_input(ggml_new_tensor_1d(cache.ctx, GGML_TYPE_I32, text_len), "loop_pos_k");
+    cache.freq_factors_q = as_input(ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2), "loop_ff_q");
+    cache.freq_factors_k = as_input(ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, D / 2), "loop_ff_k");
+    cache.t_emb_in.resize(total_steps, nullptr);
+    for (int s = 0; s < total_steps; ++s) {
+        const std::string name = "loop_temb_" + std::to_string(s);
+        cache.t_emb_in[s] = as_input(ggml_new_tensor_1d(cache.ctx, GGML_TYPE_F32, 64), name.c_str());
+    }
+
+    // --- Chain N CFM steps together ---
+    ggml_tensor * cur_latent = cache.x0_in;
+    for (int s = 0; s < total_steps; ++s) {
+        vector_step_inputs inputs;
+        inputs.x_in           = cur_latent;       // previous step's output
+        inputs.mask_in        = cache.mask_in;
+        inputs.t_emb_in       = cache.t_emb_in[s];
+        inputs.text_in        = cache.text_in;
+        inputs.style_v_raw_in = cache.style_v_raw_in;
+        inputs.style_kctx_in  = cache.style_kctx_in;
+        inputs.noise_in       = cur_latent;       // CFM: next = noise_in + v/N
+        inputs.pos_q          = cache.pos_q;
+        inputs.pos_k          = cache.pos_k;
+        inputs.freq_factors_q = cache.freq_factors_q;
+        inputs.freq_factors_k = cache.freq_factors_k;
+        ggml_tensor * next = append_supertonic_vector_step_subgraph(
+            cache.ctx, cache.gf, model, inputs, L, text_len, total_steps);
+        const std::string step_name = "loop_next_" + std::to_string(s);
+        ggml_set_name(next, step_name.c_str());
+        cur_latent = next;
+    }
+    ggml_set_output(cur_latent);
+    ggml_build_forward_expand(cache.gf, cur_latent);
+    cache.final_latent_out = cur_latent;
+
+    cache.allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    if (!cache.allocr) throw std::runtime_error("ggml_gallocr_new vector loop one-graph failed");
+    if (!ggml_gallocr_reserve(cache.allocr, cache.gf)) {
+        throw std::runtime_error("ggml_gallocr_reserve vector loop one-graph failed");
+    }
+    if (!ggml_gallocr_alloc_graph(cache.allocr, cache.gf)) {
+        throw std::runtime_error("ggml_gallocr_alloc_graph vector loop one-graph failed");
+    }
+}
+
+// Runs all `total_steps` CFM steps as one cached, unrolled ggml graph and
+// copies the last step's latent into `final_latent_out`.  Returns false and
+// stores the message in `*error` (when non-null) on any std::exception.
+bool supertonic_vector_loop_one_graph_ggml(const supertonic_model & model,
+                                            const float * initial_noisy_latent,
+                                            int latent_len,
+                                            const float * text_emb,
+                                            int text_len,
+                                            const float * style_ttl,
+                                            const float * latent_mask,
+                                            int total_steps,
+                                            std::vector<float> & final_latent_out,
+                                            std::string * error) {
+    // Public entry point — set the thread-local dispatch flag so the
+    // helpers' `supertonic_use_cpu_custom_ops()` reads consistently
+    // (false on non-CPU backends, true on CPU + accelerate/cblas).
+    supertonic_op_dispatch_scope dispatch(model);
+    try {
+        const int L = latent_len;
+        const int Cin = model.hparams.latent_channels;
+        const int text_C = 256;
+        const int half = 32;  // D/2 = 64/2
+        // Report bad arguments as an error instead of crashing inside ggml.
+        if (!initial_noisy_latent || !text_emb || !latent_mask || L <= 0 || text_len <= 0) {
+            throw std::runtime_error("vector loop one-graph: null input or non-positive length");
+        }
+
+        thread_local vector_loop_one_graph_cache cache;
+        const bool need_rebuild = cache.model != &model ||
+                                  cache.generation_id != model.generation_id ||
+                                  cache.L != L ||
+                                  cache.text_len != text_len ||
+                                  cache.total_steps != total_steps;
+        if (need_rebuild) {
+            free_vector_loop_one_graph_cache(cache);
+            try {
+                build_vector_loop_one_graph_cache(cache, model, L, text_len, total_steps);
+            } catch (...) {
+                // Drop the half-built cache: its key fields already match this request,
+                // so the next call would otherwise skip the rebuild and use null tensors.
+                free_vector_loop_one_graph_cache(cache);
+                throw;
+            }
+        }
+
+        // --- Per-call inputs (constants across CFM steps) ---
+        ggml_backend_tensor_set(cache.x0_in, initial_noisy_latent, 0, (size_t) L * Cin * sizeof(float));
+        ggml_backend_tensor_set(cache.mask_in, latent_mask, 0, (size_t) L * sizeof(float));
+        ggml_backend_tensor_set(cache.text_in, text_emb, 0, (size_t) text_len * text_C * sizeof(float));
+        const std::vector<float> * style_v_raw_ptr = nullptr;
+        const std::vector<float> * kctx_raw_ptr = nullptr;
+        cached_style_layouts(model, style_ttl, style_v_raw_ptr, kctx_raw_ptr);
+        ggml_backend_tensor_set(cache.style_v_raw_in, style_v_raw_ptr->data(), 0, style_v_raw_ptr->size() * sizeof(float));
+        ggml_backend_tensor_set(cache.style_kctx_in, kctx_raw_ptr->data(), 0, kctx_raw_ptr->size() * sizeof(float));
+
+        // RoPE positions are 0..n-1 for both q and k, so one host buffer serves both uploads.
+        std::vector<int32_t> pos_host(std::max(L, text_len));
+        for (int i = 0; i < (int) pos_host.size(); ++i) pos_host[i] = i;
+        ggml_backend_tensor_set(cache.pos_q, pos_host.data(), 0, (size_t) L * sizeof(int32_t));
+        ggml_backend_tensor_set(cache.pos_k, pos_host.data(), 0, (size_t) text_len * sizeof(int32_t));
+        f32_tensor theta = read_f32(model, "vector_estimator:tts.ttl.vector_field.main_blocks.3.attn.theta");
+        if ((int) theta.data.size() < half) {
+            throw std::runtime_error("theta tensor has fewer than D/2 elements");
+        }
+        std::vector<float> ff_q(half), ff_k(half);
+        for (int d = 0; d < half; ++d) {
+            ff_q[d] = (float) L / theta.data[d];
+            ff_k[d] = (float) text_len / theta.data[d];
+        }
+        ggml_backend_tensor_set(cache.freq_factors_q, ff_q.data(), 0, ff_q.size() * sizeof(float));
+        ggml_backend_tensor_set(cache.freq_factors_k, ff_k.data(), 0, ff_k.size() * sizeof(float));
+
+        // --- Per-step time embeddings ---
+        for (int s = 0; s < total_steps; ++s) {
+            std::vector<float> te = time_embedding(model, s, total_steps);
+            ggml_backend_tensor_set(cache.t_emb_in[s], te.data(), 0, te.size() * sizeof(float));
+        }
+
+        // --- ONE compute call for ALL CFM steps, then read the final latent ---
+        supertonic_graph_compute(model, cache.gf);
+        final_latent_out.assign((size_t) Cin * L, 0.0f);
+        ggml_backend_tensor_get(cache.final_latent_out, final_latent_out.data(), 0, (size_t) Cin * L * sizeof(float));
+        if (error) error->clear();
+        return true;
+    } catch (const std::exception & e) {
+        if (error) *error = e.what();
+        return false;
+    }
+}
+
+// Driver for the whole CFM loop.  Non-CPU backends take the unrolled
+// single-graph path; CPU backends — or any backend when the
+// SUPERTONIC_DISABLE_LOOP_GRAPH environment variable is set (to any
+// value) — call `supertonic_vector_step_ggml` once per step, which
+// allows an A/B comparison of the two paths on the same backend.
+bool supertonic_vector_loop_ggml(const supertonic_model & model,
+                                  const float * initial_noisy_latent,
+                                  int latent_len,
+                                  const float * text_emb,
+                                  int text_len,
+                                  const float * style_ttl,
+                                  const float * latent_mask,
+                                  int total_steps,
+                                  std::vector<float> & final_latent_out,
+                                  std::string * error) {
+    const char * loop_graph_off = std::getenv("SUPERTONIC_DISABLE_LOOP_GRAPH");
+    const bool take_per_step_path = loop_graph_off != nullptr || model_prefers_cpu_kernels(model);
+    if (!take_per_step_path) {
+        return supertonic_vector_loop_one_graph_ggml(
+            model, initial_noisy_latent, latent_len, text_emb, text_len,
+            style_ttl, latent_mask, total_steps, final_latent_out, error);
+    }
+
+    // Per-step path: ping-pong between two host buffers, one step per call.
+    try {
+        const size_t latent_count = (size_t) model.hparams.latent_channels * latent_len;
+        std::vector<float> current(latent_count);
+        std::memcpy(current.data(), initial_noisy_latent, current.size() * sizeof(float));
+        std::vector<float> produced;
+        for (int step = 0; step < total_steps; ++step) {
+            const bool ok = supertonic_vector_step_ggml(
+                model, current.data(), latent_len, text_emb, text_len,
+                style_ttl, latent_mask, step, total_steps, produced, error);
+            if (!ok) return false;
+            current.swap(produced);
+        }
+        final_latent_out = std::move(current);
+        if (error) error->clear();
+        return true;
+    } catch (const std::exception & failure) {
+        if (error) *error = failure.what();
+        return false;
+    }
+}
+
 bool supertonic_vector_step_ggml(const supertonic_model & model,
                                  const float * noisy_latent,
                                  int latent_len,
@@ -3162,6 +4391,19 @@ bool supertonic_vector_step_ggml(const supertonic_model & model,
                                  std::vector<float> & next_latent_out,
                                  std::string * error) {
     supertonic_op_dispatch_scope dispatch(model);
+    // Metal / CUDA / Vulkan / OpenCL: use the consolidated one-graph path
+    // (one ggml_backend_graph_compute call per CFM step instead of ~21).
+    // CPU: keep the multi-cache trace_proj path — its CPU fast-paths and
+    // thread_local sub-graph caches stay competitive on CPU and trace mode
+    // relies on the per-stage outputs.  Set SUPERTONIC_DISABLE_ONE_GRAPH=1
+    // to fall back to the multi-cache path on GPU backends if needed.
+    const bool disable_one_graph = std::getenv("SUPERTONIC_DISABLE_ONE_GRAPH") != nullptr;
+    if (!disable_one_graph && !model_prefers_cpu_kernels(model)) {
+        return supertonic_vector_step_one_graph_ggml(model, noisy_latent, latent_len,
+                                                      text_emb, text_len, style_ttl,
+                                                      latent_mask, current_step,
+                                                      total_steps, next_latent_out, error);
+    }
     try {
         std::vector<supertonic_trace_tensor> scalar_trace;
         std::vector<supertonic_trace_tensor> ggml_trace;
diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp
index fe6ffbf80d2..daf32f5ad11 100644
--- a/tts-cpp/src/supertonic_vocoder.cpp
+++ b/tts-cpp/src/supertonic_vocoder.cpp
@@ -88,11 +88,33 @@ ggml_tensor * repeat_like(ggml_context * ctx, ggml_tensor * v, ggml_tensor * lik
             std::to_string(like->ne[0]) + "," + std::to_string(like->ne[1]) + "," +
             std::to_string(like->ne[2]) + "," + std::to_string(like->ne[3]) + "]");
     }
-    return ggml_repeat(ctx, v, like);
+    // Every caller feeds the return value straight into ggml_add / ggml_mul,
+    // both of which broadcast natively in ggml.  Skip the explicit
+    // ggml_repeat node so the downstream op handles the broadcast — saves a
+    // kernel_repeat launch per call on Metal.
+    static const bool force_explicit_repeat =
+        std::getenv("SUPERTONIC_FORCE_EXPLICIT_REPEAT") != nullptr;
+    if (force_explicit_repeat) {
+        return ggml_repeat(ctx, v, like);
+    }
+    return v;
 }
 
 ggml_tensor * causal_replicate_pad_1d(ggml_context * ctx, ggml_tensor * x, int pad_left) {
     if (pad_left <= 0) return x;
+    // Prefer the fused supertonic_edge_pad_1d op when available (Metal
+    // via the overlay port + CPU via the parity backstop) — collapses
+    // the view + repeat_4d + concat triplet into a single dispatch.
+    // Override with SUPERTONIC_DISABLE_FUSED_EDGE_PAD=1 to A/B against
+    // the stock-ops chain.
+    static const bool disable_fused_edge_pad =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_EDGE_PAD") != nullptr;
+    if (!disable_fused_edge_pad &&
+        x->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        ggml_is_contiguous(x)) {
+        return ggml_supertonic_edge_pad_1d(ctx, x, pad_left, 0);
+    }
     const int64_t C = x->ne[1];
     ggml_tensor * first = ggml_view_2d(ctx, x, 1, C, x->nb[1], 0);
     ggml_tensor * rep = ggml_repeat_4d(ctx, first, pad_left, C, 1, 1);
@@ -340,6 +362,15 @@ ggml_tensor * layer_norm_channel_ggml(ggml_context * ctx,
                                       ggml_tensor * gamma,
                                       ggml_tensor * beta,
                                       float eps = 1e-6f) {
+    static const bool disable_fused_layer_norm =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_LAYER_NORM") != nullptr;
+    if (!disable_fused_layer_norm &&
+        x->type == GGML_TYPE_F32 && gamma->type == GGML_TYPE_F32 && beta->type == GGML_TYPE_F32 &&
+        x->ne[2] == 1 && x->ne[3] == 1 &&
+        gamma->ne[0] == x->ne[1] && beta->ne[0] == x->ne[1] &&
+        ggml_is_contiguous(x) && ggml_is_contiguous(gamma) && ggml_is_contiguous(beta)) {
+        return ggml_supertonic_layer_norm_channel(ctx, x, gamma, beta, eps);
+    }
     ggml_tensor * y = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
     y = ggml_norm(ctx, y, eps);
     y = ggml_mul(ctx, y, repeat_like(ctx, gamma, y));
@@ -352,29 +383,128 @@ ggml_tensor * convnext_block_ggml(ggml_context * ctx,
                                   ggml_tensor * x,
                                   int idx) {
     static const int dilations[10] = {1, 2, 4, 1, 2, 4, 1, 1, 1, 1};
-    // Audit follow-up #6 (F7) — fused LN + pw1 + gelu + pw2 + γ +
-    // residual.  The fused helper keeps the layer-norm output in
-    // `[C, T0]` (channel-major) memory and lowers both K=1 pointwise
-    // convs to direct `ggml_mul_mat` against that layout, eliminating
-    // the LN back-permute/cont and both im2col copies the previous
-    // chain paid (audit cost: ~16.8 MiB / vocoder pass).  The
-    // depthwise op stays in this TU so the CBLAS custom-op fast
-    // path is unaffected.  Trace + pipeline parity preserved — the
-    // fused helper computes the same arithmetic in the same order,
-    // just on a different (compatible) intermediate layout.  See
-    // `supertonic_internal.h::convnext_block_fused_ggml` for the
-    // op-by-op rationale and
-    // `test/test_supertonic_convnext_block_fused.cpp` for the
-    // parity test.
+    const bool use_cpu_custom = supertonic_use_cpu_custom_ops();
     ggml_tensor * dw = depthwise_conv1d_causal_ggml(ctx, x, w.dw_w, w.dw_b, dilations[idx]);
-    return convnext_block_fused_ggml(
-        ctx,
-        /*residual=*/x,
-        /*dw_out=*/dw,
-        w.norm_g, w.norm_b,
-        w.pw1_w, w.pw1_b,
-        w.pw2_w, w.pw2_b,
-        w.gamma);
+    if (use_cpu_custom) {
+        // Audit follow-up #6 (F7) — fused LN + pw1 + gelu + pw2 + γ +
+        // residual.  The fused helper keeps the layer-norm output in
+        // `[C, T0]` (channel-major) memory and lowers both K=1 pointwise
+        // convs to direct `ggml_mul_mat` against that layout, eliminating
+        // the LN back-permute/cont and both im2col copies the previous
+        // chain paid (audit cost: ~16.8 MiB / vocoder pass).  The
+        // depthwise op stays in this TU so the CBLAS custom-op fast
+        // path is unaffected.  Trace + pipeline parity preserved — the
+        // fused helper computes the same arithmetic in the same order,
+        // just on a different (compatible) intermediate layout.  See
+        // `supertonic_internal.h::convnext_block_fused_ggml` for the
+        // op-by-op rationale and
+        // `test/test_supertonic_convnext_block_fused.cpp` for the
+        // parity test.
+        return convnext_block_fused_ggml(
+            ctx,
+            /*residual=*/x,
+            /*dw_out=*/dw,
+            w.norm_g, w.norm_b,
+            w.pw1_w, w.pw1_b,
+            w.pw2_w, w.pw2_b,
+            w.gamma);
+    }
+    // Metal / non-CPU backend path: keep the granular chain so the
+    // per-op Metal fused-kernel fast paths inside the helpers (layer
+    // norm, bias+gelu, ...) get a chance to fire.  GGML_OP_CUSTOM is
+    // rejected on GPU backends so the F7 fused helper above isn't
+    // usable here regardless.
+    ggml_tensor * residual = x;
+    ggml_tensor * y = dw;
+    y = layer_norm_channel_ggml(ctx, y, w.norm_g, w.norm_b);
+    // pw1 + bias + GELU.  On Metal we drop the bias from conv1d_causal_ggml
+    // and feed the pre-bias matmul output to the fused bias_gelu op (one
+    // dispatch instead of two: ggml_add + gelu_erf).  CPU keeps its existing
+    // cblas+bias_inside path — the standard library erff in the unfused
+    // chain is already the cheapest there.
+    static const bool disable_fused_bias_gelu =
+        std::getenv("SUPERTONIC_DISABLE_FUSED_BIAS_GELU") != nullptr;
+    if (!disable_fused_bias_gelu &&
+        y->type == GGML_TYPE_F32 && w.pw1_w->type == GGML_TYPE_F32 &&
+        w.pw1_b->type == GGML_TYPE_F32) {
+        y = conv1d_causal_ggml(ctx, y, w.pw1_w, /*b=*/nullptr);
+        if (y->ne[2] == 1 && y->ne[3] == 1 &&
+            w.pw1_b->ne[0] == y->ne[1] &&
+            ggml_is_contiguous(y) && ggml_is_contiguous(w.pw1_b)) {
+            y = ggml_supertonic_bias_gelu(ctx, y, w.pw1_b);
+        } else {
+            y = ggml_add(ctx, y, repeat_like(ctx, w.pw1_b, y));
+            y = ggml_gelu_erf(ctx, y);
+        }
+    } else {
+        y = conv1d_causal_ggml(ctx, y, w.pw1_w, w.pw1_b);
+        y = ggml_gelu_erf(ctx, y);
+    }
+    // NOTE: the vector_estimator's `ggml_supertonic_pw2_residual` op
+    // expects `gamma` to be `[C]` (per-channel scale); the vocoder
+    // however stores `gamma` as a `[1]` scalar (single learnable
+    // scale per ConvNeXt block).  The shapes are incompatible, so we
+    // keep the unfused chain here.  A vocoder-specific fused op with
+    // scalar gamma is possible but the win would be tiny (~10
+    // dispatches × ~40μs = 0.4 ms).
+    y = conv1d_causal_ggml(ctx, y, w.pw2_w, w.pw2_b);
+    y = ggml_mul(ctx, y, repeat_like(ctx, w.gamma, y));
+    return ggml_add(ctx, residual, y);
+}
+
+// Pointwise (w->ne[0] == 1) projection of `x_ct` as one ggml_mul_mat, plus optional bias `b`.
+ggml_tensor * pointwise_matmul_ct_voc(ggml_context * ctx,
+                                      ggml_tensor * x_ct,
+                                      ggml_tensor * w,
+                                      ggml_tensor * b) {
+    GGML_ASSERT(w->ne[0] == 1);
+    GGML_ASSERT(w->ne[1] == x_ct->ne[0]);
+    GGML_ASSERT(ggml_is_contiguous(w));
+    ggml_tensor * weights_2d = ggml_reshape_2d(ctx, w, w->ne[1], w->ne[2]);
+    ggml_tensor * acts_2d = ggml_reshape_2d(ctx, x_ct, x_ct->ne[0], x_ct->ne[1]);
+    ggml_tensor * out = ggml_mul_mat(ctx, weights_2d, acts_2d);
+    return b ? ggml_add(ctx, out, repeat_like(ctx, b, out)) : out;
+}
+
+// Phase B2 follow-up: vocoder ConvNeXt block on `[C, T]` activations
+// end-to-end.  Takes `[C, T]` input and returns `[C, T]` — the caller
+// wraps the 10-block chain in a single `[T, C] -> [C, T]` permute at
+// entry and a single `[C, T] -> [T, C]` permute at exit, so this
+// block has zero intra-block permutes.  `idx` must be in [0, 10): it
+// indexes the dilation table and is checked with GGML_ASSERT.
+//
+// Vocoder ConvNeXt differs from vector_estimator's: (1) depthwise is
+// **causal** (left-only pad) rather than symmetric edge-clamp — handled
+// by the `_causal_ct` variant of the fused depthwise kernel (port-v14).
+// (2) `gamma` is a scalar `[1]`, not per-channel, so the `pw2_residual_ct`
+// fused op doesn't fit — unfused scalar `mul + add` tail.  (3) `norm_g` /
+// `norm_b` ship as `[1, C]` (same flatten-needed quirk as vector_estimator's
+// `.gamma`).
+//
+// `SUPERTONIC_DISABLE_CT_VOCODER=1` makes the caller use `convnext_block_ggml`.
+ggml_tensor * convnext_block_ggml_ct(ggml_context * ctx,
+                                     const supertonic_vocoder_convnext_weights & w,
+                                     ggml_tensor * x_ct,
+                                     int idx) {
+    static const int dilations[10] = {1, 2, 4, 1, 2, 4, 1, 1, 1, 1};
+    GGML_ASSERT(idx >= 0 && idx < 10);  // guards the dilations[idx] lookup below
+    ggml_tensor * residual = x_ct;
+
+    auto flatten_1d = [&](ggml_tensor * t) -> ggml_tensor * {
+        const int64_t n = ggml_nelements(t);
+        if (t->ne[0] == n && t->ne[1] == 1 && t->ne[2] == 1 && t->ne[3] == 1) return t;
+        return ggml_reshape_1d(ctx, t, n);
+    };
+    ggml_tensor * y_ct = ggml_supertonic_depthwise_1d_causal_ct(ctx, x_ct,
+        w.dw_w, flatten_1d(w.dw_b), dilations[idx]);
+    y_ct = ggml_supertonic_layer_norm_channel_ct(ctx, y_ct,
+        flatten_1d(w.norm_g), flatten_1d(w.norm_b), 1e-6f);
+    y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw1_w, /*b=*/nullptr);
+    y_ct = ggml_supertonic_bias_gelu_ct(ctx, y_ct, flatten_1d(w.pw1_b));
+    y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw2_w, flatten_1d(w.pw2_b));
+    // Scalar gamma multiply (broadcasts in any layout).
+    y_ct = ggml_mul(ctx, y_ct, repeat_like(ctx, w.gamma, y_ct));
+    return ggml_add(ctx, residual, y_ct);
 }
 
 struct vocoder_graph_cache {
@@ -415,6 +545,10 @@ void free_vocoder_cache(vocoder_graph_cache & cache) {
 void build_supertonic_vocoder_cache(vocoder_graph_cache & cache,
                                     const supertonic_model & model,
                                     int latent_len) {
+    // `supertonic_op_dispatch_scope` is set by the outer
+    // `supertonic_vocoder_forward_ggml` entry point; inside graph builders
+    // we read the thread-local flag directly.
+    const bool use_cpu_custom = supertonic_use_cpu_custom_ops();
     free_vocoder_cache(cache);
     cache.model = &model;
     cache.generation_id = model.generation_id;
@@ -470,9 +604,28 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache,
 
     x = conv1d_causal_ggml(cache.ctx, x, model.vocoder.embed_w, model.vocoder.embed_b);
     ggml_set_name(x, "vocoder_embed");
-    for (int i = 0; i < 10; ++i) {
-        x = convnext_block_ggml(cache.ctx, model.vocoder.convnext[(size_t) i], x, i);
-        ggml_set_name(x, ("vocoder_convnext_" + std::to_string(i)).c_str());
+    // Phase B2 follow-up: route the 10-block ConvNeXt chain through the
+    // `[C, T]` variant on Metal.  Each block runs depthwise (causal_ct) +
+    // layer_norm + pw1 + bias_gelu + pw2 + scalar gamma + residual add
+    // entirely on `[C, T]` — no intra-block permutes.  The single
+    // `[T, C] -> [C, T]` permute happens once before the chain and the
+    // single reverse permute once after.  Override:
+    // SUPERTONIC_DISABLE_CT_VOCODER=1.
+    static const bool disable_ct_vocoder =
+        std::getenv("SUPERTONIC_DISABLE_CT_VOCODER") != nullptr;
+    const bool use_ct_vocoder = !disable_ct_vocoder && !use_cpu_custom;
+    if (use_ct_vocoder) {
+        ggml_tensor * x_ct = ggml_cont(cache.ctx, ggml_permute(cache.ctx, x, 1, 0, 2, 3));
+        for (int i = 0; i < 10; ++i) {
+            x_ct = convnext_block_ggml_ct(cache.ctx, model.vocoder.convnext[(size_t) i], x_ct, i);
+            ggml_set_name(x_ct, ("vocoder_convnext_" + std::to_string(i)).c_str());
+        }
+        x = ggml_cont(cache.ctx, ggml_permute(cache.ctx, x_ct, 1, 0, 2, 3));
+    } else {
+        for (int i = 0; i < 10; ++i) {
+            x = convnext_block_ggml(cache.ctx, model.vocoder.convnext[(size_t) i], x, i);
+            ggml_set_name(x, ("vocoder_convnext_" + std::to_string(i)).c_str());
+        }
     }
 
     // F2: reference the pre-baked weight tensors directly instead