tetherto · GustavoA1604 · May 13, 2026 · May 13, 2026 · May 13, 2026
@@ -1,5 +1,8 @@
 # Vendored ggml (cloned separately at setup time; see README)
-ggml/
+/ggml/
+# (We DO commit cmake/vcpkg-overlay-ports/ggml/ — it's the QVAC ggml port
+# overlay carrying our Supertonic custom-op patches.  The `/ggml/` above is
+# anchored to the tts-cpp root only.)
 
 # Build artifacts
 build/

@@ -115,23 +115,23 @@ if (NOT TARGET ggml)
         endif()
         add_library(ggml ALIAS ggml::ggml)
     else()
-        # In-tree subtree of qvac-ext-lib-whisper.cpp: the standalone
-        # patches/ folder + scripts/setup-ggml.sh tooling is intentionally
-        # absent here.  Without them, an add_subdirectory(ggml) build
-        # would silently miss the ggml-backend-reg-filename-prefix patch
-        # that GGML_BACKEND_DL_PROJECT_PREFIX="speech-" depends on, so
-        # libspeech-ggml-*.so files would exist on disk but the runtime
-        # loader would still search for libggml-*.so under
-        # GGML_BACKEND_DL=ON.  Reject up front with a pointer at the
-        # right consumption path.
-        if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/patches")
+        # Bundled-ggml dev build path (TTS_CPP_USE_SYSTEM_GGML=OFF).
+        # Expects `tts-cpp/ggml/` to be a checkout of the
+        # tetherto/qvac-ext-ggml repo on the `speech` branch — the QVAC
+        # fork carrying every infrastructure patch + the Supertonic 2
+        # fused custom op family as commits (not as a patches/ overlay).
+        #
+        # Run `bash tts-cpp/scripts/setup-ggml.sh` first to clone +
+        # check out the pinned commit.  No patches/ directory is
+        # consulted: the speech branch is already pre-patched at the
+        # commit level.
+        if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ggml/CMakeLists.txt")
             message(FATAL_ERROR
-                "tts-cpp: this in-tree subtree does not ship the patches/ "
-                "directory.  Pass -DTTS_CPP_USE_SYSTEM_GGML=ON to consume "
-                "the QVAC speech-stack `ggml-speech` vcpkg port (which "
-                "carries the pre-applied patches), or use the standalone "
-                "github.com/gianni-cor/chatterbox.cpp repo for a "
-                "bundled-ggml dev build with patches/ present.")
+                "tts-cpp: bundled-ggml build requires tts-cpp/ggml/ to be "
+                "a checkout of tetherto/qvac-ext-ggml@speech.  Run "
+                "`bash tts-cpp/scripts/setup-ggml.sh` first, or pass "
+                "-DTTS_CPP_USE_SYSTEM_GGML=ON to consume the QVAC "
+                "speech-stack `ggml-speech` vcpkg port.")
         endif()
         add_subdirectory(ggml)
     endif()

@@ -338,28 +338,38 @@ target_link_libraries(my_app PRIVATE tts-cpp::tts-cpp)
 ```
 
 For development out of this in-tree subtree (running the parity
-harnesses, prototyping API changes, etc.) the canonical build is:
+harnesses, prototyping API changes, etc.) the canonical build is the
+**bundled-ggml dev flow**:
+
+```bash
+bash tts-cpp/scripts/setup-ggml.sh    # clones qvac-ext-ggml@speech into tts-cpp/ggml/
+cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \
+  -DTTS_CPP_USE_SYSTEM_GGML=OFF
+cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)
+```
+
+`setup-ggml.sh` checks out the pinned tetherto/qvac-ext-ggml@speech
+commit (which already carries every QVAC infrastructure patch + the
+Supertonic 2 fused custom op family — no `patches/` overlay needed).
+The `add_subdirectory(ggml)` path in `CMakeLists.txt` then consumes it
+directly with `GGML_NATIVE=ON` for native ARM/SIMD codegen — typically
+~10% faster on M-series than the vcpkg-port flavor's portable build.
+
+Downstream production builds use the system-installed `ggml` instead:
 
 ```bash
-# Install the speech-stack ggml port via vcpkg first; then:
 cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_TOOLCHAIN_FILE=<vcpkg_root>/scripts/buildsystems/vcpkg.cmake
 cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)
 ```
 
-`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` here so the build picks
-up the patched ggml from vcpkg automatically; flipping it `OFF` in
-this subtree is rejected at configure time (no `patches/` to apply).
-GPU acceleration is selected at the ggml-port level - the
-`ggml-speech` port already carries the Metal / Vulkan / OpenCL
-backend support its consumers ask for; pass `--n-gpu-layers 99` at
-runtime to actually use the compiled GPU backend.
-
-If you need a bundled-ggml dev build (`add_subdirectory(ggml)` with
-patches applied locally rather than coming from vcpkg), use the
-standalone [`chatterbox.cpp`](https://github.com/gianni-cor/chatterbox.cpp)
-repo - the source-of-truth this subtree was copied from - which keeps
-`scripts/setup-ggml.sh` + `patches/` for that flow.
+`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` for this flow, finding
+the `ggml-speech` port from qvac-registry-vcpkg (which pulls
+qvac-ext-ggml@speech with patches as commits).  GPU acceleration is
+selected at the ggml-port level — the port already carries the
+Metal / Vulkan / OpenCL backend support its consumers ask for; pass
+`--n-gpu-layers 99` at runtime to actually use the compiled GPU
+backend.
 
 ### Useful CMake options
 

@@ -14,7 +14,15 @@
 //
 //     EngineOptions opts;
 //     opts.model_gguf_path = "models/supertonic.gguf";
-//     opts.n_gpu_layers    = 0;                      // CPU only today
+//     opts.n_gpu_layers    = 0;                      // 0 = CPU; >0 enables Metal
+//                                                    // on macOS / CUDA / Vulkan /
+//                                                    // OpenCL when compiled in.
+//                                                    // Metal on Apple silicon is the
+//                                                    // fastest backend as of 2026-05-12
+//                                                    // (~35× realtime on M2, beats
+//                                                    // ggml-CPU, ONNX-CPU and ONNX-CoreML
+//                                                    // on every stage that matters).
+//                                                    // See PROGRESS_SUPERTONIC.md.
 //
 //     Engine engine(opts);
 //     for (const auto & line : lines) {
@@ -43,6 +51,26 @@
 
 namespace tts_cpp::supertonic {
 
+// Compute precision for matmul weights inside the model buffer.  Selects
+// how the GGUF's stored q8_0 weights are loaded into the resident model:
+//   - F32  (default): expand q8_0 to f32 at load time.  CPU path uses
+//          cblas/AMX f32 matmul.  Metal path uses kernel_mul_mat_f32_f32.
+//          Highest accuracy + simplest, but on Metal misses the 4×
+//          weight-bandwidth win of running the native q8_0 matmul kernel.
+//   - F16  (Phase B1): expand q8_0 to f16 at load time, run f16 matmul
+//          with f32 accumulator.  ~2× less activation bandwidth on Metal,
+//          may drift slightly across the 5 CFM steps (parity tolerance
+//          relaxed to ~1e-2 L_inf).
+//   - Q8_0 (Phase A3): keep weights as q8_0 in the model buffer, let
+//          ggml's quantized matmul kernels dispatch directly.  Metal-only
+//          (Phase A3 makes the load logic asymmetric: q8_0 on Metal, f32
+//          on CPU).
+enum class Precision {
+    F32,
+    F16,
+    Q8_0,
+};
+
 struct EngineOptions {
     // Required.
     std::string model_gguf_path;
@@ -56,6 +84,11 @@ struct EngineOptions {
     int   n_threads     = 0;
     int   n_gpu_layers  = 0;
 
+    // Compute precision for matmul weights — see Precision enum above.
+    // Default F32 is the current behaviour (load q8_0 GGUF, expand to f32).
+    // F16 / Q8_0 are non-default GPU paths (Metal-validated).
+    Precision precision = Precision::F32;
+
     // F16 K/V flash-attention in the vector estimator.  When -1, the
     // engine auto-enables this on GPU backends (non-CPU) and disables
     // it on CPU; pass 1 / 0 to force the setting regardless of the
@@ -72,6 +105,9 @@ struct EngineOptions {
     // Halves the GPU read bandwidth into those ops with a small
     // (≤ 2e-3 abs / 5e-3 cosine) numerical drift on the end-to-end
     // synth.  Mirrors chatterbox's CHATTERBOX_F16_CFM gate.
+    // Orthogonal to `precision`: this is a per-op runtime selector for
+    // the OpenCL hot-weight materialisation, while `precision` decides
+    // the storage type of all matmul weights uniformly.
     int f16_weights = -1;
 
     // Optional path to a .npy file containing the initial noise tensor of

@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+#
+# setup-ggml.sh — clone the qvac-ext-ggml@speech branch into tts-cpp/ggml/
+#
+# The bundled-ggml dev build path for tts-cpp out of this in-tree subtree.
+# Replaces the vcpkg-port consumption when you want a fast iteration loop
+# without going through vcpkg installs.
+#
+# Pinned to a fixed commit of the `speech` branch (GGML_REF below) of
+# tetherto/qvac-ext-ggml, a fork of ggml-org/ggml carrying all QVAC
+# infrastructure patches + the Supertonic 2 fused custom op family
+# pre-applied as commits — no patches/ directory needed at this layer.
+#
+# Usage:
+#   bash tts-cpp/scripts/setup-ggml.sh
+#   cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF
+#   cmake --build tts-cpp/build -j
+#
+# To update to a newer pin: bump GGML_REF below and re-run.  The script
+# is idempotent — re-running checks out the right ref into the existing
+# tts-cpp/ggml/ clone without re-cloning.
+
+set -euo pipefail
+
+GGML_REPO_URL="https://github.com/tetherto/qvac-ext-ggml.git"
+GGML_REF="60a172e48f699bd0a00575ef911feed9473b2187"   # merge of qvac-ext-ggml#8 (speech HEAD)
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TTS_CPP_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+GGML_DIR="${TTS_CPP_DIR}/ggml"
+
+# Reuse an existing clone; otherwise replace whatever sits at ${GGML_DIR} with a fresh shallow clone.
+if [ ! -d "${GGML_DIR}/.git" ]; then
+    echo "setup-ggml: cloning qvac-ext-ggml @ ${GGML_REF:0:10} into ${GGML_DIR}"
+    rm -rf "${GGML_DIR}"
+    git clone --depth 1 --no-tags "${GGML_REPO_URL}" "${GGML_DIR}"
+else
+    echo "setup-ggml: existing clone at ${GGML_DIR} — fetching + checking out pin ${GGML_REF:0:10}"
+fi
+# Both paths then fetch exactly the pinned commit and detach HEAD onto it.
+git -C "${GGML_DIR}" fetch --depth 1 origin "${GGML_REF}"
+git -C "${GGML_DIR}" checkout --detach "${GGML_REF}"
+
+echo "setup-ggml: tts-cpp/ggml/ ready at $(git -C "${GGML_DIR}" rev-parse --short HEAD)"
+echo "setup-ggml: next: cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF"
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+# Multi-precision parity + bench harness for Supertonic 2.
+#
+# For each supported precision (f32, f16, q8_0):
+#   1. Synthesizes a reference WAV on CPU at that precision.
+#   2. Synthesizes the same WAV on Metal at the same precision.
+#   3. Reports parity (corr, L_inf, RMS) between the two.
+#   4. Optionally runs supertonic-bench at the same precision and emits
+#      a per-precision JSON artifact alongside.
+#
+# Usage:
+#   bash scripts/validate-precision-parity.sh [--bench] [--text TEXT] [--model PATH]
+#                                             [--precisions f32,f16,q8_0]
+#
+# Precisions not yet wired through the graph builders fail at load with
+# a clear "scaffolded but not yet supported" message and are skipped (not
+# counted as a parity failure).  This lets the harness be useful right
+# now while Phase A3 / B1 work lands.
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+MODEL="$ROOT/models/supertonic2.gguf"
+TEXT="The quick brown fox jumps over the lazy dog."
+PRECISIONS="f32,f16,q8_0"
+DO_BENCH=0
+RUNS=10
+WARMUP=2
+THREADS=4
+ARTIFACT_DIR="$ROOT/artifacts/bench/parity-matrix"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --bench)       DO_BENCH=1; shift ;;
+        --text)        TEXT="$2"; shift 2 ;;
+        --model)       MODEL="$2"; shift 2 ;;
+        --precisions)  PRECISIONS="$2"; shift 2 ;;
+        --runs)        RUNS="$2"; shift 2 ;;
+        --warmup)      WARMUP="$2"; shift 2 ;;
+        --threads)     THREADS="$2"; shift 2 ;;
+        --artifact-dir) ARTIFACT_DIR="$2"; shift 2 ;;
+        -h|--help)
+            sed -n '2,/^set -euo/p' "$0" | sed 's/^# //; s/^#//; /^set -euo/d'
+            exit 0 ;;
+        *) echo "unknown arg: $1" >&2; exit 2 ;;
+    esac
+done
+
+CLI="$ROOT/build/supertonic-cli"
+BENCH="$ROOT/build/supertonic-bench"
+PY="$ROOT/.venv/bin/python3"
+if [[ ! -x "$CLI" ]]; then
+    echo "build/supertonic-cli not found. Run 'cmake --build build --target supertonic-cli' first." >&2
+    exit 1
+fi
+if [[ "$DO_BENCH" -eq 1 && ! -x "$BENCH" ]]; then
+    echo "--bench requested but build/supertonic-bench not found." >&2
+    exit 1
+fi
+if [[ ! -x "$PY" ]]; then  # PY is fixed to $ROOT/.venv/bin/python3, so an activated venv elsewhere is never used
+    echo "$PY not found. Create it with 'python3 -m venv $ROOT/.venv' and install numpy into it." >&2
+    exit 1
+fi
+
+mkdir -p "$ARTIFACT_DIR"
+TMP="$(mktemp -d)"
+trap 'rm -rf "$TMP"' EXIT
+
+printf "\nSupertonic 2 multi-precision parity + bench harness\n"
+printf "  model:      %s\n" "$MODEL"
+printf "  text:       %.60s%s\n" "$TEXT" "$([[ ${#TEXT} -gt 60 ]] && echo '...')"
+printf "  precisions: %s\n" "$PRECISIONS"
+printf "  bench:      %s\n\n" "$([[ "$DO_BENCH" -eq 1 ]] && echo 'yes' || echo 'no')"
+
+OVERALL_RC=0
+IFS=',' read -r -a PREC_ARR <<< "$PRECISIONS"
+for P in "${PREC_ARR[@]}"; do
+    P_TRIM="$(echo "$P" | xargs)"
+    CPU_WAV="$TMP/cpu-$P_TRIM.wav"
+    MTL_WAV="$TMP/mtl-$P_TRIM.wav"
+
+    printf "=== %s ===\n" "$P_TRIM"
+
+    set +e
+    CPU_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 0 \
+                       --precision "$P_TRIM" --out "$CPU_WAV" 2>&1)"
+    CPU_RC=$?
+    MTL_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 1 \
+                       --precision "$P_TRIM" --out "$MTL_WAV" 2>&1)"
+    MTL_RC=$?
+    set -e
+
+    if grep -qE "scaffolded but not yet|partially scaffolded" <<< "$CPU_LOG$MTL_LOG"; then  # here-string: no `echo |` SIGPIPE turning a match into a failure under pipefail
+        printf "  SKIP: precision %s not yet wired through graph builders (Phase A3/B1)\n\n" "$P_TRIM"
+        continue
+    fi
+    # Tolerate the harmless post-write atexit `GGML_ASSERT([rsets->data count] == 0)`
+    # that fires on Metal cleanup AFTER the WAV is fully written.  Treat the run as
+    # successful iff the WAV file exists and is at least 1 KB (covers a synthesized
+    # signal, well above an empty/header-only file).
+    cpu_ok=1; mtl_ok=1
+    [[ -s "$CPU_WAV" ]] || cpu_ok=0
+    [[ -s "$MTL_WAV" ]] || mtl_ok=0
+    if [[ -f "$CPU_WAV" ]]; then
+        size=$(wc -c < "$CPU_WAV")
+        [[ $size -lt 1024 ]] && cpu_ok=0
+    fi
+    if [[ -f "$MTL_WAV" ]]; then
+        size=$(wc -c < "$MTL_WAV")
+        [[ $size -lt 1024 ]] && mtl_ok=0
+    fi
+    if [[ $cpu_ok -eq 0 || $mtl_ok -eq 0 ]]; then
+        printf "  FAIL: synthesis errored.  cpu_rc=%d mtl_rc=%d  wav_ok cpu=%d mtl=%d\n" \
+               "$CPU_RC" "$MTL_RC" "$cpu_ok" "$mtl_ok"
+        printf "  --- cpu tail ---\n%s\n  --- metal tail ---\n%s\n\n" \
+               "$(echo "$CPU_LOG" | tail -3)" "$(echo "$MTL_LOG" | tail -3)"
+        OVERALL_RC=1
+        continue
+    fi
+
+    PY_RC=0; "$PY" - "$CPU_WAV" "$MTL_WAV" "$P_TRIM" <<'PY' || PY_RC=$?
+import wave, numpy as np, sys
+def load(p):
+    with wave.open(p, 'rb') as w:
+        return np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16).astype(np.float32) / 32768.0
+a = load(sys.argv[1])
+b = load(sys.argv[2])
+n = min(len(a), len(b))
+a, b = a[:n], b[:n]
+corr = float(np.corrcoef(a, b)[0, 1])
+linf = float(np.max(np.abs(a - b)))
+rms  = float(np.sqrt(np.mean((a - b) ** 2)))
+# Per-precision tolerance: numbers chosen against observed CPU↔Metal drift
+# on the benchmark text "The quick brown fox jumps over the lazy dog.".
+# Short text routinely gets L_inf ≈ 1.7e-3; long text accumulates more
+# float-order drift across 5 CFM steps × more attention positions, landing
+# around L_inf ≈ 3.7e-2 with corr ≥ 0.998 — audibly identical for f32.
+# Q8_0 has additional drift from the dequant→transpose→requantize round-trip
+# in the asymmetric load path (Metal keeps q8_0, CPU expands to f32, so the
+# two paths use slightly differently-quantized weights).  Audibly identical.
+tol_corr = {"f32": 0.998,  "f16": 0.99,  "q8_0": 0.96}.get(sys.argv[3], 0.99)
+tol_linf = {"f32": 0.05,   "f16": 0.10,  "q8_0": 0.15 }.get(sys.argv[3], 0.10)
+print(f"  corr={corr:.6f} (tol >= {tol_corr})  L_inf={linf:.6f} (tol <= {tol_linf})  RMS={rms:.6f}")
+ok = corr >= tol_corr and linf <= tol_linf
+print("  PASS" if ok else "  FAIL parity")
+sys.exit(0 if ok else 1)
+PY
+    # PY_RC is captured by `|| PY_RC=$?` above: under `set -e` a bare failing command would abort the script before the remaining precisions and the summary run.
+    if [[ $PY_RC -ne 0 ]]; then OVERALL_RC=1; fi
+
+    if [[ "$DO_BENCH" -eq 1 ]]; then
+        JSON="$ARTIFACT_DIR/supertonic-mtl-${P_TRIM}.json"
+        printf "  bench --> %s\n" "$JSON"
+        "$BENCH" --model "$MODEL" --text "$TEXT" \
+                  --voice M1 --language en --steps 5 --speed 1.05 --seed 42 \
+                  --runs "$RUNS" --warmup "$WARMUP" --threads "$THREADS" \
+                  --n-gpu-layers 1 --precision "$P_TRIM" \
+                  --json-out "$JSON" 2>&1 | grep -E '^\s*(vector_estimator|vocoder|text_encoder|total|RTF|Real-time)' || true
+    fi
+    printf "\n"
+done
+
+if [[ $OVERALL_RC -eq 0 ]]; then
+    printf "All wired-up precisions pass parity.\n"
+else
+    printf "One or more precisions failed parity (or errored).\n" >&2
+fi
+exit $OVERALL_RC