Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tts-cpp/.gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Vendored ggml (cloned separately at setup time; see README)
ggml/
/ggml/
# (We DO commit cmake/vcpkg-overlay-ports/ggml/ — it's the QVAC ggml port
# overlay carrying our Supertonic custom-op patches. The `/ggml/` above is
# anchored to the tts-cpp root only.)

# Build artifacts
build/
Expand Down
32 changes: 16 additions & 16 deletions tts-cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,23 +115,23 @@ if (NOT TARGET ggml)
endif()
add_library(ggml ALIAS ggml::ggml)
else()
# In-tree subtree of qvac-ext-lib-whisper.cpp: the standalone
# patches/ folder + scripts/setup-ggml.sh tooling is intentionally
# absent here. Without them, an add_subdirectory(ggml) build
# would silently miss the ggml-backend-reg-filename-prefix patch
# that GGML_BACKEND_DL_PROJECT_PREFIX="speech-" depends on, so
# libspeech-ggml-*.so files would exist on disk but the runtime
# loader would still search for libggml-*.so under
# GGML_BACKEND_DL=ON. Reject up front with a pointer at the
# right consumption path.
if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/patches")
# Bundled-ggml dev build path (TTS_CPP_USE_SYSTEM_GGML=OFF).
# Expects `tts-cpp/ggml/` to be a checkout of the
# tetherto/qvac-ext-ggml repo on the `speech` branch — the QVAC
# fork carrying every infrastructure patch + the Supertonic 2
# fused custom op family as commits (not as a patches/ overlay).
#
# Run `bash tts-cpp/scripts/setup-ggml.sh` first to clone +
# check out the pinned commit. No patches/ directory is
# consulted: the speech branch is already pre-patched at the
# commit level.
if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ggml/CMakeLists.txt")
message(FATAL_ERROR
"tts-cpp: this in-tree subtree does not ship the patches/ "
"directory. Pass -DTTS_CPP_USE_SYSTEM_GGML=ON to consume "
"the QVAC speech-stack `ggml-speech` vcpkg port (which "
"carries the pre-applied patches), or use the standalone "
"github.com/gianni-cor/chatterbox.cpp repo for a "
"bundled-ggml dev build with patches/ present.")
"tts-cpp: bundled-ggml build requires tts-cpp/ggml/ to be "
"a checkout of tetherto/qvac-ext-ggml@speech. Run "
"`bash tts-cpp/scripts/setup-ggml.sh` first, or pass "
"-DTTS_CPP_USE_SYSTEM_GGML=ON to consume the QVAC "
"speech-stack `ggml-speech` vcpkg port.")
endif()
add_subdirectory(ggml)
endif()
Expand Down
899 changes: 899 additions & 0 deletions tts-cpp/PROGRESS_SUPERTONIC.md

Large diffs are not rendered by default.

40 changes: 25 additions & 15 deletions tts-cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -338,28 +338,38 @@ target_link_libraries(my_app PRIVATE tts-cpp::tts-cpp)
```

For development out of this in-tree subtree (running the parity
harnesses, prototyping API changes, etc.) the canonical build is:
harnesses, prototyping API changes, etc.) the canonical build is the
**bundled-ggml dev flow**:

```bash
bash tts-cpp/scripts/setup-ggml.sh # clones qvac-ext-ggml@speech into tts-cpp/ggml/
cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \
-DTTS_CPP_USE_SYSTEM_GGML=OFF
cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)
```

`setup-ggml.sh` checks out the pinned tetherto/qvac-ext-ggml@speech
commit (which already carries every QVAC infrastructure patch + the
Supertonic 2 fused custom op family — no `patches/` overlay needed).
The `add_subdirectory(ggml)` path in `CMakeLists.txt` then consumes it
directly with `GGML_NATIVE=ON` for native ARM/SIMD codegen — typically
~10% faster on M-series than the vcpkg-port flavor's portable build.

Downstream production builds use the system-installed `ggml` instead:

```bash
# Install the speech-stack ggml port via vcpkg first; then:
cmake -S tts-cpp -B tts-cpp/build -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<vcpkg_root>/scripts/buildsystems/vcpkg.cmake
cmake --build tts-cpp/build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)
```

`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` here so the build picks
up the patched ggml from vcpkg automatically; flipping it `OFF` in
this subtree is rejected at configure time (no `patches/` to apply).
GPU acceleration is selected at the ggml-port level - the
`ggml-speech` port already carries the Metal / Vulkan / OpenCL
backend support its consumers ask for; pass `--n-gpu-layers 99` at
runtime to actually use the compiled GPU backend.

If you need a bundled-ggml dev build (`add_subdirectory(ggml)` with
patches applied locally rather than coming from vcpkg), use the
standalone [`chatterbox.cpp`](https://github.com/gianni-cor/chatterbox.cpp)
repo - the source-of-truth this subtree was copied from - which keeps
`scripts/setup-ggml.sh` + `patches/` for that flow.
`TTS_CPP_USE_SYSTEM_GGML` defaults to `ON` for this flow, finding
the `ggml-speech` port from qvac-registry-vcpkg (which pulls
qvac-ext-ggml@speech with patches as commits). GPU acceleration is
selected at the ggml-port level — the port already carries the
Metal / Vulkan / OpenCL backend support its consumers ask for; pass
`--n-gpu-layers 99` at runtime to actually use the compiled GPU
backend.

### Useful CMake options

Expand Down
38 changes: 37 additions & 1 deletion tts-cpp/include/tts-cpp/supertonic/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,15 @@
//
// EngineOptions opts;
// opts.model_gguf_path = "models/supertonic.gguf";
// opts.n_gpu_layers = 0; // CPU only today
// opts.n_gpu_layers = 0; // 0 = CPU; >0 enables a GPU backend
// // when one is compiled in (Metal on
// // macOS, or CUDA / Vulkan / OpenCL).
// // Metal on Apple silicon is the
// // fastest backend as of 2026-05-12
// // (~35× realtime on M2; beats
// // ggml-CPU, ONNX-CPU and ONNX-CoreML
// // on every stage that matters).
// // See PROGRESS_SUPERTONIC.md.
//
// Engine engine(opts);
// for (const auto & line : lines) {
Expand Down Expand Up @@ -43,6 +51,26 @@

namespace tts_cpp::supertonic {

// Compute precision for matmul weights inside the model buffer. Selects
// how the GGUF's stored q8_0 weights are loaded into the resident model:
// - F32 (default): expand q8_0 to f32 at load time. CPU path uses
// cblas/AMX f32 matmul. Metal path uses kernel_mul_mat_f32_f32.
// Highest accuracy + simplest, but on Metal misses the 4×
// weight-bandwidth win of running the native q8_0 matmul kernel.
// - F16 (Phase B1): expand q8_0 to f16 at load time, run f16 matmul
// with f32 accumulator. ~2× less activation bandwidth on Metal,
// may drift slightly across the 5 CFM steps (parity tolerance
// relaxed to ~1e-2 L_inf).
// - Q8_0 (Phase A3): keep weights as q8_0 in the model buffer, let
// ggml's quantized matmul kernels dispatch directly. Metal-only
// (Phase A3 makes the load logic asymmetric: q8_0 on Metal, f32
// on CPU).
enum class Precision {
// Default: q8_0 weights are expanded to f32 at load time.
F32,
// Phase B1: q8_0 weights are expanded to f16 at load time; matmul runs
// in f16 with an f32 accumulator.
F16,
// Phase A3: weights stay q8_0 in the model buffer (Metal-only; the CPU
// path still loads f32).
Q8_0,
};

struct EngineOptions {
// Required.
std::string model_gguf_path;
Expand All @@ -56,6 +84,11 @@ struct EngineOptions {
int n_threads = 0;
int n_gpu_layers = 0;

// Compute precision for matmul weights — see Precision enum above.
// Default F32 is the current behaviour (load q8_0 GGUF, expand to f32).
// F16 / Q8_0 are non-default GPU paths (Metal-validated).
Precision precision = Precision::F32;

// F16 K/V flash-attention in the vector estimator. When -1, the
// engine auto-enables this on GPU backends (non-CPU) and disables
// it on CPU; pass 1 / 0 to force the setting regardless of the
Expand All @@ -72,6 +105,9 @@ struct EngineOptions {
// Halves the GPU read bandwidth into those ops with a small
// (≤ 2e-3 abs / 5e-3 cosine) numerical drift on the end-to-end
// synth. Mirrors chatterbox's CHATTERBOX_F16_CFM gate.
// Orthogonal to `precision`: this is a per-op runtime selector for
// the OpenCL hot-weight materialisation, while `precision` decides
// the storage type of all matmul weights uniformly.
int f16_weights = -1;

// Optional path to a .npy file containing the initial noise tensor of
Expand Down
45 changes: 45 additions & 0 deletions tts-cpp/scripts/setup-ggml.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# setup-ggml.sh — clone the qvac-ext-ggml@speech branch into tts-cpp/ggml/
#
# The bundled-ggml dev build path for tts-cpp out of this in-tree subtree.
# Replaces the vcpkg-port consumption when you want a fast iteration loop
# without going through vcpkg installs.
#
# Pinned to the head of the `speech` branch (a tetherto/qvac-ext-ggml fork
# of ggml-org/ggml carrying all QVAC infrastructure patches + the
# Supertonic 2 fused custom op family pre-applied as commits — no
# patches/ directory needed at this layer).
#
# Usage:
#   bash tts-cpp/scripts/setup-ggml.sh
#   cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF
#   cmake --build tts-cpp/build -j
#
# To update to a newer pin: bump GGML_REF below and re-run. The script
# is idempotent — re-running checks out the right ref into the existing
# tts-cpp/ggml/ clone without re-cloning.
#
# Only the pinned commit is ever downloaded (shallow fetch by SHA), for
# both a first-time setup and a re-run.

set -euo pipefail

GGML_REPO_URL="https://github.com/tetherto/qvac-ext-ggml.git"
GGML_REF="60a172e48f699bd0a00575ef911feed9473b2187"  # merge of qvac-ext-ggml#8 (speech HEAD)

# Resolve tts-cpp/ relative to this script so the caller's cwd is irrelevant.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TTS_CPP_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
GGML_DIR="${TTS_CPP_DIR}/ggml"

# `-e`, not `-d`: in a worktree or submodule checkout `.git` is a regular
# file pointing at the real git dir, and such a checkout must be reused
# rather than fall through to the `rm -rf` below.
if [ -e "${GGML_DIR}/.git" ]; then
    echo "setup-ggml: existing clone at ${GGML_DIR} — fetching + checking out pin ${GGML_REF:0:10}"
    # Re-point origin at the QVAC fork so the pinned SHA is fetchable even
    # when the existing clone was created from a different remote.
    git -C "${GGML_DIR}" remote set-url origin "${GGML_REPO_URL}" 2>/dev/null \
        || git -C "${GGML_DIR}" remote add origin "${GGML_REPO_URL}"
else
    echo "setup-ggml: cloning qvac-ext-ggml @ ${GGML_REF:0:10} into ${GGML_DIR}"
    # Anything left here is not a git checkout (e.g. a stale partial copy).
    rm -rf "${GGML_DIR}"
    # init + fetch-by-SHA instead of `git clone`: avoids downloading the
    # default-branch tip only to fetch the pinned commit right after.
    git init --quiet "${GGML_DIR}"
    git -C "${GGML_DIR}" remote add origin "${GGML_REPO_URL}"
fi

# Shared by both paths: shallow-fetch exactly the pinned commit, then detach
# HEAD onto it.
git -C "${GGML_DIR}" fetch --depth 1 --no-tags origin "${GGML_REF}"
git -C "${GGML_DIR}" checkout --detach "${GGML_REF}"

echo "setup-ggml: tts-cpp/ggml/ ready at $(git -C "${GGML_DIR}" rev-parse --short HEAD)"
echo "setup-ggml: next: cmake -S tts-cpp -B tts-cpp/build -DTTS_CPP_USE_SYSTEM_GGML=OFF"
168 changes: 168 additions & 0 deletions tts-cpp/scripts/validate-precision-parity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/usr/bin/env bash
# Multi-precision parity + bench harness for Supertonic 2.
#
# For each supported precision (f32, f16, q8_0):
#   1. Synthesizes a reference WAV on CPU at that precision.
#   2. Synthesizes the same WAV on Metal at the same precision.
#   3. Reports parity (corr, L_inf, RMS) between the two.
#   4. Optionally runs supertonic-bench at the same precision and emits
#      a per-precision JSON artifact alongside.
#
# Usage:
#   bash scripts/validate-precision-parity.sh [--bench] [--text TEXT] [--model PATH]
#                                             [--precisions f32,f16,q8_0]
#                                             [--runs N] [--warmup N] [--threads N]
#                                             [--artifact-dir DIR]
#
# Precisions not yet wired through the graph builders fail at load with
# a clear "scaffolded but not yet supported" message and are skipped (not
# counted as a parity failure). This lets the harness be useful right
# now while Phase A3 / B1 work lands.
#
# Exit status: 0 when every non-skipped precision passes parity; 1 when a
# precision fails parity or synthesis, or a prerequisite binary is missing;
# 2 on a command-line usage error.

set -euo pipefail

ROOT="$(cd "$(dirname "$0")/.." && pwd)"
MODEL="$ROOT/models/supertonic2.gguf"
TEXT="The quick brown fox jumps over the lazy dog."
PRECISIONS="f32,f16,q8_0"
DO_BENCH=0
RUNS=10
WARMUP=2
THREADS=4
ARTIFACT_DIR="$ROOT/artifacts/bench/parity-matrix"

# require_value OPTION [VALUE...]: exit 2 with a readable message when an
# option that takes a value is the last word on the command line (otherwise
# `set -u` would die with a bare "unbound variable" error on "$2").
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "missing value for $1" >&2
        exit 2
    fi
}

# wav_ok PATH: succeed iff PATH exists and is at least 1 KB, i.e. holds a
# synthesized signal rather than an empty or header-only file.
wav_ok() {
    local wav="$1" bytes
    [[ -s "$wav" ]] || return 1
    bytes=$(wc -c < "$wav")
    [[ $bytes -ge 1024 ]]
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        --bench)        DO_BENCH=1; shift ;;
        --text)         require_value "$@"; TEXT="$2"; shift 2 ;;
        --model)        require_value "$@"; MODEL="$2"; shift 2 ;;
        --precisions)   require_value "$@"; PRECISIONS="$2"; shift 2 ;;
        --runs)         require_value "$@"; RUNS="$2"; shift 2 ;;
        --warmup)       require_value "$@"; WARMUP="$2"; shift 2 ;;
        --threads)      require_value "$@"; THREADS="$2"; shift 2 ;;
        --artifact-dir) require_value "$@"; ARTIFACT_DIR="$2"; shift 2 ;;
        -h|--help)
            # Print the header comment (line 2 up to the `set -euo` line) as help.
            sed -n '2,/^set -euo/p' "$0" | sed 's/^# //; s/^#//; /^set -euo/d'
            exit 0 ;;
        *) echo "unknown arg: $1" >&2; exit 2 ;;
    esac
done

CLI="$ROOT/build/supertonic-cli"
BENCH="$ROOT/build/supertonic-bench"
PY="$ROOT/.venv/bin/python3"
if [[ ! -x "$CLI" ]]; then
    echo "build/supertonic-cli not found. Run 'cmake --build build --target supertonic-cli' first." >&2
    exit 1
fi
if [[ "$DO_BENCH" -eq 1 && ! -x "$BENCH" ]]; then
    echo "--bench requested but build/supertonic-bench not found." >&2
    exit 1
fi
if [[ ! -x "$PY" ]]; then
    echo "$PY not found. Activate a venv with numpy + wave installed." >&2
    exit 1
fi

mkdir -p "$ARTIFACT_DIR"
TMP="$(mktemp -d)"
trap 'rm -rf "$TMP"' EXIT

printf "\nSupertonic 2 multi-precision parity + bench harness\n"
printf " model: %s\n" "$MODEL"
printf " text: %.60s%s\n" "$TEXT" "$([[ ${#TEXT} -gt 60 ]] && echo '...')"
printf " precisions: %s\n" "$PRECISIONS"
printf " bench: %s\n\n" "$([[ "$DO_BENCH" -eq 1 ]] && echo 'yes' || echo 'no')"

OVERALL_RC=0
IFS=',' read -r -a PREC_ARR <<< "$PRECISIONS"
# The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
# bash < 4.4 (e.g. the bash 3.2 shipped with macOS).
for P in ${PREC_ARR[@]+"${PREC_ARR[@]}"}; do
    P_TRIM="$(echo "$P" | xargs)"
    # Ignore empty entries such as a trailing comma in --precisions.
    [[ -n "$P_TRIM" ]] || continue
    CPU_WAV="$TMP/cpu-$P_TRIM.wav"
    MTL_WAV="$TMP/mtl-$P_TRIM.wav"

    printf "=== %s ===\n" "$P_TRIM"

    # The CLI may legitimately exit non-zero (unsupported precision, or the
    # Metal teardown assert described below), so capture the status instead
    # of letting `set -e` abort.
    set +e
    CPU_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 0 \
        --precision "$P_TRIM" --out "$CPU_WAV" 2>&1)"
    CPU_RC=$?
    MTL_LOG="$("$CLI" --model "$MODEL" --text "$TEXT" --n-gpu-layers 1 \
        --precision "$P_TRIM" --out "$MTL_WAV" 2>&1)"
    MTL_RC=$?
    set -e

    # Here-string rather than `echo | grep -q`: with pipefail, grep -q exiting
    # on the first match can SIGPIPE the producer and turn a match into a
    # failed pipeline.
    if grep -qE "scaffolded but not yet|partially scaffolded" <<< "$CPU_LOG$MTL_LOG"; then
        printf " SKIP: precision %s not yet wired through graph builders (Phase A3/B1)\n\n" "$P_TRIM"
        continue
    fi

    # Tolerate the harmless post-write atexit `GGML_ASSERT([rsets->data count] == 0)`
    # that fires on Metal cleanup AFTER the WAV is fully written. Treat the run as
    # successful iff the WAV file exists and is at least 1 KB (covers a synthesized
    # signal, well above an empty/header-only file).
    cpu_ok=1; mtl_ok=1
    wav_ok "$CPU_WAV" || cpu_ok=0
    wav_ok "$MTL_WAV" || mtl_ok=0
    if [[ $cpu_ok -eq 0 || $mtl_ok -eq 0 ]]; then
        printf " FAIL: synthesis errored. cpu_rc=%d mtl_rc=%d wav_ok cpu=%d mtl=%d\n" \
            "$CPU_RC" "$MTL_RC" "$cpu_ok" "$mtl_ok"
        printf " --- cpu tail ---\n%s\n --- metal tail ---\n%s\n\n" \
            "$(echo "$CPU_LOG" | tail -3)" "$(echo "$MTL_LOG" | tail -3)"
        OVERALL_RC=1
        continue
    fi

    # Parity check. `|| PY_RC=$?` keeps a parity failure from tripping
    # `set -e`, so the remaining precisions and the final summary still run.
    # Inputs are passed as argv and the heredoc is quoted, so paths containing
    # quotes, backslashes or `$` cannot corrupt the Python source.
    PY_RC=0
    "$PY" - "$CPU_WAV" "$MTL_WAV" "$P_TRIM" <<'PY' || PY_RC=$?
import sys
import wave

import numpy as np

cpu_path, mtl_path, precision = sys.argv[1:4]


def load(p):
    # 16-bit PCM frames scaled to [-1, 1).
    with wave.open(p, 'rb') as w:
        return np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16).astype(np.float32) / 32768.0


a = load(cpu_path)
b = load(mtl_path)
n = min(len(a), len(b))
if n == 0:
    print(" FAIL parity (no audio samples to compare)")
    sys.exit(1)
a, b = a[:n], b[:n]
corr = float(np.corrcoef(a, b)[0, 1])
linf = float(np.max(np.abs(a - b)))
rms = float(np.sqrt(np.mean((a - b) ** 2)))
# Per-precision tolerance: numbers chosen against observed CPU↔Metal drift
# on the benchmark text "The quick brown fox jumps over the lazy dog.".
# Short text routinely gets L_inf ≈ 1.7e-3; long text accumulates more
# float-order drift across 5 CFM steps × more attention positions, landing
# around L_inf ≈ 3.7e-2 with corr ≥ 0.998 — audibly identical for f32.
# Q8_0 has additional drift from the dequant→transpose→requantize round-trip
# in the asymmetric load path (Metal keeps q8_0, CPU expands to f32, so the
# two paths use slightly differently-quantized weights). Audibly identical.
tol_corr = {"f32": 0.998, "f16": 0.99, "q8_0": 0.96}.get(precision, 0.99)
tol_linf = {"f32": 0.05, "f16": 0.10, "q8_0": 0.15}.get(precision, 0.10)
print(f" corr={corr:.6f} (tol >= {tol_corr}) L_inf={linf:.6f} (tol <= {tol_linf}) RMS={rms:.6f}")
ok = corr >= tol_corr and linf <= tol_linf
print(" PASS" if ok else " FAIL parity")
sys.exit(0 if ok else 1)
PY
    if [[ $PY_RC -ne 0 ]]; then OVERALL_RC=1; fi

    if [[ "$DO_BENCH" -eq 1 ]]; then
        JSON="$ARTIFACT_DIR/supertonic-mtl-${P_TRIM}.json"
        printf " bench --> %s\n" "$JSON"
        # Only the per-stage summary lines are echoed; `|| true` keeps a
        # no-match grep (or a bench failure) from aborting the harness.
        "$BENCH" --model "$MODEL" --text "$TEXT" \
            --voice M1 --language en --steps 5 --speed 1.05 --seed 42 \
            --runs "$RUNS" --warmup "$WARMUP" --threads "$THREADS" \
            --n-gpu-layers 1 --precision "$P_TRIM" \
            --json-out "$JSON" 2>&1 | grep -E '^\s*(vector_estimator|vocoder|text_encoder|total|RTF|Real-time)' || true
    fi
    printf "\n"
done

if [[ $OVERALL_RC -eq 0 ]]; then
    printf "All wired-up precisions pass parity.\n"
else
    printf "One or more precisions failed parity (or errored).\n" >&2
fi
exit $OVERALL_RC
Loading
Loading