diff --git a/.github/workflows/cpp-test-coverage-tts-ggml.yml b/.github/workflows/cpp-test-coverage-tts-ggml.yml index 56ecfbf448..f1d4c3dbae 100644 --- a/.github/workflows/cpp-test-coverage-tts-ggml.yml +++ b/.github/workflows/cpp-test-coverage-tts-ggml.yml @@ -1,21 +1,10 @@ name: CPP Tests (TTS GGML) -# Stub workflow. packages/tts-ggml's CMakeLists.txt currently has no C++ -# test executables wired up (BUILD_TESTING block is documented as a -# placeholder for the future qvac::ttsggml::* unit suite). Until those -# tests land, this workflow exposes a workflow_call entry point so -# on-pr-tts-ggml.yml can reference it without 404'ing, and prints a clear -# "no tests yet" notice instead of silently passing. -# -# When the C++ suite is added, replace the body of `cpp-tests` with the -# real ccache + vcpkg + npm run coverage:cpp:build/run/report sequence -# (mirror cpp-test-coverage-transcription-parakeet.yml). - on: workflow_call: inputs: ref: - description: "ref" + description: 'ref' type: string repository: type: string @@ -24,17 +13,250 @@ on: permissions: contents: read +env: + BUILD_TYPE: Debug + ENABLE_COVERAGE: ON + PKG_DIR: packages/tts-ggml + jobs: cpp-tests: - name: tts-ggml C++ tests (stub) - runs-on: ubuntu-latest permissions: contents: read + id-token: write + checks: write + runs-on: ${{ matrix.os }} + environment: release + name: ${{ matrix.platform }}-${{ matrix.arch }}-cpp-tests + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-22.04 + platform: linux + arch: x64 + coverage: true + env: + VCPKG_BINARY_SOURCES: "clear;files,${{ github.workspace }}/vcpkg/cache,readwrite" + + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || github.ref }} + token: ${{ secrets.PAT_TOKEN }} + lfs: true + + - name: Setup Node.js + uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 + with: 
+ node-version: lts/* + + - name: Setup LLVM + if: matrix.platform == 'linux' + uses: tetherto/qvac/.github/actions/setup-llvm@0c819dd1110e4902223b1f7646cc0f1be2c9bc5c + + - name: Setup ccache + if: matrix.platform == 'linux' + run: | + sudo apt-get install -y ccache + ccache -z + echo "CMAKE_C_COMPILER_LAUNCHER=ccache" >> $GITHUB_ENV + echo "CMAKE_CXX_COMPILER_LAUNCHER=ccache" >> $GITHUB_ENV + + - name: Get ccache cache + if: matrix.platform == 'linux' + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # 5.0.4 + with: + key: ccache-cpp-coverage-tts-ggml-${{ matrix.platform }}-${{ matrix.arch }}-${{ hashFiles(format('{0}/vcpkg.json', env.PKG_DIR)) }} + path: ~/.cache/ccache + restore-keys: ccache-cpp-coverage-tts-ggml-${{ matrix.platform }}-${{ matrix.arch }}- + + - name: Install NPM dependencies + working-directory: ${{ env.PKG_DIR }} + run: | + npm install + npm install -g bare bare-make + + - if: ${{ matrix.os == 'ubuntu-22.04' }} + name: Configure vcpkg in linux + run: echo "VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT" >> $GITHUB_ENV + + - name: Setup Vulkan SDK + uses: tetherto/qvac/.github/actions/setup-vulkan-sdk@0bbdca93da303a0b1634ba14a89cec085621078d + env: + MODEL_S3_BUCKET: ${{ secrets.MODEL_S3_BUCKET }} + with: + platform: ${{ matrix.platform }} + arch: ${{ matrix.arch }} + aws-role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }} + aws-region: eu-central-1 + + - name: Generate and build C++ unit tests (with coverage) + working-directory: ${{ env.PKG_DIR }} + run: | + npm run coverage:cpp:build + + - name: Setup Python (HF -> .gguf conversion for QVAC_TEST_* gates) + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + # Mirrors parakeet's cpp-test-coverage flow. The C++ suite has two + # tiers of tests: + # - Pure validation / static-helper tests (no GGUF needed). + # - QVAC_TEST_CHATTERBOX_T3_GGUF + QVAC_TEST_CHATTERBOX_S3GEN_GGUF + # and QVAC_TEST_SUPERTONIC_GGUF gated round-trip tests. 
+ # We provision the multilingual Chatterbox (q4_0) + English + # Supertonic (q4_0 -> f16 per supertonic_ftype) GGUFs because they + # exercise the most code paths per byte downloaded. Cache hit + # skips the full HF download + conversion (~15 min cold, ~0s warm). + - name: Cache TTS GGML venv + q4 GGUFs + id: cache-tts-ggml-models + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # 5.0.4 + with: + path: | + ${{ env.PKG_DIR }}/venv + ${{ env.PKG_DIR }}/models/chatterbox-t3-mtl.gguf + ${{ env.PKG_DIR }}/models/chatterbox-s3gen-mtl.gguf + ${{ env.PKG_DIR }}/models/supertonic.gguf + key: tts-ggml-cpp-mtl-supertonic-en-q4-v1-${{ hashFiles(format('{0}/scripts/convert-models.sh', env.PKG_DIR), format('{0}/scripts/setup-venv.sh', env.PKG_DIR), format('{0}/scripts/requirements.txt', env.PKG_DIR), format('{0}/scripts/convert-t3-mtl-to-gguf.py', env.PKG_DIR), format('{0}/scripts/convert-s3gen-to-gguf.py', env.PKG_DIR), format('{0}/scripts/convert-supertonic2-to-gguf.py', env.PKG_DIR)) }} + restore-keys: | + tts-ggml-cpp-mtl-supertonic-en-q4-v1- + + - name: Convert HF -> q4_0 GGUFs (cache miss) + if: steps.cache-tts-ggml-models.outputs.cache-hit != 'true' + working-directory: ${{ env.PKG_DIR }} + shell: bash + run: | + set -euo pipefail + echo "Cache miss -- provisioning Python venv and converting models..." + bash scripts/setup-venv.sh + bash scripts/convert-models.sh -t multilingual -q q4_0 + bash scripts/convert-models.sh -t supertonic-en -q q4_0 + ls -lh models/ + + - name: Verify GGUFs for C++ tests + working-directory: ${{ env.PKG_DIR }} + shell: bash + run: | + set -euo pipefail + for f in chatterbox-t3-mtl.gguf chatterbox-s3gen-mtl.gguf supertonic.gguf; do + if [ ! 
-s "models/$f" ]; then + echo "WARN: missing or empty models/$f -- gated tests will skip" >&2 + ls -la models/ || true + else + echo "OK: models/$f" + fi + done + + - name: Run C++ Unit Tests + id: cpp-tests + working-directory: ${{ env.PKG_DIR }} + env: + QVAC_TEST_CHATTERBOX_T3_GGUF: ${{ github.workspace }}/${{ env.PKG_DIR }}/models/chatterbox-t3-mtl.gguf + QVAC_TEST_CHATTERBOX_S3GEN_GGUF: ${{ github.workspace }}/${{ env.PKG_DIR }}/models/chatterbox-s3gen-mtl.gguf + QVAC_TEST_SUPERTONIC_GGUF: ${{ github.workspace }}/${{ env.PKG_DIR }}/models/supertonic.gguf + run: | + echo "=== Running TTS GGML C++ tests ===" + pwd + ls -la models/ || true + npm run test:cpp:run + continue-on-error: true + + - name: Debug - List test output files + if: always() + working-directory: ${{ env.PKG_DIR }} + run: | + echo "Contents of build/:" + ls -la build/ || echo "Directory not found" + echo "Profiling files:" + ls -la build/*.profraw || echo "No profraw files" + + - name: Generate Coverage Report + if: matrix.coverage + working-directory: ${{ env.PKG_DIR }} + run: | + npm run coverage:cpp:report + COVERAGE=$(grep 'TOTAL' build/coverage-summary.txt | awk '{print $10}') + echo "LINE_COVERAGE_PERCENTAGE=${COVERAGE}" >> $GITHUB_ENV + echo "Line Coverage: ${COVERAGE}" > coverage-badge.txt + + - name: Show ccache statistics + if: matrix.platform == 'linux' + run: ccache -s + + - name: Archive Coverage Results + if: matrix.coverage + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 + with: + name: coverage-report-tts-ggml-${{ matrix.platform }}-${{ matrix.arch }} + path: | + ${{ env.PKG_DIR }}/build/coverage-html/ + ${{ env.PKG_DIR }}/build/lcov.info + ${{ env.PKG_DIR }}/build/coverage-summary.txt + ${{ env.PKG_DIR }}/build/cpp-test-results.xml + retention-days: 30 + + - name: Check if test results exist + id: check-test-results + working-directory: ${{ env.PKG_DIR }} + run: | + if [ -f build/cpp-test-results.xml ]; then + echo "exists=true" >> 
$GITHUB_OUTPUT + else + echo "exists=false" >> $GITHUB_OUTPUT + fi + continue-on-error: true + + - name: Publish Test Results + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # 2.23.0 + if: always() && steps.check-test-results.outputs.exists == 'true' + with: + files: ${{ env.PKG_DIR }}/build/cpp-test-results.xml + comment_mode: 'off' + check_name: "C++ Test Results (${{ matrix.platform }}-${{ matrix.arch }})" + + - name: Coverage Summary + if: matrix.coverage + working-directory: ${{ env.PKG_DIR }} + run: | + echo "## C++ Coverage Summary (TTS GGML)" >> $GITHUB_STEP_SUMMARY + echo "**Platform:** ${{ matrix.platform }}-${{ matrix.arch }}" >> $GITHUB_STEP_SUMMARY + echo "**Coverage:** ${{ env.LINE_COVERAGE_PERCENTAGE }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ -f build/coverage-summary.txt ]; then + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + cat build/coverage-summary.txt >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + fi + + # "Run C++ Unit Tests" sets continue-on-error so the coverage and + # test-report steps above still run after a test failure. Re-raise + # that failure here; a failing test would otherwise leave the job green. + - name: Fail job if C++ tests failed + if: always() && steps.cpp-tests.outcome == 'failure' + run: | + echo "::error::C++ unit tests failed -- see the 'Run C++ Unit Tests' step" + exit 1 + + coverage-report: + permissions: + contents: read + actions: read + needs: cpp-tests + runs-on: ubuntu-latest + if: always() steps: - - name: Explain + - name: Download all coverage reports + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + path: coverage-reports + + - name: Combine Coverage Reports run: | - echo "packages/tts-ggml has no C++ unit tests today." 
- echo "CMakeLists.txt's BUILD_TESTING block is reserved for a future" - echo "qvac::ttsggml::* suite; this workflow will be expanded into" - echo "the real ccache + vcpkg + coverage pipeline at that point" - echo "(mirror cpp-test-coverage-transcription-parakeet.yml)." 
+ echo "## TTS GGML C++ Code Coverage Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Test Suite:** ChatterboxModel + SupertonicModel + AddonCpp" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + for report in coverage-reports/*/coverage-summary.txt; do + if [ -f "$report" ]; then + platform=$(basename "$(dirname "$report")") # quote inner substitution: no word splitting on the artifact dir name + echo "### $platform" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + cat "$report" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + fi + done diff --git a/.github/workflows/integration-mobile-test-tts-ggml.yml b/.github/workflows/integration-mobile-test-tts-ggml.yml index 9acc67b320..18e11bd374 100644 --- a/.github/workflows/integration-mobile-test-tts-ggml.yml +++ b/.github/workflows/integration-mobile-test-tts-ggml.yml @@ -64,6 +64,8 @@ env: PREBUILD_ARTIFACT_PREFIX: "tts-ggml-" # Prefix for prebuild artifacts (must match prebuilds workflow upload names) TEST_FRAMEWORK_REF: "main" APP_BUNDLE_ID: "io.tether.test.qvac" + AWS_DEVICE_FARM_PROJECT_ARN: ${{ secrets.AWS_DEVICE_FARM_PROJECT_ARN_TTS_GGML || secrets.AWS_DEVICE_FARM_PROJECT_ARN_ONNX_TTS }} + IOS_DEVICE_POOL_ARN: ${{ secrets.IOS_DEVICE_POOL_ARN_TTS_GGML || secrets.IOS_DEVICE_POOL_ARN_ONNX_TTS }} WORKDIR: ${{ inputs.workdir || github.event.inputs.workdir || 'packages/tts-ggml' }} ADDON_DIR: addon/${{ inputs.workdir || github.event.inputs.workdir || 'packages/tts-ggml' }} TEST_FRAMEWORK_DIR: test-framework @@ -77,6 +79,13 @@ permissions: contents: read jobs: + # NOTE: continue-on-error: true mirrors integration-test-tts-ggml.yml + # and parakeet's matching workflow. 
on-pr-tts-ggml.yml's merge-guard + # treats `success || skipped` as a pass, so a hard Device Farm failure + # does NOT block a merge today — only sanity-checks + cpp-lint + + # cpp-tests-coverage + prebuild really gate. 
Tighten this once the + # mobile pipeline (Device Farm provisioning, signing, runner pool + # availability) is stable enough to make a hard gate cheap. build-and-test: name: Build ${{ matrix.platform }} and Run E2E Tests runs-on: ${{ matrix.runner }} @@ -358,62 +367,65 @@ jobs: # The mobile test framework bundles whatever sits in # `test/mobile/testAssets/` into the test app at `npm run build` # time (see qvac-test-addon-mobile/scripts/build-test-app.js's - # copyTestAssets / generateAssetManifest). On mobile we ship - # one GGUF per engine (not both English + multilingual variants - # for each) to keep the bundle reasonable: - # - chatterbox-t3-turbo.gguf + chatterbox-s3gen.gguf (English Turbo, q4_0) - # - supertonic.gguf (English Supertonic; --quant q4_0 maps to f16 - # for the Supertonic converter -- see - # supertonic_ftype() in scripts/convert-models.sh) - # The MTL Chatterbox / Supertonic-2 GGUFs are NOT bundled; the - # `ensure*Models` helpers in test/utils/downloadModel.js return - # success=false when their files aren't on disk so the matching - # MTL tests skip cleanly via t.pass('Skipped: ...') instead of - # failing. Bundle the MTL variants from this workflow when mobile - # MTL coverage becomes a release gate. + # copyTestAssets / generateAssetManifest). Metro's bundler reads + # each asset via `Buffer.toString()`, so any single asset has to + # fit in V8's max string length (0x1fffffe8 ~= 512 MiB). # - # The Python venv + converted GGUFs are cached together; cache-hit - # skips the whole HF download + conversion (~15 min cold, ~0s warm). - - name: Cache TTS GGML venv + q4 GGUFs + # On mobile we therefore bundle ONLY supertonic.gguf (~125 MiB at + # q4_0 -> f16 via supertonic_ftype()). 
The Chatterbox bundle does + # not fit: chatterbox-s3gen.gguf is ~1 GiB even at --quant q4_0 + # because the s3gen converter doesn't quantize most weight tensors + # ("0/291 weight tensors quantized" in the converter log), and + # Metro fails the bundle step with + # SyntaxError: assets/testAssets/chatterbox-s3gen.gguf: + # Cannot create a string longer than 0x1fffffe8 characters + # + # Mobile Chatterbox tests degrade cleanly (ensureChatterboxModels + # returns { success: false } and the test passes via + # t.pass('Skipped: Chatterbox GGUFs not available')). To enable + # them, either add `gguf` to the test framework's metro + # `assetExts` so it skips the JS-string read (then chatterbox can + # be bundled), or push the GGUFs to the device via `adb push` + # outside the bundle and surface the path through the existing + # ANDROID_CANDIDATE_DIRS fallback in downloadModel.js. + # + # The Python venv + converted GGUF are cached together; cache-hit + # skips the HF download + conversion entirely. 
+ - name: Cache TTS GGML venv + Supertonic GGUF id: cache-tts-ggml-models uses: actions/cache@v4 with: path: | ${{ env.ADDON_DIR }}/venv - ${{ env.ADDON_DIR }}/models/chatterbox-t3-turbo.gguf - ${{ env.ADDON_DIR }}/models/chatterbox-s3gen.gguf ${{ env.ADDON_DIR }}/models/supertonic.gguf - key: tts-ggml-mobile-turbo-supertonic-en-q4-v1-${{ hashFiles(format('{0}/scripts/convert-models.sh', env.ADDON_DIR), format('{0}/scripts/setup-venv.sh', env.ADDON_DIR), format('{0}/scripts/requirements.txt', env.ADDON_DIR), format('{0}/scripts/convert-t3-turbo-to-gguf.py', env.ADDON_DIR), format('{0}/scripts/convert-s3gen-to-gguf.py', env.ADDON_DIR), format('{0}/scripts/convert-supertonic2-to-gguf.py', env.ADDON_DIR)) }} + key: tts-ggml-mobile-supertonic-en-q4-v2-${{ hashFiles(format('{0}/scripts/convert-models.sh', env.ADDON_DIR), format('{0}/scripts/setup-venv.sh', env.ADDON_DIR), format('{0}/scripts/requirements.txt', env.ADDON_DIR), format('{0}/scripts/convert-supertonic2-to-gguf.py', env.ADDON_DIR)) }} restore-keys: | - tts-ggml-mobile-turbo-supertonic-en-q4-v1- + tts-ggml-mobile-supertonic-en-q4-v2- - - name: Convert HF -> q4_0 GGUFs (cache miss) + - name: Convert HF -> Supertonic GGUF (cache miss) if: steps.cache-tts-ggml-models.outputs.cache-hit != 'true' working-directory: ${{ env.ADDON_DIR }} shell: bash run: | set -euo pipefail - echo "Cache miss -- provisioning Python venv and converting models..." + echo "Cache miss -- provisioning Python venv and converting Supertonic..." 
bash scripts/setup-venv.sh echo "" - echo "=== Converting Chatterbox Turbo (English, q4_0) ===" - bash scripts/convert-models.sh -t turbo -q q4_0 - echo "" echo "=== Converting Supertonic English (q4_0 -> f16) ===" bash scripts/convert-models.sh -t supertonic-en -q q4_0 echo "" - echo "Converted GGUFs:" + echo "Converted GGUF:" ls -lh models/ - - name: Stage q4 GGUFs into test/mobile/testAssets/ + - name: Stage Supertonic GGUF into test/mobile/testAssets/ working-directory: ${{ env.ADDON_DIR }} shell: bash run: | set -euo pipefail mkdir -p test/mobile/testAssets - for f in chatterbox-t3-turbo.gguf \ - chatterbox-s3gen.gguf \ - supertonic.gguf; do + # Only Supertonic is bundled today (see comment above the + # cache step for why Chatterbox can't fit through Metro yet). + for f in supertonic.gguf; do if [ ! -s "models/$f" ]; then echo "ERROR: missing or empty models/$f -- cache may be corrupt" >&2 ls -la models/ || true @@ -888,7 +900,7 @@ jobs: echo "Uploading app to AWS Device Farm..." UPLOAD_RESPONSE=$(aws devicefarm create-upload \ - --project-arn "${{ secrets.AWS_DEVICE_FARM_PROJECT_ARN_TTS_GGML }}" \ + --project-arn "${{ env.AWS_DEVICE_FARM_PROJECT_ARN }}" \ --name "$APP_NAME" \ --type "$APP_TYPE" \ --output json) @@ -989,7 +1001,7 @@ jobs: echo "Uploading test package to AWS Device Farm..." UPLOAD_RESPONSE=$(aws devicefarm create-upload \ - --project-arn "${{ secrets.AWS_DEVICE_FARM_PROJECT_ARN_TTS_GGML }}" \ + --project-arn "${{ env.AWS_DEVICE_FARM_PROJECT_ARN }}" \ --name "$ZIP_NAME" \ --type "APPIUM_NODE_TEST_PACKAGE" \ --output json) @@ -1173,7 +1185,7 @@ jobs: echo "Uploading test spec to Device Farm..." 
SPEC_RESPONSE=$(aws devicefarm create-upload \ - --project-arn "${{ secrets.AWS_DEVICE_FARM_PROJECT_ARN_TTS_GGML }}" \ + --project-arn "${{ env.AWS_DEVICE_FARM_PROJECT_ARN }}" \ --name "testspec.yml" \ --type "APPIUM_NODE_TEST_SPEC" \ --output json) @@ -1215,7 +1227,7 @@ jobs: RUN_NAME="PR-${{ github.event.pull_request.number || github.run_number }}-${{ matrix.platform }}" fi - PROJECT_ARN="${{ secrets.AWS_DEVICE_FARM_PROJECT_ARN_TTS_GGML }}" + PROJECT_ARN="${{ env.AWS_DEVICE_FARM_PROJECT_ARN }}" APP_ARN="${{ steps.upload_app.outputs.app_upload_arn }}" TEST_PACKAGE_ARN="${{ steps.upload_test_package.outputs.test_package_upload_arn }}" TEST_SPEC_ARN="${{ steps.upload_test_spec.outputs.test_spec_arn }}" @@ -1282,7 +1294,7 @@ jobs: --filters '[{"attribute":"MANUFACTURER","operator":"EQUALS","values":["Apple"]},{"attribute":"MODEL","operator":"CONTAINS","values":["iPhone 17"]},{"attribute":"PLATFORM","operator":"EQUALS","values":["IOS"]}]' \ --query 'devices[].{name:name,model:model,os:os,availability:availability}' --output table || true - POOL_ARN="${{ secrets.IOS_DEVICE_POOL_ARN_TTS_GGML }}" + POOL_ARN="${{ env.IOS_DEVICE_POOL_ARN }}" RUN_ARN_1=$(schedule_run_with_pool "$POOL_ARN" "$RUN_NAME" "$TEST_SPEC_ARN") echo "✅ iOS pool run scheduled: $RUN_ARN_1" diff --git a/.github/workflows/integration-test-tts-ggml.yml b/.github/workflows/integration-test-tts-ggml.yml index 742d5ae658..a662dc970a 100644 --- a/.github/workflows/integration-test-tts-ggml.yml +++ b/.github/workflows/integration-test-tts-ggml.yml @@ -40,6 +40,14 @@ permissions: packages: read jobs: + # NOTE: continue-on-error: true mirrors the early-days landing posture + # we use for parakeet's matching workflow. The on-pr-tts-ggml.yml + # merge-guard treats this job's `success || skipped` as a pass, so a + # hard failure inside the integration suite does NOT block a merge — + # only sanity-checks + cpp-lint + cpp-tests-coverage + prebuild really + # gate today. 
Tighten this gate (drop `continue-on-error` and/or + # change merge-guard to require `success`) once the addon's flake + # surface is small enough to make a hard gate cheap. run-integration-tests: timeout-minutes: 60 continue-on-error: true @@ -146,7 +154,22 @@ jobs: shell: bash run: | sudo apt-get update - sudo apt-get install -y libgomp1 mesa-vulkan-drivers + # libgomp1 - GNU OpenMP runtime, used when ggml-cpu's libgomp + # path is selected. + # libomp5 - LLVM OpenMP runtime (libomp.so.5). tts-cpp is + # built with clang under our linux-clang toolchain + # and links against libomp at -fopenmp time, so + # load fails with `libomp.so.5: cannot open shared + # object file` without this package. + # libblas3 - runtime BLAS soname (libblas.so.3) required by + # the Linux prebuilds on x64. + # mesa-vulkan-drivers - software / Mesa Vulkan ICD so the + # Vulkan-enabled prebuild's runtime backend probe + # has at least one Vulkan device to enumerate + # (works for the no_gpu matrix entries too, + # because the test runners explicitly force + # useGPU=false when NO_GPU=true). + sudo apt-get install -y libgomp1 libomp5 libblas3 mesa-vulkan-drivers - name: Windows - install Vulkan runtime if: matrix.platform == 'win32' @@ -155,22 +178,45 @@ jobs: platform: ${{ matrix.platform }} arch: ${{ matrix.arch }} + # Pin to 3.11 (not 3.12) because numba 0.65+ stopped shipping + # darwin-x86_64 wheels for Python 3.12. librosa pulls in numba + # transitively; on 3.12 + macos-15-large pip falls back to building + # llvmlite from source which fails because the runner doesn't have + # an LLVM 15 development install. 3.11 has prebuilt darwin-x64 + # wheels for the entire numba/llvmlite/librosa stack and is fine + # for every other converter dependency we use. - name: Setup Python (HF -> .gguf conversion) uses: actions/setup-python@v5 with: - python-version: '3.12' - - # Stage every Chatterbox + Supertonic GGUF the integration suite can - # exercise. 
npm run setup-models internally runs setup-venv.sh + - # convert-models.sh -t all which: - # - chatterbox-t3-turbo.gguf + chatterbox-s3gen.gguf (English) - # - chatterbox-t3-mtl.gguf + chatterbox-s3gen-mtl.gguf (multilingual) - # - supertonic.gguf (English) - # - supertonic2.gguf (multilingual) - # Each `ensure*Models` helper in test/utils/downloadModel.js gracefully - # skips its test when the matching GGUF isn't on disk, so partial - # cache failures degrade rather than abort the whole matrix. + python-version: '3.11' + + # Cache the Python venv + every Chatterbox + Supertonic GGUF the + # integration suite can exercise. Mirrors the mobile workflow's + # caching pattern (integration-mobile-test-tts-ggml.yml). Without + # this every desktop matrix entry re-pays the multi-GB HF download + # + Torch install + multi-arch GGUF convert (~10-30 min cold) on + # six runners per PR. Each `ensure*Models` helper in + # test/utils/downloadModel.js gracefully skips its test when the + # matching GGUF isn't on disk, so partial cache misses degrade + # rather than abort the whole matrix. 
+ - name: Cache TTS GGML venv + GGUFs + id: cache-tts-ggml-models + uses: actions/cache@v4 + with: + path: | + ${{ github.workspace }}/packages/tts-ggml/venv + ${{ github.workspace }}/packages/tts-ggml/models/chatterbox-t3-turbo.gguf + ${{ github.workspace }}/packages/tts-ggml/models/chatterbox-s3gen.gguf + ${{ github.workspace }}/packages/tts-ggml/models/chatterbox-t3-mtl.gguf + ${{ github.workspace }}/packages/tts-ggml/models/chatterbox-s3gen-mtl.gguf + ${{ github.workspace }}/packages/tts-ggml/models/supertonic.gguf + ${{ github.workspace }}/packages/tts-ggml/models/supertonic2.gguf + key: tts-ggml-desktop-all-gguf-v1-${{ matrix.platform }}-${{ matrix.arch }}-${{ hashFiles('packages/tts-ggml/scripts/convert-models.sh', 'packages/tts-ggml/scripts/setup-venv.sh', 'packages/tts-ggml/scripts/requirements.txt', 'packages/tts-ggml/scripts/convert-t3-turbo-to-gguf.py', 'packages/tts-ggml/scripts/convert-t3-mtl-to-gguf.py', 'packages/tts-ggml/scripts/convert-s3gen-to-gguf.py', 'packages/tts-ggml/scripts/convert-supertonic2-to-gguf.py') }} + restore-keys: | + tts-ggml-desktop-all-gguf-v1-${{ matrix.platform }}-${{ matrix.arch }}- + - name: Stage GGUF models (npm run setup-models) + if: steps.cache-tts-ggml-models.outputs.cache-hit != 'true' shell: bash working-directory: ${{ github.workspace }}/packages/tts-ggml run: npm run setup-models diff --git a/packages/tts-ggml/.gitignore b/packages/tts-ggml/.gitignore new file mode 100644 index 0000000000..7adbbd6e9e --- /dev/null +++ b/packages/tts-ggml/.gitignore @@ -0,0 +1,122 @@ +# Dependencies +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Build outputs +build/ +build_*/ +.clang-format +.clang-tidy +.valgrind.supp +dist/ +out/ +*.o +*.so +*.dylib +*.dll +*.exe + +# Prebuilds (platform-specific binaries) +prebuilds/ + +# CMake build files +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +Makefile + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ +.cache/ + +# OS generated files +.DS_Store 
+.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +logs/ +*.log + +# Runtime data +pids/ +*.pid +*.seed +*.pid.lock + +# Coverage directory used by tools like istanbul +coverage/ + +# nyc test coverage +.nyc_output + +# Dependency directories +jspm_packages/ + +# Optional npm cache directory +.npm + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env + +# Temporary folders +tmp/ +temp/ +.tmp/ + + +# Model files +models/* +model/ + +# VCPKG installed files +vcpkg_installed/* + +# Integration test outputs +*.wav +!benchmarks/assets/ref.wav +!test/reference-audio/*.wav +!test/mobile/testAssets/*.wav + +# Package lock file +package-lock.json + +# npmrc file +.npmrc + +# Benchmark data +benchmarks/shared-data/ +benchmarks/results/*.wav +benchmarks/results/*.raw + +benchmarks/client/poetry.lock +benchmarks/client/venv +benchmarks/client/__pycache__ +benchmarks/client/src/__pycache__ +benchmarks/client/src/tts/__pycache__ +benchmarks/python-server/venv +benchmarks/python-server/__pycache__ +benchmarks/python-server/src/__pycache__ +benchmarks/server/node_modules +benchmarks/server/package-lock.json + +__pycache__/ diff --git a/packages/tts-ggml/CHANGELOG.md b/packages/tts-ggml/CHANGELOG.md new file mode 100644 index 0000000000..43ca641d3f --- /dev/null +++ b/packages/tts-ggml/CHANGELOG.md @@ -0,0 +1,87 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.0] + +Initial release of `@qvac/tts-ggml`, a GGML-backed TTS addon wrapping the +`tts-cpp` library. Exposes both `tts_cpp::chatterbox::Engine` and +`tts_cpp::supertonic::Engine` behind a single engine-agnostic JS surface, +intended as a substitute for `@qvac/tts-onnx`. 
+ +### Added + +- **Chatterbox engine** (English + multilingual via `chatterbox-t3-mtl.gguf` / + `chatterbox-s3gen-mtl.gguf`). 24 kHz native output. Supports voice cloning + from a reference wav and baked voice-conditioning tensors via `voiceDir` / + `voicesDir`. +- **Supertonic engine** (single-file `supertonic.gguf`). 44.1 kHz native + output. Voice selection via `voice` / `voiceName` (e.g. `'F1'`, `'M1'`). +- **Engine auto-detection** from `files` (chatterbox-\* gguf vs supertonic.gguf), + with explicit override through the `engine: 'chatterbox' | 'supertonic'` + option. Static constants `TTSGgml.ENGINE_CHATTERBOX` / `ENGINE_SUPERTONIC` + and `getEngineType()` method. +- **GPU backend cascade** at load time. Chatterbox routes through Metal / + CUDA / Vulkan / OpenCL when available; pass `nGpuLayers: 99` to fully + offload. `useGPU` defaults to `true` for Chatterbox. `RuntimeStats` + reports the active backend via `backendDevice` (0 = CPU, 1 = GPU) and + `backendId` (0 = CPU, 1 = Metal, 2 = CUDA, 3 = Vulkan, 4 = OpenCL, + 99 = other-GPU). +- **Streaming APIs** aligned with `@qvac/tts-onnx`: + - `run({ streamOutput: true, ... })` — sentence-chunked synthesis with + `onUpdate` PCM emission. + - `runStream(text, options?)` — convenience wrapper over `run`. + - `runStreaming(textStream, options?)` — `string | string[] | Iterable<string> | + AsyncIterable<string>` text input, PCM out per flushed job. +- **Chatterbox-only native streaming knobs:** `streamChunkTokens` (speech + tokens per native chunk; 25 ≈ 1 s of audio, `0` disables), + `streamFirstChunkTokens` (smaller first chunk for low TTFB), `cfmSteps` + (CFM Euler step count; `1` halves cost, `2` matches Python meanflow). +- **Supertonic-only knobs:** `steps` (vector-estimator CFM steps; `0` = + GGUF default), `speed` (speech-rate factor), `noiseNpyPath` (optional + `.npy` initial-noise tensor for byte-exact reference reproduction). 
+- **Cross-compat aliases with `@qvac/tts-onnx`:** `voiceName` (alias of + `voice`) and `numInferenceSteps` (alias of `steps`) accepted on options + so call sites migrating from tts-onnx need fewer changes. +- **Output sample-rate control:** `runtimeConfig.outputSampleRate` and + per-job `TTSRunInput.outputSampleRate` (8000–192000 Hz) resample the + engine's native rate before emission. `TTSOutputChunk.sampleRate` is + reported on every chunk. +- **Pre-chunked streaming metadata:** `SentenceStreamChunkMeta.isLast` + flag on the final chunk of `runStream` / `run({ streamOutput: true })`. +- **Tuning knobs:** `seed` (RNG for CFM initial noise + SineGen + excitation / Supertonic latent), `threads` (overrides + `std::thread::hardware_concurrency()`), `nGpuLayers`. +- **File-path inputs:** `TTSGgmlFiles` accepts `modelDir` plus per-component + GGUF paths (`t3Model`, `s3genModel`, `supertonicModel`) with `*Path` + long-form and short aliases (`t3`, `s3gen`, `supertonic`). +- **C++ unit tests** (GoogleTest) and `coverage:cpp` target (llvm-cov). +- **Mobile integration test** generator (`test:mobile:generate` / + `test:mobile:validate`). + +### Differences vs `@qvac/tts-onnx` + +Call sites migrating from `@qvac/tts-onnx` should be aware of the +following — these are not bugs, just intentional surface differences: + +- **No LavaSR enhancer.** `EnhancerConfig` / `LavaSREnhancerConfig`, the + constructor `enhancer` option, and the per-job `TTSRunInput.enhancer` + override do not exist in `@qvac/tts-ggml`. There is no neural + bandwidth-extension or denoiser path in the GGML backend today. +- **`referenceAudio` is a path string**, not `Float32Array | number[]`. + Pass the absolute wav path; the native layer reads it. +- **`numThreads` → `threads`.** The ONNX-style `numThreads` is not + accepted; use `threads` instead. 
+- **`supertonicMultilingual` is removed.** Multilingual mode is driven by + the loaded GGUF (`chatterbox-*-mtl.gguf`) and engine selection rather + than a runtime boolean. +- **GPU semantics differ for Supertonic.** `useGPU: true` and any non-zero + `nGpuLayers` are **rejected at construction time** on Supertonic — the + engine is CPU-only today. (Chatterbox accepts both and defaults + `useGPU` to `true`.) +- **ONNX-style `*Path` file aliases are not accepted.** The GGML backend + is single-GGUF-per-component, so the file set is much smaller; only the + ggml-native field names listed under `TTSGgmlFiles` are honored. diff --git a/packages/tts-ggml/CMakeLists.txt b/packages/tts-ggml/CMakeLists.txt new file mode 100644 index 0000000000..1e31c5bd3c --- /dev/null +++ b/packages/tts-ggml/CMakeLists.txt @@ -0,0 +1,145 @@ +cmake_minimum_required(VERSION 3.25) + +option(BUILD_TESTING "Build tests" OFF) +option(ENABLE_COVERAGE "Enable coverage" OFF) +option(ENABLE_VULKAN "Enable Vulkan GPU acceleration" OFF) + +if(BUILD_TESTING) + list(APPEND VCPKG_MANIFEST_FEATURES "tests") +endif() + +if(ENABLE_VULKAN) + list(APPEND VCPKG_MANIFEST_FEATURES "vulkan") +endif() + +find_package(cmake-bare REQUIRED PATHS node_modules/cmake-bare) +find_package(cmake-vcpkg REQUIRED PATHS node_modules/cmake-vcpkg) + +set(VCPKG_OVERLAY_TRIPLETS "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg/triplets;${VCPKG_OVERLAY_TRIPLETS}") + +project(tts-ggml CXX C) + +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + add_compile_options(-stdlib=libc++) + add_link_options(-stdlib=libc++ -static-libstdc++) +endif() + +find_path(VCPKG_INSTALLED_PATH share/lint-cpp/.clang-format REQUIRED) +configure_file(${VCPKG_INSTALLED_PATH}/share/lint-cpp/.clang-format + ${CMAKE_CURRENT_SOURCE_DIR}/.clang-format COPYONLY) +configure_file(${VCPKG_INSTALLED_PATH}/share/lint-cpp/.clang-tidy + ${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy COPYONLY) +configure_file(${VCPKG_INSTALLED_PATH}/share/lint-cpp/.valgrind.supp + 
${CMAKE_CURRENT_SOURCE_DIR}/.valgrind.supp COPYONLY) +configure_file(${VCPKG_INSTALLED_PATH}/tools/lint-cpp/hooks/pre-commit + ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks/pre-commit COPYONLY) + +find_path(QVAC_LIB_INFERENCE_ADDON_CPP_INCLUDE_DIRS "inference-addon-cpp/ModelInterfaces.hpp") +find_package(tts-cpp CONFIG REQUIRED) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + add_definitions(-D_DEBUG) +endif() + +if(WIN32) + add_definitions(-DNOMINMAX -DWIN32_LEAN_AND_MEAN -DNOGDI) +endif() + +add_bare_module(tts-ggml EXPORTS) + +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_options(${tts-ggml}_module PRIVATE -Wl,--exclude-libs,ALL) +endif() + +target_sources( + ${tts-ggml} + PRIVATE + ${PROJECT_SOURCE_DIR}/addon/src/js-interface/binding.cpp + ${PROJECT_SOURCE_DIR}/addon/src/js-interface/JSAdapter.cpp + ${PROJECT_SOURCE_DIR}/addon/src/model-interface/chatterbox/ChatterboxModel.cpp + ${PROJECT_SOURCE_DIR}/addon/src/model-interface/supertonic/SupertonicModel.cpp +) + +target_include_directories( + ${tts-ggml} + PRIVATE + ${PROJECT_SOURCE_DIR}/addon + ${PROJECT_SOURCE_DIR}/addon/src + ${CMAKE_BINARY_DIR}/_bare/node_modules/bare-headers/include + ${QVAC_LIB_INFERENCE_ADDON_CPP_INCLUDE_DIRS} +) + +target_link_libraries( + ${tts-ggml} + PRIVATE + tts-cpp::tts-cpp +) + +target_compile_definitions(${tts-ggml} PUBLIC JS_LOGGER) + +if(WIN32) + target_link_libraries( + ${tts-ggml} + PRIVATE + msvcrt.lib + ) +endif() + +if(BUILD_TESTING) + enable_testing() + find_package(GTest CONFIG REQUIRED) + include(GoogleTest) + + # Test exec recompiles ChatterboxModel.cpp + SupertonicModel.cpp from + # source (rather than linking the bare-module artefact) so coverage + # instrumentation lands on those translation units. JSAdapter.cpp is + # excluded because it depends on Bare's headers (bare-headers), which aren't available + # outside the bare module link. 
+ add_executable( + tts_ggml_tests + addon/tests/test_backend_utils.cpp + addon/tests/test_chatterbox_config.cpp + addon/tests/test_supertonic_config.cpp + addon/tests/AddonCppTest.cpp + addon/src/model-interface/chatterbox/ChatterboxModel.cpp + addon/src/model-interface/supertonic/SupertonicModel.cpp + ) + + target_include_directories(tts_ggml_tests + PRIVATE + ${PROJECT_SOURCE_DIR}/addon + ${PROJECT_SOURCE_DIR}/addon/src + ${QVAC_LIB_INFERENCE_ADDON_CPP_INCLUDE_DIRS} + ) + + target_link_libraries(tts_ggml_tests + PRIVATE + GTest::gmock_main + tts-cpp::tts-cpp + ) + + if(WIN32) + target_link_libraries(tts_ggml_tests PRIVATE msvcrt.lib) + endif() + + if(ENABLE_COVERAGE) + # LLVM source-based coverage (clang). Same flag set parakeet uses. + target_compile_options(tts_ggml_tests PRIVATE + -fprofile-instr-generate + -fcoverage-mapping + ) + target_link_options(tts_ggml_tests PRIVATE + -fprofile-instr-generate + ) + message(STATUS "LLVM Coverage enabled for tts-ggml tests") + endif() + + # CI invokes the test binary directly via npm scripts (so it can pass + # gtest_output= and capture results.xml); avoid running the binary + # during configure-time discovery to keep configure cheap. + add_test(NAME tts_ggml_tests COMMAND tts_ggml_tests) +endif() diff --git a/packages/tts-ggml/LICENSE b/packages/tts-ggml/LICENSE new file mode 100644 index 0000000000..7d199ae333 --- /dev/null +++ b/packages/tts-ggml/LICENSE @@ -0,0 +1,179 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + +Copyright 2026 Tether Data, S.A. de C.V. diff --git a/packages/tts-ggml/NOTICE b/packages/tts-ggml/NOTICE new file mode 100644 index 0000000000..33c7b5cea2 --- /dev/null +++ b/packages/tts-ggml/NOTICE @@ -0,0 +1,256 @@ +@qvac/tts-ggml +Copyright 2026 Tether Data, S.A. de C.V. + +This product includes third-party components under their +respective licenses. @qvac/tts-ggml itself is licensed under +Apache-2.0; bundled dependencies are governed by the licenses +listed below. + +========================================================================= +JavaScript Dependencies +========================================================================= + +--- apache-2.0 (Apache License 2.0) --- + + @hyperswarm/secret-stream@6.9.1 + https://github.com/holepunchto/hyperswarm-secret-stream + @qvac/diagnostics@0.1.1 + @qvac/dl-base@0.1.1 + @qvac/dl-hyperdrive@0.1.1 + @qvac/error@0.1.1 + @qvac/infer-base@0.1.1 + @qvac/infer-base@0.4.0 + https://github.com/tetherto/qvac + @qvac/logging@0.1.0 + @qvac/response@0.1.2 + adaptive-timeout@1.0.1 + https://github.com/holepunchto/adaptive-timeout + b4a@1.8.0 + https://github.com/holepunchto/b4a + bare-addon-resolve@1.10.0 + https://github.com/holepunchto/bare-addon-resolve + bare-ansi-escapes@2.2.3 + https://github.com/holepunchto/bare-ansi-escapes + bare-assert@1.2.0 + https://github.com/holepunchto/bare-assert + bare-buffer@3.6.0 + https://github.com/holepunchto/bare-buffer + bare-events@2.4.2 + https://github.com/holepunchto/bare-events + bare-events@2.8.2 + https://github.com/holepunchto/bare-events + bare-fs@4.7.1 + https://github.com/holepunchto/bare-fs + bare-inspect@3.1.4 + https://github.com/holepunchto/bare-inspect + bare-module-resolve@1.12.1 + https://github.com/holepunchto/bare-module-resolve + bare-os@3.8.7 + https://github.com/holepunchto/bare-os + bare-path@3.0.0 + https://github.com/holepunchto/bare-path + bare-semver@1.0.3 + https://github.com/holepunchto/bare-semver + 
bare-stream@2.13.0 + https://github.com/holepunchto/bare-stream + bare-type@1.1.0 + https://github.com/holepunchto/bare-type + bare-url@2.4.2 + https://github.com/holepunchto/bare-url + blind-relay@1.4.0 + https://github.com/holepunchto/blind-relay + compact-encoding@2.19.2 + https://github.com/holepunchto/compact-encoding + device-file@2.3.1 + https://github.com/holepunchto/device-file + events-universal@1.0.1 + https://github.com/holepunchto/events-universal + fd-lock@2.1.1 + https://github.com/holepunchto/fd-lock + fs-native-extensions@1.5.0 + https://github.com/holepunchto/fs-native-extensions + hyperblobs@2.11.1 + https://github.com/holepunchto/hyperblobs + hypercore-errors@1.5.0 + https://github.com/holepunchto/hypercore-errors + hypercore-id-encoding@1.3.0 + https://github.com/holepunchto/hypercore-id-encoding + hypercore-storage@2.8.0 + https://github.com/holepunchto/hypercore-storage + hyperdht-address@1.0.1 + https://github.com/holepunchto/hyperdht-address + hyperdrive@13.3.2 + https://github.com/holepunchto/hyperdrive + hyperschema@1.20.1 + https://github.com/holepunchto/hyperschema + index-encoder@3.5.0 + https://github.com/holepunchto/index-encoder + mirror-drive@1.14.1 + https://github.com/holepunchto/mirror-drive + noise-handshake@4.2.0 + https://github.com/holepunchto/noise-handshake + quickbit-native@2.4.8 + https://github.com/holepunchto/quickbit-native + rabin-native@2.0.0 + https://github.com/holepunchto/rabin-native + rabin-stream@2.0.0 + https://github.com/holepunchto/rabin-stream + rache@1.0.0 + https://github.com/holepunchto/rache + refcounter@1.0.0 + https://github.com/holepunchto/refcounter + require-addon@1.2.0 + https://github.com/holepunchto/require-addon + resource-on-exit@1.0.0 + https://github.com/holepunchto/bare-teardown + rocksdb-native@3.15.0 + https://github.com/holepunchto/rocksdb-native + scope-lock@1.2.4 + https://github.com/holepunchto/scope-lock + simdle-native@1.3.9 + https://github.com/holepunchto/simdle-native + 
sub-encoder@2.1.3 + https://github.com/holepunchto/sub-encoder + text-decoder@1.2.7 + https://github.com/holepunchto/text-decoder + udx-native@1.19.2 + https://github.com/holepunchto/udx-native + unslab@1.3.0 + https://github.com/holepunchto/unslab + which-runtime@1.3.2 + https://github.com/holepunchto/which-runtime + +--- isc (ISC License) --- + + bits-to-bytes@1.3.0 + https://github.com/holepunchto/bits-to-bytes + compact-encoding-bitfield@1.0.0 + https://github.com/compact-encoding/compact-encoding-bitfield + compact-encoding-net@1.2.0 + https://github.com/compact-encoding/compact-encoding-net + nanoassert@2.0.0 + https://github.com/emilbayes/nanoassert + noise-curve-ed@2.1.0 + https://github.com/chm-diederichs/noise-curve-ed + quickbit-universal@2.2.0 + https://github.com/holepunchto/quickbit-universal + simdle-universal@1.1.2 + https://github.com/holepunchto/simdle-universal + +--- mit (MIT License) --- + + big-sparse-array@1.0.3 + https://github.com/mafintosh/big-sparse-array + binary-stream-equals@1.0.0 + https://github.com/mafintosh/binary-stream-equals + bogon@1.2.0 + https://github.com/mafintosh/bogon + codecs@3.1.0 + https://github.com/mafintosh/codecs + corestore@7.9.2 + https://github.com/holepunchto/corestore + debounceify@1.1.0 + https://github.com/mafintosh/debounceify + dht-rpc@6.26.4 + https://github.com/holepunchto/dht-rpc + fast-fifo@1.3.2 + https://github.com/mafintosh/fast-fifo + flat-tree@1.13.0 + https://github.com/mafintosh/flat-tree + generate-object-property@2.0.0 + https://github.com/mafintosh/generate-object-property + generate-string@1.0.1 + https://github.com/mafintosh/generate-string + hyperbee@2.27.3 + https://github.com/holepunchto/hyperbee + hypercore@11.28.1 + https://github.com/holepunchto/hypercore + hypercore-crypto@3.6.1 + https://github.com/mafintosh/hypercore-crypto + hyperdht@6.30.0 + https://github.com/holepunchto/hyperdht + hyperswarm@4.17.0 + https://github.com/holepunchto/hyperswarm + is-options@1.0.2 + 
https://github.com/mafintosh/is-options + is-property@1.0.2 + https://github.com/mikolalysenko/is-property + kademlia-routing-table@1.0.6 + https://github.com/mafintosh/kademlia-routing-table + mutexify@1.4.0 + https://github.com/mafintosh/mutexify + nat-sampler@1.0.1 + https://github.com/mafintosh/nat-sampler + protocol-buffers-encodings@1.2.0 + https://github.com/mafintosh/protocol-buffers-encodings + protomux@3.10.1 + https://github.com/mafintosh/protomux + queue-tick@1.0.1 + https://github.com/mafintosh/queue-tick + random-array-iterator@1.0.0 + https://github.com/mafintosh/random-array-iterator + ready-resource@1.2.0 + https://github.com/holepunchto/ready-resource + record-cache@1.2.0 + https://github.com/mafintosh/record-cache + resolve-reject-promise@1.1.0 + https://github.com/mafintosh/resolve-reject-promise + safety-catch@1.0.3 + https://github.com/mafintosh/safety-catch + same-data@1.0.0 + https://github.com/mafintosh/same-data + shuffled-priority-queue@2.1.0 + https://github.com/mafintosh/shuffled-priority-queue + signal-promise@1.0.3 + https://github.com/mafintosh/signal-promise + signed-varint@2.0.1 + https://github.com/dominictarr/signed-varint + sodium-native@5.1.0 + https://github.com/holepunchto/sodium-native + sodium-secretstream@1.2.0 + https://github.com/mafintosh/sodium-secretstream + sodium-universal@5.0.1 + https://github.com/holepunchto/sodium-universal + speedometer@1.1.0 + https://github.com/mafintosh/speedometer + streamx@2.25.0 + https://github.com/mafintosh/streamx + teex@1.0.1 + https://github.com/mafintosh/teex + test-tmp@1.4.0 + https://github.com/mafintosh/test-tmp + time-ordered-set@2.0.1 + https://github.com/mafintosh/time-ordered-set + timeout-refresh@2.0.1 + https://github.com/mafintosh/timeout-refresh + unix-path-resolve@1.0.2 + https://github.com/mafintosh/unix-path-resolve + unordered-set@2.0.1 + https://github.com/mafintosh/unordered-set + varint@5.0.0 + https://github.com/chrisdickinson/varint + xache@1.2.1 + 
https://github.com/mafintosh/xache + z32@1.1.0 + https://github.com/mafintosh/z32 + + +========================================================================= +C++ Dependencies +========================================================================= + +--- apache-2.0-with-llvm-exception --- + + libc++ (LLVM C++ Standard Library) + https://github.com/llvm/llvm-project + +--- bsd-3-clause (BSD 3-Clause License) --- + + gtest + https://github.com/google/googletest + +--- mit (MIT License) --- + + tts-cpp + https://github.com/GustavoA1604/chatterbox.cpp + diff --git a/packages/tts-ggml/README.md b/packages/tts-ggml/README.md new file mode 100644 index 0000000000..1809c77d54 --- /dev/null +++ b/packages/tts-ggml/README.md @@ -0,0 +1,351 @@ +# @qvac/tts-ggml + +Text-to-speech Bare addon backed by the [`qvac-tts.cpp`][qvac-tts-cpp] +GGML library. Currently ships the **Chatterbox Turbo English** model; +additional engines will land under the same package as the upstream +library grows. + +Runs in-process with a persistent native engine — the GGUFs, the S3Gen +preload, the ggml backend, and any voice-conditioning tensors are +loaded once and reused across every synthesis call. GPU acceleration +(Metal on macOS/iOS, Vulkan on Linux/Windows/Android, CUDA when built) +is enabled by default; falls back to CPU if no GPU backend is available. + +[qvac-tts-cpp]: https://github.com/tetherto/qvac-ext-lib-whisper.cpp/tree/master/tts-cpp + +## Features + +- Batch synthesis (`run({ input })` → single PCM buffer). +- **Sentence-granularity streaming** — `runStreaming(asyncIterable)`: + yields one audio chunk per input sentence. +- **Native per-chunk streaming** — set `streamChunkTokens` and audio + flows out of the C++ engine chunk-by-chunk as T3 tokens produce + S3Gen+HiFT output; sub-second first-audio-out inside a single + utterance. +- **Voice cloning** from a reference wav (or a pre-baked profile dir). +- **GPU-by-default**, CPU selectable via `config.useGPU: false`. 
+- **Cancellation** via `model.cancel()` — stops T3 decode on the next + token; in-flight S3Gen chunk runs to completion. + +## Install + +```bash +npm install @qvac/tts-ggml +``` + +Requires [Bare](https://github.com/holepunchto/bare) `>=1.19.0`. +Prebuilds are published for darwin-arm64, android-arm64, ios-arm64; +Linux x64 / Windows prebuilds coming as demand warrants. If your +platform has no prebuild the package falls back to a local build via +`bare-make` + `cmake-vcpkg` (see [Build from source](#build-from-source)). + +## Model files + +Two engines are wrapped, each with its own GGUF layout under `models/`: + +``` +# Chatterbox turbo (English) +chatterbox-t3-turbo.gguf (~742 MB) — T3 GPT-2 Medium + BPE + VoiceEncoder +chatterbox-s3gen.gguf (~1.0 GB) — S3Gen encoder/CFM + HiFT + CAMPPlus + S3TokenizerV2 + +# Chatterbox multilingual (en/es/fr/de/pt/it/zh/ja/ko/...) +chatterbox-t3-mtl.gguf (~1.0 GB) +chatterbox-s3gen-mtl.gguf (~1.0 GB) + +# Supertonic English (Supertone/supertonic; 44.1 kHz, voice baked in) +supertonic.gguf (~263 MB) + +# Supertonic multilingual (Supertone/supertonic-2; en/ko/es/pt/fr) +supertonic2.gguf (~263 MB) +``` + +The package converts these from upstream Resemble Chatterbox / Supertone +checkpoints via a Python venv pipeline: + +```bash +npm run setup-models # creates ./venv, installs requirements.txt, runs convert-models.sh +``` + +Or step-by-step: + +```bash +npm run setup:venv +npm run convert-models +``` + +Point the addon at a custom location via `files.modelDir` (engine +auto-detected from the gguf filenames present), or pass explicit +`files.t3Model` + `files.s3genModel` (Chatterbox) / +`files.supertonicModel` (Supertonic). 
+ +## Quick start + +```js +const TTSGgml = require('@qvac/tts-ggml') + +const model = new TTSGgml({ + files: { modelDir: './models' }, // contains chatterbox-{t3-turbo,s3gen}.gguf + config: { language: 'en' }, + opts: { stats: true } +}) + +await model.load() + +const response = await model.run({ + type: 'text', + input: 'Hello from qvac tts ggml.' +}) + +let pcm = [] +await response + .onUpdate(data => { + if (data && data.outputArray) pcm = pcm.concat(Array.from(data.outputArray)) + }) + .await() + +// pcm is Int16 mono @ 24 kHz +await model.unload() +``` + +## Streaming + +### Sentence streaming — `runStreaming(asyncIter)` + +Use when your text arrives as discrete sentences (e.g. buffered LLM +output) and you want the audio to flow sentence-by-sentence. One +`onUpdate` event per input yield. + +```js +async function * sentencesOverTime () { + yield 'First sentence.' + await new Promise(r => setTimeout(r, 200)) + yield 'The second arrives shortly after.' +} + +const response = await model.runStreaming(sentencesOverTime()) +await response.onUpdate(data => { + // data.outputArray — Int16 PCM for this sentence's audio + // data.chunkIndex — 0-based index of the yielded sentence + // data.sentenceChunk — the sentence text that produced this audio +}).await() +``` + +Full runnable demo (with streaming playback): +`bare examples/chatterbox-sentence-stream-tts.js` + +### Chunk streaming — `streamChunkTokens` + +Use when you want the fastest possible first-audio-out **within a +single utterance**. The C++ engine splits each synthesis into chunks +of `streamChunkTokens` speech tokens (25 ≈ 1 s of audio) and emits +audio per chunk, keeping HiFT's source cache phase-continuous across +seams so the joins are inaudible. 
+ +```js +const model = new TTSGgml({ + files: { modelDir: './models' }, + referenceAudio: './voices/jfk.wav', // optional + streamChunkTokens: 25, // ~1 s of audio per chunk + streamFirstChunkTokens: 10, // smaller first chunk = faster first-audio-out + cfmSteps: 1, // 1-step meanflow: halves CFM cost + config: { language: 'en' } +}) + +await model.load() + +const response = await model.run({ input: 'A long sentence produces many chunks...' }) +await response.onUpdate(data => { + if (data && data.outputArray) playPcmChunk(data.outputArray) +}).await() +``` + +Full runnable demo (with gapless playback via `sox` or `ffplay`): +`bare examples/chatterbox-chunk-stream-tts.js` + +## Voice cloning + +Pass a mono wav ≥ 5 s of clean speech — the engine does the loudness +normalisation (−27 LUFS), resampling, and all conditioning (VoiceEncoder, +CAMPPlus, S3TokenizerV2, mel extraction) natively at `load()` time: + +```js +const model = new TTSGgml({ + files: { modelDir: './models' }, + referenceAudio: './voices/me.wav', + config: { language: 'en' } +}) +``` + +Alternatively point at a pre-baked profile directory produced by the +upstream CLI's `--save-voice DIR` (loads `.npy` tensors; skips the +preprocessing entirely): + +```js +new TTSGgml({ + files: { modelDir: './models' }, + voiceDir: './voices/me/', +}) +``` + +When both are supplied, missing tensors in `voiceDir` are backfilled +from `referenceAudio`. 
+ +## API overview + +### Constructor — `new TTSGgml(options)` + +| Option | Type | Default | Notes | +|---------------------------|------------|------------|-------| +| `files.modelDir` | string | — | Dir containing the two GGUFs | +| `files.t3Model` | string | — | Overrides `modelDir` for T3 | +| `files.s3genModel` | string | — | Overrides `modelDir` for S3Gen | +| `referenceAudio` | string | — | Mono wav ≥ 5 s for voice cloning | +| `voiceDir` | string | — | Pre-baked voice profile | +| `seed` | number | 42 | RNG seed (CFM noise + sampling) | +| `nGpuLayers` | number | 0 / auto | Layers offloaded to GPU | +| `threads` | number | hw.concurrency capped at 4 | | +| `streamChunkTokens` | number | 0 | **>0 enables native chunk streaming** | +| `streamFirstChunkTokens` | number | = streamChunkTokens | Smaller first chunk for low first-audio-out | +| `cfmSteps` | number | 2 | 1 = faster (halved CFM cost) | +| `config.language` | string | `"en"` | Only English today | +| `config.useGPU` | boolean | `true` | Route through Metal / Vulkan / CUDA if available | +| `config.outputSampleRate` | number | 24000 | Resample native 24 kHz output | +| `opts.stats` | boolean | `false` | Populate `response.stats` with RTF etc. | +| `opts.exclusiveRun` | boolean | `false` | Serialize overlapping streaming runs | + +### Methods + +- `await model.load()` — construct the native engine (loads T3, preloads + S3Gen, bakes voice conditioning). Subsequent `run()` calls reuse all + of it. +- `await model.unload()` — release everything. Idempotent. +- `await model.reload(newConfig)` — re-create the engine with a new + config (`language`, `useGPU`, `outputSampleRate`, …). +- `await model.destroy()` — `unload()` + mark this instance dead. +- `await model.cancel()` — best-effort cancel of any in-flight run. +- `model.run({ input, type: 'text' })` → `QvacResponse`. 
+- `model.run({ input, streamOutput: true })` → sentence-chunked + synthesis driven by the JS-side sentence splitter (see + `lib/textChunker.js`). Equivalent to `runStream(input)`. +- `model.runStream(text, { locale?, maxChunkScalars? })` → same as + above, but the options read more naturally for the "split this long + string" use case. +- `model.runStreaming(textStream, opts)` → streaming input + streaming + output (see [Sentence streaming](#sentence-streaming--runstreamingasynciter)). + +### Response shape + +All `run*` methods return a `QvacResponse` (from `@qvac/infer-base`): + +```js +response.onUpdate(data => { + data.outputArray // Int16Array — 24 kHz mono PCM + data.sampleRate // 24000 + data.chunkIndex // present on sentence-streaming events only + data.sentenceChunk // present on sentence-streaming events only +}) +await response.await() + +// response.stats — only when constructor had `opts: { stats: true }` +response.stats.totalTime // seconds +response.stats.realTimeFactor // synthesis time / audio duration +response.stats.audioDurationMs +response.stats.totalSamples +response.stats.tokensPerSecond +``` + +## Examples + +Runnable demos under `examples/`: + +| Script | Demonstrates | +|---|---| +| `chatterbox-tts.js` | Batch synth + wav dump. `bare examples/chatterbox-tts.js "Hello"` | +| `chatterbox-sentence-stream-tts.js` | `runStreaming()` over an async iterator of sentences, with gapless streaming playback | +| `chatterbox-chunk-stream-tts.js` | Native per-chunk PCM streaming via `streamChunkTokens`, with gapless streaming playback | + +The two streaming examples feed PCM into a single long-running +`sox play` / `ffplay` process so chunks play back-to-back without any +per-chunk spawn gaps — install one of them (`brew install sox` or +`brew install ffmpeg` on macOS) to enable playback. Absent a player +the demos still run and write the concatenated wav. 
+ +## Testing + +```bash +npm run test:unit # mocked binding; fast +npm run test:integration # spins up the real engine; needs models +npm run test # both +``` + +Integration tests scan a few candidate `models/` directories for the +required GGUFs (see `test/utils/downloadModel.js`) and skip cleanly when +files are absent. They cover, across both engines: + +* batch synthesis with full RuntimeStats, +* sentence-level streaming (`runStream` / `run({ streamOutput: true })` + / `runStreaming` over async iterators), +* native sub-sentence chunk streaming (Chatterbox-only via + `streamChunkTokens`), +* sequential-run / fresh-instance / reload-stability behaviour, +* strict GPU-backend assertion via `response.stats.backendDevice` + + `backendId` (set `NO_GPU=true` to skip on CPU-only runners, + `QVAC_TTS_GPU_SMOKE_RELAX=1` to downgrade the strict gate to a + warning), +* multilingual Chatterbox sweep (es/fr/de/pt) via `chatterbox-mtl.test.js`, +* on darwin the Chatterbox English batch path is additionally verified + for WER against the synthesized audio (whisper-small). + +To stress-test long inputs, set `INPUT_SENTENCES=medium` (or `long`) +and re-run the integration suite — `addon.test.js` reads the env var to +pick its sentence corpus from `test/data/sentences-{medium,long}.js`. + +## Build from source + +Prerequisites: `clang` with C++20 support, CMake ≥ 3.25, +[vcpkg](https://vcpkg.io/) (set `VCPKG_ROOT`), `bare-make`. + +```bash +npm install +npx bare-make generate # configures + fetches the tts-cpp port +npx bare-make build +npx bare-make install # copies the .bare into prebuilds/<platform>-<arch>/ +``` + +The vcpkg port is hosted in +[`tetherto/qvac-registry-vcpkg`][registry] and pulls +[`qvac-tts.cpp`][qvac-tts-cpp] at a pinned REF. See +[`vcpkg-configuration.json`](./vcpkg-configuration.json) for the +baseline commit. + +GPU backends are controlled by the `tts-cpp` port's vcpkg features: +`metal` (default on osx/ios), `vulkan` (default on linux/windows/android).
+CUDA is opt-in at port-build time. + +[registry]: https://github.com/tetherto/qvac-registry-vcpkg + +## Troubleshooting + +**`t3 model not found` / `supertonic model not found`** — the paths in +`files` are wrong or the GGUFs weren't generated. Run +`npm run setup-models` (creates the Python venv and converts the +upstream checkpoints into the four / five expected GGUF files). + +**`VoiceEncoder forward failed` when passing `referenceAudio`** — +the reference wav is likely < 5 s of clean speech. Make it longer +(10–15 s gives the best similarity). + +**Crash on process exit with Metal's `[rsets->data count] == 0` +assertion** — you're running on a build *before* the `s3gen_unload()` +teardown fix; bump the `tts-cpp` port to `>= 2026-04-21` port-version. + +**Slower-than-expected RTF on darwin** — double-check that the port +was built with the `metal` feature (default) and that you're not +overriding `useGPU: false`. Also confirm your reference wav's mel +was baked (`Using C++ VoiceEncoder` / `C++ S3TokenizerV2` messages in +the log) — if voice conditioning falls back to CPU, a chunk of the +first-call overhead is visible in RTF. + +## License + +Apache-2.0. See [LICENSE](./LICENSE). diff --git a/packages/tts-ggml/addon/src/addon/AddonCpp.hpp b/packages/tts-ggml/addon/src/addon/AddonCpp.hpp new file mode 100644 index 0000000000..bb6105ef91 --- /dev/null +++ b/packages/tts-ggml/addon/src/addon/AddonCpp.hpp @@ -0,0 +1,78 @@ +#pragma once + +// Generic AddonCpp helper used by the C++ unit tests in addon/tests/. +// +// The production AddonJs path (addon/src/addon/AddonJs.hpp) wires up +// js_value_t* output handlers tied to the Bare runtime; the C++ test +// suite needs an equivalent factory that uses the pure-C++ +// CppQueuedOutputHandlers instead so tests can pop synthesis results, +// stats, and errors without spinning up an embedded JS engine. +// +// The helper is generic over `IModel` so AddonCppTest can drive it with +// a mock model (e.g.
BlockingBusyModel) — constructing a real +// ChatterboxModel / SupertonicModel from C++ requires a real GGUF on +// disk and is therefore covered by the QVAC_TEST_GGUF-gated +// integration tests in test_chatterbox_config.cpp / test_supertonic_config.cpp. + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace qvac::ttsggml { + +struct AddonInstance { + std::unique_ptr addon; + std::shared_ptr< + qvac_lib_inference_addon_cpp::out_handl::CppQueuedOutputHandler< + std::vector>> + audioOutput; + std::shared_ptr< + qvac_lib_inference_addon_cpp::out_handl::CppQueuedOutputHandler< + qvac_lib_inference_addon_cpp::RuntimeStats>> + statsOutput; + std::shared_ptr< + qvac_lib_inference_addon_cpp::out_handl::CppQueuedOutputHandler< + qvac_lib_inference_addon_cpp::Output::Error>> + errorOutput; +}; + +inline AddonInstance createInstance( + std::unique_ptr model) { + using namespace qvac_lib_inference_addon_cpp; + using namespace std; + + auto audioOutput = + make_shared>>(); + auto statsOutput = + make_shared>(); + auto errorOutput = + make_shared>(); + + out_handl::OutputHandlers> + outputHandlers; + outputHandlers.add(audioOutput); + outputHandlers.add(statsOutput); + outputHandlers.add(errorOutput); + + unique_ptr callback = + make_unique(std::move(outputHandlers)); + + auto addon = + make_unique(std::move(callback), std::move(model)); + + return { + std::move(addon), + std::move(audioOutput), + std::move(statsOutput), + std::move(errorOutput)}; +} + +} diff --git a/packages/tts-ggml/addon/src/addon/AddonJs.hpp b/packages/tts-ggml/addon/src/addon/AddonJs.hpp new file mode 100644 index 0000000000..17495e261d --- /dev/null +++ b/packages/tts-ggml/addon/src/addon/AddonJs.hpp @@ -0,0 +1,211 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "js-interface/JSAdapter.hpp" +#include 
"model-interface/chatterbox/ChatterboxModel.hpp" +#include "model-interface/supertonic/SupertonicModel.hpp" + +namespace qvac::ttsggml::addon_js { + +namespace js = qvac_lib_inference_addon_cpp::js; + +using chatterbox::ChatterboxModel; +using supertonic::SupertonicModel; + +struct JsAudioOutputHandler + : qvac_lib_inference_addon_cpp::out_handl::JsBaseOutputHandler< + std::vector> { + explicit JsAudioOutputHandler(int sampleRate) + : qvac_lib_inference_addon_cpp::out_handl::JsBaseOutputHandler< + std::vector>( + [this, sampleRate]( + const std::vector& data) -> js_value_t* { + auto result = js::Object::create(this->env_); + std::span outputSpan(data.data(), data.size()); + auto typedArray = + js::TypedArray::create(this->env_, outputSpan); + result.setProperty(this->env_, "outputArray", typedArray); + result.setProperty( + this->env_, "sampleRate", + js::Number::create(this->env_, sampleRate)); + return result; + }) {} +}; + +struct StreamingPcmChunk { + std::vector pcm; + int chunkIndex = 0; + bool isLast = false; +}; + +struct JsStreamingPcmHandler + : qvac_lib_inference_addon_cpp::out_handl::JsBaseOutputHandler< + StreamingPcmChunk> { + explicit JsStreamingPcmHandler(int sampleRate) + : qvac_lib_inference_addon_cpp::out_handl::JsBaseOutputHandler< + StreamingPcmChunk>( + [this, sampleRate](const StreamingPcmChunk& chunk) -> js_value_t* { + auto result = js::Object::create(this->env_); + std::span outputSpan(chunk.pcm.data(), chunk.pcm.size()); + auto typedArray = + js::TypedArray::create(this->env_, outputSpan); + result.setProperty(this->env_, "outputArray", typedArray); + result.setProperty( + this->env_, "sampleRate", + js::Number::create(this->env_, sampleRate)); + result.setProperty( + this->env_, "chunkIndex", + js::Number::create(this->env_, chunk.chunkIndex)); + result.setProperty( + this->env_, "isLast", + js::Boolean::create(this->env_, chunk.isLast)); + return result; + }) {} +}; + +inline js_value_t* createInstance(js_env_t* env, js_callback_info_t* 
info) try { + using namespace qvac_lib_inference_addon_cpp; + using namespace std; + + JsArgsParser args(env, info); + auto configurationParams = args.getJsObject(1, "configurationParams"); + + JSAdapter adapter; + const EngineType engineType = adapter.readEngineType(configurationParams, env); + + unique_ptr model; + int sampleRate = 24000; + + if (engineType == EngineType::Supertonic) { + auto cfg = adapter.buildSupertonicConfig(configurationParams, env); + auto stm = make_unique(std::move(cfg)); + sampleRate = stm->sampleRate(); + model = std::move(stm); + } else { + auto cfg = adapter.buildChatterboxConfig(configurationParams, env); + sampleRate = 24000; + model = make_unique(std::move(cfg)); + } + + out_handl::OutputHandlers outHandlers; + outHandlers.add(make_shared(sampleRate)); + outHandlers.add(make_shared(sampleRate)); + unique_ptr callback = make_unique( + env, args.get(0, "jsHandle"), args.getFunction(2, "outputCallback"), + std::move(outHandlers)); + + auto addon = make_unique(env, std::move(callback), std::move(model)); + return JsInterface::createInstance(env, std::move(addon)); +} +JSCATCH + +inline js_value_t* runJob(js_env_t* env, js_callback_info_t* info) try { + using namespace qvac_lib_inference_addon_cpp; + using namespace std; + + JsArgsParser args(env, info); + AddonJs& instance = JsInterface::getInstance(env, args.get(0, "instance")); + auto [type, jsInput] = JsInterface::getInput(args); + + if (type != "text") { + throw qvac_errors::StatusError( + qvac_errors::general_error::InvalidArgument, + "Unknown input type: " + type); + } + + if (auto* st = dynamic_cast(&instance.addonCpp->model.get())) { + SupertonicModel::AnyInput modelInput; + modelInput.text = js::String(env, jsInput).as(env); + return instance.runJob(std::any(std::move(modelInput))); + } + + ChatterboxModel::AnyInput modelInput; + modelInput.text = js::String(env, jsInput).as(env); + + auto outputQueue = instance.addonCpp->outputQueue; + modelInput.chunkCallback = [outputQueue]( 
+ std::vector&& pcm, int chunkIndex, bool isLast) { + StreamingPcmChunk chunk{std::move(pcm), chunkIndex, isLast}; + outputQueue->queueResult(std::any(std::move(chunk))); + }; + + return instance.runJob(std::any(std::move(modelInput))); +} +JSCATCH + +// Async wrapper around AddonCpp::activate() so the deferred GGUF parse +// (ChatterboxModel / SupertonicModel construct without loading; the +// real load happens in waitForLoadInitialization() via IModelAsyncLoad) +// runs on a JsAsyncTask worker thread instead of stalling the JS event +// loop. Replaces the default sync JsInterface::activate registration in +// binding.cpp. +inline js_value_t* activate(js_env_t* env, js_callback_info_t* info) try { + using namespace qvac_lib_inference_addon_cpp; + + JsArgsParser args(env, info); + AddonJs& instance = JsInterface::getInstance(env, args.get(0, "instance")); + + return js::JsAsyncTask::run( + env, [addonCpp = instance.addonCpp]() { addonCpp->activate(); }); +} +JSCATCH + +inline js_value_t* reload(js_env_t* env, js_callback_info_t* info) try { + using namespace qvac_lib_inference_addon_cpp; + using namespace std; + + JsArgsParser args(env, info); + AddonJs& instance = JsInterface::getInstance(env, args.get(0, "instance")); + auto configurationParams = args.getJsObject(1, "configurationParams"); + JSAdapter adapter; + + if (auto* st = dynamic_cast(&instance.addonCpp->model.get())) { + auto newCfg = adapter.buildSupertonicConfig(configurationParams, env); + return js::JsAsyncTask::run( + env, + [addonCpp = instance.addonCpp, newCfg = std::move(newCfg)]() mutable { + auto* stm = + dynamic_cast(&addonCpp->model.get()); + if (stm == nullptr) { + throw qvac_errors::StatusError( + qvac_errors::general_error::InternalError, + "reload: model is not a SupertonicModel"); + } + stm->setConfig(std::move(newCfg)); + stm->reload(); + }); + } + + auto newCfg = adapter.buildChatterboxConfig(configurationParams, env); + return js::JsAsyncTask::run( + env, + [addonCpp = 
instance.addonCpp, newCfg = std::move(newCfg)]() mutable { + auto* chatterbox = + dynamic_cast(&addonCpp->model.get()); + if (chatterbox == nullptr) { + throw qvac_errors::StatusError( + qvac_errors::general_error::InternalError, + "reload: model is not a ChatterboxModel"); + } + chatterbox->setConfig(std::move(newCfg)); + chatterbox->reload(); + }); +} +JSCATCH + +} diff --git a/packages/tts-ggml/addon/src/addon/TTSErrors.hpp b/packages/tts-ggml/addon/src/addon/TTSErrors.hpp new file mode 100644 index 0000000000..bf497ed258 --- /dev/null +++ b/packages/tts-ggml/addon/src/addon/TTSErrors.hpp @@ -0,0 +1,55 @@ +#pragma once + +#include +#include + +#include "inference-addon-cpp/Errors.hpp" + +namespace qvac_errors { + +namespace tts_error { + +constexpr std::string_view TTSAddonId = + /* NOLINT(readability-identifier-naming) */ "TTS"; + +enum TTSErrorCode : uint32_t { + OK = 0, + ModelNotLoaded = 1, + ModelFileNotFound = 2, + ConfigFileNotFound = 3, + InvalidAPI = 4, + InitializationFailed = 5, + SynthesisFailed = 6, +}; + +inline std::string toString(uint32_t code) { + switch (code) { + case 0: + return "OK"; + case 1: + return "ModelNotLoaded"; + case 2: + return "ModelFileNotFound"; + case 3: + return "ConfigFileNotFound"; + case 4: + return "InvalidAPI"; + case 5: + return "InitializationFailed"; + case 6: + return "SynthesisFailed"; + default: + return "UnknownTTSError"; + } +} + +} // namespace tts_error + +// Convenience function to create TTS-specific StatusError +inline StatusError createTTSError(tts_error::TTSErrorCode code, + const std::string &message) { + return StatusError(std::string(tts_error::TTSAddonId), + tts_error::toString(code), message); +} + +} // namespace qvac_errors \ No newline at end of file diff --git a/packages/tts-ggml/addon/src/js-interface/JSAdapter.cpp b/packages/tts-ggml/addon/src/js-interface/JSAdapter.cpp new file mode 100644 index 0000000000..3242e9742c --- /dev/null +++ b/packages/tts-ggml/addon/src/js-interface/JSAdapter.cpp 
@@ -0,0 +1,149 @@ +#include "js-interface/JSAdapter.hpp" + +#include +#include + +#include "inference-addon-cpp/Errors.hpp" + +namespace qvac::ttsggml { + +namespace js = qvac_lib_inference_addon_cpp::js; +namespace general_error = qvac_errors::general_error; + +namespace { + +std::optional readOptionalInt( + js::Object obj, js_env_t* env, const char* key) { + js_value_t* raw = obj.getProperty(env, key); + if (js::is(env, raw) || js::is(env, raw)) { + return std::nullopt; + } + if (js::is(env, raw)) { + return static_cast(js::Number::fromValue(raw).as(env)); + } + if (js::is(env, raw)) { + const std::string str = js::String::fromValue(raw).as(env); + try { + return std::stoi(str); + } catch (const std::exception&) { + throw qvac_errors::StatusError( + general_error::InvalidArgument, + std::string("Property '") + key + + "' must be an integer (got non-numeric string \"" + str + "\")"); + } + } + throw qvac_errors::StatusError( + general_error::InvalidArgument, + std::string("Property '") + key + "' must be a number or numeric string"); +} + +std::optional readOptionalFloat( + js::Object obj, js_env_t* env, const char* key) { + js_value_t* raw = obj.getProperty(env, key); + if (js::is(env, raw) || js::is(env, raw)) { + return std::nullopt; + } + if (js::is(env, raw)) { + return static_cast(js::Number::fromValue(raw).as(env)); + } + if (js::is(env, raw)) { + const std::string str = js::String::fromValue(raw).as(env); + try { + return std::stof(str); + } catch (const std::exception&) { + throw qvac_errors::StatusError( + general_error::InvalidArgument, + std::string("Property '") + key + + "' must be a number (got non-numeric string \"" + str + "\")"); + } + } + throw qvac_errors::StatusError( + general_error::InvalidArgument, + std::string("Property '") + key + "' must be a number or numeric string"); +} + +std::string readOptionalString( + js::Object obj, js_env_t* env, const char* key) { + auto v = obj.getOptionalPropertyAs(env, key); + return 
v.value_or(std::string{}); +} + +std::optional readOptionalBool( + js::Object obj, js_env_t* env, const char* key) { + return obj.getOptionalPropertyAs(env, key); +} + +} + +EngineType JSAdapter::readEngineType( + js::Object configurationParams, js_env_t* env) { + const std::string explicitType = + readOptionalString(configurationParams, env, "engineType"); + if (explicitType == "chatterbox") return EngineType::Chatterbox; + if (explicitType == "supertonic") return EngineType::Supertonic; + if (!explicitType.empty()) { + throw qvac_errors::StatusError( + general_error::InvalidArgument, + "engineType must be 'chatterbox' or 'supertonic' (got '" + + explicitType + "')"); + } + + const std::string supertonicPath = + readOptionalString(configurationParams, env, "supertonicModelPath"); + if (!supertonicPath.empty()) return EngineType::Supertonic; + + const std::string t3Path = + readOptionalString(configurationParams, env, "t3ModelPath"); + if (!t3Path.empty()) return EngineType::Chatterbox; + + return EngineType::Chatterbox; +} + +chatterbox::ChatterboxConfig JSAdapter::buildChatterboxConfig( + js::Object configurationParams, js_env_t* env) { + chatterbox::ChatterboxConfig cfg; + cfg.t3ModelPath = readOptionalString(configurationParams, env, "t3ModelPath"); + cfg.s3genModelPath = readOptionalString(configurationParams, env, "s3genModelPath"); + { + auto lang = readOptionalString(configurationParams, env, "language"); + if (!lang.empty()) cfg.language = std::move(lang); + } + cfg.referenceAudio = readOptionalString(configurationParams, env, "referenceAudio"); + cfg.voiceDir = readOptionalString(configurationParams, env, "voiceDir"); + cfg.seed = readOptionalInt(configurationParams, env, "seed"); + cfg.threads = readOptionalInt(configurationParams, env, "threads"); + cfg.nGpuLayers = readOptionalInt(configurationParams, env, "nGpuLayers"); + cfg.outputSampleRate = readOptionalInt(configurationParams, env, "outputSampleRate"); + cfg.streamChunkTokens = 
readOptionalInt(configurationParams, env, "streamChunkTokens"); + cfg.streamFirstChunkTokens = readOptionalInt(configurationParams, env, "streamFirstChunkTokens"); + cfg.streamCfmSteps = readOptionalInt(configurationParams, env, "cfmSteps"); + // useGPU is tri-state on the C++ side: std::nullopt means "unspecified" + // (let the engine pick its default); true/false are explicit user + // intent. ChatterboxModel::validateConfig rejects useGPU/nGpuLayers + // conflicts, and toEngineOptions translates explicit-false into + // n_gpu_layers=0 so CPU is actually forced. + cfg.useGpu = readOptionalBool(configurationParams, env, "useGPU"); + return cfg; +} + +supertonic::SupertonicConfig JSAdapter::buildSupertonicConfig( + js::Object configurationParams, js_env_t* env) { + supertonic::SupertonicConfig cfg; + cfg.modelGgufPath = readOptionalString(configurationParams, env, "supertonicModelPath"); + cfg.voice = readOptionalString(configurationParams, env, "voice"); + { + auto lang = readOptionalString(configurationParams, env, "language"); + if (!lang.empty()) cfg.language = std::move(lang); + } + cfg.steps = readOptionalInt(configurationParams, env, "steps"); + cfg.speed = readOptionalFloat(configurationParams, env, "speed"); + cfg.seed = readOptionalInt(configurationParams, env, "seed"); + cfg.threads = readOptionalInt(configurationParams, env, "threads"); + cfg.nGpuLayers = readOptionalInt(configurationParams, env, "nGpuLayers"); + cfg.outputSampleRate = readOptionalInt(configurationParams, env, "outputSampleRate"); + cfg.useGpu = readOptionalBool(configurationParams, env, "useGPU"); + cfg.noiseNpyPath = readOptionalString(configurationParams, env, "noiseNpyPath"); + return cfg; +} + +} diff --git a/packages/tts-ggml/addon/src/js-interface/JSAdapter.hpp b/packages/tts-ggml/addon/src/js-interface/JSAdapter.hpp new file mode 100644 index 0000000000..6f9467e0e9 --- /dev/null +++ b/packages/tts-ggml/addon/src/js-interface/JSAdapter.hpp @@ -0,0 +1,35 @@ +#pragma once + 
+#include + +#include +#include + +#include "model-interface/chatterbox/ChatterboxConfig.hpp" +#include "model-interface/supertonic/SupertonicConfig.hpp" + +namespace qvac::ttsggml { + +enum class EngineType { + Chatterbox, + Supertonic, +}; + +class JSAdapter { +public: + JSAdapter() = default; + + EngineType readEngineType( + qvac_lib_inference_addon_cpp::js::Object configurationParams, + js_env_t* env); + + chatterbox::ChatterboxConfig buildChatterboxConfig( + qvac_lib_inference_addon_cpp::js::Object configurationParams, + js_env_t* env); + + supertonic::SupertonicConfig buildSupertonicConfig( + qvac_lib_inference_addon_cpp::js::Object configurationParams, + js_env_t* env); +}; + +} diff --git a/packages/tts-ggml/addon/src/js-interface/binding.cpp b/packages/tts-ggml/addon/src/js-interface/binding.cpp new file mode 100644 index 0000000000..ce1e435e64 --- /dev/null +++ b/packages/tts-ggml/addon/src/js-interface/binding.cpp @@ -0,0 +1,41 @@ +#include + +#include "addon/AddonJs.hpp" + +// NOLINTBEGIN(cppcoreguidelines-macro-usage,readability-function-cognitive-complexity,modernize-use-trailing-return-type,readability-identifier-naming) +auto qvac_tts_ggml_exports(js_env_t* env, js_value_t* exports) -> js_value_t* { + +#define V(name, fn) \ + { \ + js_value_t* val; \ + if (js_create_function(env, name, -1, fn, nullptr, &val) != 0) { \ + return nullptr; \ + } \ + if (js_set_named_property(env, exports, name, val) != 0) { \ + return nullptr; \ + } \ + } + + V("createInstance", qvac::ttsggml::addon_js::createInstance) + V("runJob", qvac::ttsggml::addon_js::runJob) + V("reload", qvac::ttsggml::addon_js::reload) + // Override the framework's sync JsInterface::activate with our + // JsAsyncTask::run-wrapped version so the deferred GGUF parse + // (IModelAsyncLoad::waitForLoadInitialization) runs on a worker thread. 
+ V("activate", qvac::ttsggml::addon_js::activate) + + V("loadWeights", qvac_lib_inference_addon_cpp::JsInterface::loadWeights) + V("cancel", qvac_lib_inference_addon_cpp::JsInterface::cancel) + V("destroyInstance", + qvac_lib_inference_addon_cpp::JsInterface::destroyInstance) + V("setLogger", qvac_lib_inference_addon_cpp::JsInterface::setLogger) + V("releaseLogger", + qvac_lib_inference_addon_cpp::JsInterface::releaseLogger) + +#undef V + + return exports; +} + +BARE_MODULE(qvac_tts_ggml, qvac_tts_ggml_exports) +// NOLINTEND(cppcoreguidelines-macro-usage,readability-function-cognitive-complexity,modernize-use-trailing-return-type,readability-identifier-naming) diff --git a/packages/tts-ggml/addon/src/model-interface/BackendUtils.hpp b/packages/tts-ggml/addon/src/model-interface/BackendUtils.hpp new file mode 100644 index 0000000000..f1cba0c9e8 --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/BackendUtils.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "tts-cpp/backend.h" + +namespace qvac::ttsggml { + +inline int backendIdFromName(const std::string& name) { + if (name == "CPU") return 0; + if (name.rfind("Metal", 0) == 0 || name.rfind("MTL", 0) == 0) return 1; + if (name.rfind("CUDA", 0) == 0) return 2; + if (name.rfind("Vulkan", 0) == 0) return 3; + if (name.rfind("OpenCL", 0) == 0) return 4; + return 99; +} + +inline int backendDeviceCode(tts_cpp::BackendDevice d) { + return d == tts_cpp::BackendDevice::GPU ? 1 : 0; +} + +} diff --git a/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxConfig.hpp b/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxConfig.hpp new file mode 100644 index 0000000000..268a2ff6ae --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxConfig.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include +#include + +namespace qvac::ttsggml::chatterbox { + +/** + * Configuration for the Chatterbox engine wrapping tts-cpp::tts-cpp. 
+ * + * Mapped 1:1 into `tts_cpp::chatterbox::EngineOptions` by + * {@link ChatterboxModel::load} and then passed to a persistent Engine that + * owns the T3 + S3Gen + voice-conditioning state for the lifetime of the + * addon. The Engine is re-created on reload() when any of these fields + * change (e.g. a new reference voice or a flip between CPU / GPU). + */ +struct ChatterboxConfig { + /** Path to the T3 (text -> speech tokens) GGUF. */ + std::string t3ModelPath; + /** Path to the S3Gen + HiFT (speech tokens -> 24 kHz wav) GGUF. */ + std::string s3genModelPath; + /** Language code (default "en"); accepted values depend on the loaded GGUF variant (turbo English vs multilingual). */ + std::string language = "en"; + /** Voice-cloning reference wav path. */ + std::string referenceAudio; + /** Directory of baked voice-conditioning tensors (`tts-cpp --ref-dir`). */ + std::string voiceDir; + /** RNG seed for CFM initial noise + SineGen excitation. */ + std::optional seed; + /** std::thread::hardware_concurrency() override. */ + std::optional threads; + /** Layers to move to the GPU backend. 99 (or any large number) = all. */ + std::optional nGpuLayers; + /** Post-processing output sample rate. Currently unused (engine always emits 24 kHz). */ + std::optional outputSampleRate; + /** + * Tri-state GPU intent: + * - std::nullopt: unspecified, let the engine use its library default. + * - true: if nGpuLayers unset, maps to nGpuLayers=99. + * - false: if nGpuLayers unset, forces nGpuLayers=0 (CPU). + * + * Conflicts with nGpuLayers (true + 0, or false + !=0) are rejected + * by ChatterboxModel::validateConfig so callers can't silently get + * the opposite backend they asked for. + */ + std::optional useGpu; + /** + * Native streaming controls. When `streamChunkTokens > 0` and the + * caller passes a chunk callback on the job input, the engine runs + * the chunked S3Gen+HiFT loop and emits PCM per chunk (~25 tokens + * = 1 s of audio). 0 = batch synthesis. 
+ */ + std::optional streamChunkTokens; + /** Smaller first chunk for low first-audio-out latency. 0 = same as streamChunkTokens. */ + std::optional streamFirstChunkTokens; + /** CFM Euler steps for streaming chunks. 0 = library default (2). */ + std::optional streamCfmSteps; +}; + +} // namespace qvac::ttsggml::chatterbox diff --git a/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxModel.cpp b/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxModel.cpp new file mode 100644 index 0000000000..b4cc2daf17 --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxModel.cpp @@ -0,0 +1,312 @@ +#include "model-interface/chatterbox/ChatterboxModel.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "addon/TTSErrors.hpp" +#include "model-interface/BackendUtils.hpp" +#include "inference-addon-cpp/Errors.hpp" + +namespace qvac::ttsggml::chatterbox { + +namespace { + +using qvac_errors::createTTSError; +using qvac_errors::StatusError; +using qvac_errors::tts_error::TTSErrorCode; +namespace general_error = qvac_errors::general_error; + +tts_cpp::chatterbox::EngineOptions toEngineOptions(const ChatterboxConfig& cfg) { + tts_cpp::chatterbox::EngineOptions opts; + opts.t3_gguf_path = cfg.t3ModelPath; + opts.s3gen_gguf_path = cfg.s3genModelPath; + opts.reference_audio = cfg.referenceAudio; + opts.voice_dir = cfg.voiceDir; + if (!cfg.language.empty()) opts.language = cfg.language; + if (cfg.seed.has_value()) opts.seed = *cfg.seed; + if (cfg.threads.has_value()) opts.n_threads = *cfg.threads; + if (cfg.nGpuLayers.has_value()) { + opts.n_gpu_layers = *cfg.nGpuLayers; + } else if (cfg.useGpu.has_value()) { + // Explicit useGpu must produce an explicit n_gpu_layers so we don't + // depend on the tts-cpp library default flipping out from under us + // (see also: gpu-smoke.test.js asserts backendDevice from this). + opts.n_gpu_layers = *cfg.useGpu ? 
99 : 0; + } + if (cfg.streamChunkTokens.has_value()) opts.stream_chunk_tokens = *cfg.streamChunkTokens; + if (cfg.streamFirstChunkTokens.has_value()) opts.stream_first_chunk_tokens = *cfg.streamFirstChunkTokens; + if (cfg.streamCfmSteps.has_value()) opts.stream_cfm_steps = *cfg.streamCfmSteps; + return opts; +} + +std::vector pcmFloatToInt16(const float* pcm, size_t samples) { + std::vector out; + out.resize(samples); + for (size_t i = 0; i < samples; ++i) { + float s = std::clamp(pcm[i], -1.0f, 1.0f); + out[i] = static_cast(std::lround(s * 32767.0f)); + } + return out; +} + +std::vector pcmFloatToInt16(const std::vector& pcm) { + return pcmFloatToInt16(pcm.data(), pcm.size()); +} + +} // namespace + +ChatterboxModel::ChatterboxModel(ChatterboxConfig config) + : cfg_(std::move(config)) { + validateConfig(cfg_); + // Constructor deliberately does NOT call load(): GGUF parsing is the + // multi-hundred-MB step (ggml_backend_alloc_ctx_tensors + voice- + // conditioning bake) and used to stall the Bare event loop because + // qvac_lib_inference_addon_cpp::JsInterface::createInstance is + // synchronous. AddonCpp::activate() (driven by the JsAsyncTask::run + // wrapper in addon_js::activate) now calls + // waitForLoadInitialization() on a worker thread, which delegates to + // load() lazily. Direct C++ callers (and the unit-test suite in + // addon/tests/) can still invoke load() explicitly when they want + // synchronous semantics. +} + +ChatterboxModel::~ChatterboxModel() noexcept = default; + +void ChatterboxModel::validateConfig(const ChatterboxConfig& cfg) { + if (cfg.useGpu.has_value() && cfg.nGpuLayers.has_value()) { + const bool wantsGpu = *cfg.useGpu; + const int layers = *cfg.nGpuLayers; + // `layers != 0` (rather than `layers > 0`) so a llama.cpp-style + // sentinel like nGpuLayers=-1 ("offload all layers") is treated as + // "wants GPU" and doesn't falsely pass through against useGPU:false. 
+ const bool layersWantGpu = layers != 0; + if (wantsGpu != layersWantGpu) { + throw StatusError( + general_error::InvalidArgument, + std::string("ChatterboxModel: useGPU=") + + (wantsGpu ? "true" : "false") + + " conflicts with nGpuLayers=" + std::to_string(layers) + + ". Either drop one of the two, or make them agree " + "(useGPU:true + nGpuLayers!=0, or useGPU:false + nGpuLayers=0)."); + } + } + if (cfg.t3ModelPath.empty()) { + throw StatusError(general_error::InvalidArgument, "t3ModelPath is required"); + } + if (cfg.s3genModelPath.empty()) { + throw StatusError(general_error::InvalidArgument, "s3genModelPath is required"); + } + if (!std::filesystem::exists(cfg.t3ModelPath)) { + throw createTTSError(TTSErrorCode::ModelFileNotFound, "t3 model not found: " + cfg.t3ModelPath); + } + if (!std::filesystem::exists(cfg.s3genModelPath)) { + throw createTTSError(TTSErrorCode::ModelFileNotFound, "s3gen model not found: " + cfg.s3genModelPath); + } + if (!cfg.referenceAudio.empty() && + !std::filesystem::exists(cfg.referenceAudio)) { + throw createTTSError(TTSErrorCode::ModelFileNotFound, "reference audio not found: " + cfg.referenceAudio); + } + if (!cfg.voiceDir.empty()) { + if (!std::filesystem::exists(cfg.voiceDir)) { + throw createTTSError(TTSErrorCode::ModelFileNotFound, "voice dir not found: " + cfg.voiceDir); + } + if (!std::filesystem::is_directory(cfg.voiceDir)) { + throw StatusError( + general_error::InvalidArgument, + "voiceDir path exists but is not a directory: " + cfg.voiceDir); + } + } + // No JS-side allow-list of language codes: the active GGUF variant + // (turbo English vs multilingual) determines what's supported, and + // tts_cpp::chatterbox::Engine throws a clear runtime error when the + // requested language doesn't match the loaded variant. 
Forcing a + // hard-coded "en"-only check here would leak the turbo-variant + // assumption into the addon and silently reject the multilingual + // GGUFs (chatterbox-t3-mtl + chatterbox-s3gen-mtl) the converter + // pipeline already produces. +} + +void ChatterboxModel::load() { + std::lock_guard lk(engineMu_); + loadLocked(); +} + +void ChatterboxModel::unload() { + std::lock_guard lk(engineMu_); + unloadLocked(); +} + +void ChatterboxModel::reload() { + std::lock_guard lk(engineMu_); + unloadLocked(); + loadLocked(); +} + +void ChatterboxModel::loadLocked() { + if (engine_) return; + try { + engine_ = std::make_shared(toEngineOptions(cfg_)); + } catch (const std::exception& e) { + engine_.reset(); + throw createTTSError( + TTSErrorCode::InitializationFailed, + std::string("ChatterboxModel::load: ") + e.what()); + } + + backendName_ = engine_->backend_name(); + backendDevice_ = backendDeviceCode(engine_->backend_device()); + backendId_ = backendIdFromName(backendName_); +} + +void ChatterboxModel::unloadLocked() { + engine_.reset(); +} + +void ChatterboxModel::cancel() const { + cancelRequested_.store(true, std::memory_order_relaxed); + // Grab a local copy of engine_ under the lock so we can invoke + // cancel() safely even if another thread calls unload()/reload() in + // parallel. The Engine itself is responsible for making cancel() + // thread-safe against its in-flight synthesize(). + std::shared_ptr e; + { + std::lock_guard lk(engineMu_); + e = engine_; + } + if (e) e->cancel(); +} + +ChatterboxModel::SynthesizeResult ChatterboxModel::synthesize( + const std::string& text, const ChunkCallback& chunkCallback) { + // Capture the engine under the lock; keep it alive for the duration + // of synthesize() via the local `engine` shared_ptr even if reload() + // concurrently swaps a new one in. Reload's new engine takes effect + // on the NEXT synthesize call. 
+ std::shared_ptr engine; + { + std::lock_guard lk(engineMu_); + engine = engine_; + } + if (!engine) { + throw createTTSError(TTSErrorCode::ModelNotLoaded, + "ChatterboxModel::synthesize: engine not loaded"); + } + if (cancelRequested_.load(std::memory_order_relaxed)) { + throw createTTSError(TTSErrorCode::SynthesisFailed, + "synthesis cancelled before it started"); + } + + // Snapshot the streaming decision against the engine we're actually + // about to call, BEFORE process() needs it. Reading engine_ / + // engine->options() outside the lock from process() would race with + // reload() swapping a new engine in; pinning the decision here keeps + // the read tied to the local `engine` shared_ptr for the call's + // lifetime. + const bool wasStreaming = + static_cast(chunkCallback) && + engine->options().stream_chunk_tokens > 0; + + const auto tStart = std::chrono::steady_clock::now(); + + tts_cpp::chatterbox::SynthesisResult result; + try { + if (wasStreaming) { + result = engine->synthesize( + text, + [&chunkCallback](const float* pcm, std::size_t samples, + int chunkIndex, bool isLast) { + chunkCallback(pcmFloatToInt16(pcm, samples), chunkIndex, isLast); + }); + } else { + result = engine->synthesize(text); + } + } catch (const std::exception& e) { + throw createTTSError(TTSErrorCode::SynthesisFailed, + std::string("engine.synthesize: ") + e.what()); + } + + std::vector pcm = pcmFloatToInt16(result.pcm); + + const auto tEnd = std::chrono::steady_clock::now(); + const double elapsedSec = + std::chrono::duration(tEnd - tStart).count(); + + totalTime_ = elapsedSec; + totalSamples_ = static_cast(pcm.size()); + audioDurationMs_ = result.sample_rate > 0 + ? (static_cast(pcm.size()) * 1000.0 / + static_cast(result.sample_rate)) + : 0.0; + realTimeFactor_ = + audioDurationMs_ > 0 ? (elapsedSec * 1000.0) / audioDurationMs_ : 0.0; + textLength_ = text.size(); + tokensPerSecond_ = + elapsedSec > 0 ? 
static_cast(textLength_) / elapsedSec : 0.0; + + return {std::move(pcm), wasStreaming}; +} + +std::any ChatterboxModel::process(const std::any& input) { + const auto* anyInput = std::any_cast(&input); + if (anyInput == nullptr) { + throw StatusError( + general_error::InvalidArgument, + "ChatterboxModel::process: expected AnyInput (text + chunkCallback)"); + } + if (anyInput->text.empty()) { + throw StatusError( + general_error::InvalidArgument, "ChatterboxModel::process: empty text"); + } + + // Serialize concurrent process() calls. The outer JobRunner already + // queues jobs sequentially, but a direct C++ caller (or a future + // pipeline that bypasses JobRunner) could still overlap — fail fast + // with a clear error instead of data-racing on engine_ state. + bool expected = false; + if (!jobInProgress_.compare_exchange_strong( + expected, true, std::memory_order_acq_rel)) { + throw StatusError( + general_error::InvalidArgument, + "ChatterboxModel::process: another synthesis job is already in progress"); + } + struct InProgressGuard { + std::atomic_bool& flag; + ~InProgressGuard() { flag.store(false, std::memory_order_release); } + } guard{jobInProgress_}; + + cancelRequested_.store(false, std::memory_order_relaxed); + auto result = synthesize(anyInput->text, anyInput->chunkCallback); + // Streaming mode: chunks have already been published via chunkCallback + // → OutputQueue. Returning the concatenated PCM here would cause a + // duplicate final `outputArray` event after all the chunks. Return an + // empty std::any so no output handler matches — JobRunner still emits + // JobEnded with runtimeStats on its own. We trust the wasStreaming + // bit captured under the engine lock inside synthesize() rather than + // re-reading engine_ here (which would race with a concurrent + // reload()). 
+ if (result.wasStreaming) return {}; + return std::any(std::move(result.pcm)); +} + +qvac_lib_inference_addon_cpp::RuntimeStats ChatterboxModel::runtimeStats() const { + qvac_lib_inference_addon_cpp::RuntimeStats stats; + stats.emplace_back("totalTime", totalTime_); + stats.emplace_back("tokensPerSecond", tokensPerSecond_); + stats.emplace_back("realTimeFactor", realTimeFactor_); + stats.emplace_back("audioDurationMs", audioDurationMs_); + stats.emplace_back("totalSamples", totalSamples_); + stats.emplace_back("backendDevice", static_cast(backendDevice_)); + stats.emplace_back("backendId", static_cast(backendId_)); + return stats; +} + +} // namespace qvac::ttsggml::chatterbox diff --git a/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxModel.hpp b/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxModel.hpp new file mode 100644 index 0000000000..d24a48b7ee --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/chatterbox/ChatterboxModel.hpp @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "inference-addon-cpp/ModelInterfaces.hpp" +#include "inference-addon-cpp/RuntimeStats.hpp" + +#include "model-interface/chatterbox/ChatterboxConfig.hpp" + +namespace tts_cpp::chatterbox { +class Engine; +} // namespace tts_cpp::chatterbox + +namespace qvac::ttsggml::chatterbox { + +/** + * IModel implementation that wraps the tts-cpp::tts-cpp static library + * (Chatterbox English GGUF). Holds a persistent + * `tts_cpp::chatterbox::Engine` so that each {@link process} call pays only + * the T3 autoregressive decode + S3Gen + HiFT synthesis cost. The T3 GGUF, + * S3Gen GGUF, and voice-conditioning tensors are loaded once in {@link load} + * and reused until {@link unload} / destruction. 
+ * + * Constructor only validates the config + records the deferred load + * closure; the actual GGUF parse runs lazily on the first + * {@link waitForLoadInitialization} or {@link load} call. The JS + * binding wraps `addon.activate()` (which calls + * `waitForLoadInitialization`) inside `JsAsyncTask::run` so the + * multi-hundred-MB ggml parse happens on a worker thread instead of + * stalling the JS event loop. + */ +class ChatterboxModel + : public qvac_lib_inference_addon_cpp::model::IModel, + public qvac_lib_inference_addon_cpp::model::IModelCancel, + public qvac_lib_inference_addon_cpp::model::IModelAsyncLoad { +public: + using Input = std::string; + using InputView = std::string_view; + using Output = std::vector; + + /** + * Per-chunk callback used when native streaming is enabled. Receives + * each chunk's PCM (already converted to 16-bit) plus its 0-based index + * and an `isLast` flag. Wired from the JS binding onto + * addonCpp->outputQueue so every chunk materialises as an onUpdate + * event on the JS side (same pattern as qvac-lib-infer-llamacpp-llm's + * per-token outputCallback). + */ + using ChunkCallback = std::function< + void(std::vector&& pcm, int chunkIndex, bool isLast)>; + + struct AnyInput { + std::string text; + /** Non-empty = native streaming; empty = batch. The engine also needs `streamChunkTokens > 0` in its construction config. */ + ChunkCallback chunkCallback; + }; + + explicit ChatterboxModel(ChatterboxConfig config); + ~ChatterboxModel() noexcept override; + + // IModel + std::string getName() const override { return "ChatterboxModel"; } + std::any process(const std::any& input) override; + qvac_lib_inference_addon_cpp::RuntimeStats runtimeStats() const override; + + // IModelCancel — flips a cancellation flag on the underlying engine; the + // T3 decode loop checks it per token and throws out of synthesize() on + // the next iteration. S3Gen + HiFT is not yet cancellable mid-chunk; + // that lands with the streaming milestone. 
+ void cancel() const override; + + void load(); + void unload(); + void reload(); + bool isLoaded() const { + std::lock_guard lk(engineMu_); + return static_cast(engine_); + } + + // IModelAsyncLoad — invoked by AddonCpp::activate() (which the JS + // binding wraps in JsAsyncTask::run, see addon_js::activate in + // AddonJs.hpp). Calls load() lazily on the worker thread; idempotent + // because loadLocked() returns early if engine_ is already set. + void waitForLoadInitialization() override { load(); } + // Not supported: tts-ggml loads GGUFs from on-disk paths configured at + // construction time, not from incremental byte streams. + void setWeightsForFile( + const std::string&, + std::unique_ptr>&&) override {} + + void setConfig(ChatterboxConfig config) { cfg_ = std::move(config); } + const ChatterboxConfig& config() const { return cfg_; } + +private: + struct SynthesizeResult { + Output pcm; + /** True iff synthesize() routed through the chunk-streaming path + * (chunks already published via chunkCallback / OutputQueue). + * Captured under the engine lock so process() doesn't have to + * re-read engine_ state outside the lock to make the streaming + * decision. */ + bool wasStreaming = false; + }; + SynthesizeResult synthesize(const std::string& text, + const ChunkCallback& chunkCallback); + static void validateConfig(const ChatterboxConfig& cfg); + + // Called under `engineMu_`. + void loadLocked(); + void unloadLocked(); + + ChatterboxConfig cfg_; + + // `engine_` is read by `cancel()` (which can be invoked from any + // thread) while `load()` / `unload()` / `reload()` mutate it from the + // job thread — guard both reads and writes with this mutex. We keep a + // `shared_ptr` so `cancel()` (and the long-running `synthesize()`) can + // take a cheap local copy under the lock and then work outside it. 
+ mutable std::mutex engineMu_; + std::shared_ptr engine_; + + // Rejects concurrent `process()` invocations; the outer JobRunner also + // serializes jobs, but belt-and-suspenders enforcement here keeps + // direct C++ callers honest too. + std::atomic_bool jobInProgress_{false}; + + double totalTime_ = 0.0; + double audioDurationMs_ = 0.0; + int64_t totalSamples_ = 0; + double realTimeFactor_ = 0.0; + double tokensPerSecond_ = 0.0; + size_t textLength_ = 0; + + int backendDevice_ = 0; + int backendId_ = 0; + std::string backendName_ = "CPU"; + + mutable std::atomic_bool cancelRequested_{false}; +}; + +} // namespace qvac::ttsggml::chatterbox diff --git a/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicConfig.hpp b/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicConfig.hpp new file mode 100644 index 0000000000..39d3cf1225 --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicConfig.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace qvac::ttsggml::supertonic { + +struct SupertonicConfig { + std::string modelGgufPath; + std::string voice; + std::string language = "en"; + std::optional steps; + std::optional speed; + std::optional seed; + std::optional threads; + std::optional nGpuLayers; + std::optional outputSampleRate; + /** + * Tri-state GPU intent (mirrors ChatterboxConfig::useGpu): + * - std::nullopt: unspecified, let the engine use its library default. + * - true: if nGpuLayers unset, maps to nGpuLayers=99. + * Note: SupertonicModel::validateConfig still rejects + * any GPU intent today because the Supertonic + * engine is CPU-only ("CPU only today" — see + * tts-cpp include/tts-cpp/supertonic/engine.h). + * - false: if nGpuLayers unset, forces nGpuLayers=0 (CPU). + * + * Conflicts with nGpuLayers (true + 0, or false + !=0) are rejected + * by validateConfig so callers can't silently get the opposite + * backend they asked for. 
+ */ + std::optional useGpu; + std::string noiseNpyPath; +}; + +} diff --git a/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicModel.cpp b/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicModel.cpp new file mode 100644 index 0000000000..705c2fe75a --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicModel.cpp @@ -0,0 +1,250 @@ +#include "model-interface/supertonic/SupertonicModel.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "addon/TTSErrors.hpp" +#include "model-interface/BackendUtils.hpp" +#include "inference-addon-cpp/Errors.hpp" + +namespace qvac::ttsggml::supertonic { + +namespace { + +using qvac_errors::createTTSError; +using qvac_errors::StatusError; +using qvac_errors::tts_error::TTSErrorCode; +namespace general_error = qvac_errors::general_error; + +tts_cpp::supertonic::EngineOptions toEngineOptions(const SupertonicConfig& cfg) { + tts_cpp::supertonic::EngineOptions opts; + opts.model_gguf_path = cfg.modelGgufPath; + opts.voice = cfg.voice; + if (!cfg.language.empty()) opts.language = cfg.language; + if (cfg.steps.has_value()) opts.steps = *cfg.steps; + if (cfg.speed.has_value()) opts.speed = *cfg.speed; + if (cfg.seed.has_value()) opts.seed = *cfg.seed; + if (cfg.threads.has_value()) opts.n_threads = *cfg.threads; + if (cfg.nGpuLayers.has_value()) { + opts.n_gpu_layers = *cfg.nGpuLayers; + } else if (cfg.useGpu.has_value()) { + opts.n_gpu_layers = *cfg.useGpu ? 
99 : 0; + } + opts.noise_npy_path = cfg.noiseNpyPath; + return opts; +} + +std::vector pcmFloatToInt16(const float* pcm, size_t samples) { + std::vector out; + out.resize(samples); + for (size_t i = 0; i < samples; ++i) { + float s = std::clamp(pcm[i], -1.0f, 1.0f); + out[i] = static_cast(std::lround(s * 32767.0f)); + } + return out; +} + +} + +SupertonicModel::SupertonicModel(SupertonicConfig config) + : cfg_(std::move(config)) { + validateConfig(cfg_); + // See ChatterboxModel ctor: load() is deferred to + // waitForLoadInitialization() so the GGUF parse runs off the JS event + // loop via JsAsyncTask::run-driven addon.activate(). +} + +SupertonicModel::~SupertonicModel() noexcept = default; + +void SupertonicModel::validateConfig(const SupertonicConfig& cfg) { + if (cfg.modelGgufPath.empty()) { + throw StatusError(general_error::InvalidArgument, + "supertonicModelPath is required"); + } + if (!std::filesystem::exists(cfg.modelGgufPath)) { + throw createTTSError(TTSErrorCode::ModelFileNotFound, + "supertonic model not found: " + cfg.modelGgufPath); + } + if (cfg.steps.has_value() && *cfg.steps < 0) { + throw StatusError(general_error::InvalidArgument, + "steps must be >= 0"); + } + if (cfg.speed.has_value() && *cfg.speed < 0.0f) { + throw StatusError(general_error::InvalidArgument, + "speed must be >= 0"); + } + if (!cfg.noiseNpyPath.empty() && + !std::filesystem::exists(cfg.noiseNpyPath)) { + throw createTTSError(TTSErrorCode::ModelFileNotFound, + "noise npy not found: " + cfg.noiseNpyPath); + } + // Defense-in-depth: the JS layer (index.js::_validateConfig) runs the + // same conflict check before this method is reached, so direct C++ + // callers are the only ones who can actually trip this branch. + // Mirror the Chatterbox suffix verbatim so users see an identical + // hint regardless of which engine they instantiated. `layers != 0` + // matches llama.cpp's "-1 = offload all" sentinel convention. 
+ if (cfg.useGpu.has_value() && cfg.nGpuLayers.has_value()) { + const bool wantsGpuFlag = *cfg.useGpu; + const int layers = *cfg.nGpuLayers; + const bool layersWantGpu = layers != 0; + if (wantsGpuFlag != layersWantGpu) { + throw StatusError( + general_error::InvalidArgument, + std::string("SupertonicModel: useGPU=") + + (wantsGpuFlag ? "true" : "false") + + " conflicts with nGpuLayers=" + std::to_string(layers) + + ". Either drop one of the two, or make them agree " + "(useGPU:true + nGpuLayers!=0, or useGPU:false + nGpuLayers=0)."); + } + } + const bool wantsGpu = + cfg.useGpu.value_or(false) || + (cfg.nGpuLayers.has_value() && *cfg.nGpuLayers != 0); + if (wantsGpu) { + throw StatusError( + general_error::InvalidArgument, + "SupertonicModel: GPU execution is not supported by the Supertonic " + "engine yet (see tts-cpp include/tts-cpp/supertonic/engine.h: \"CPU " + "only today\"). GPU output is currently silently wrong " + "(~4x quieter, slightly truncated) on the Vulkan vector-estimator " + "+ vocoder path. 
Pass useGPU: false (and leave nGpuLayers unset or " + "0) when constructing a Supertonic model."); + } +} + +void SupertonicModel::load() { + std::lock_guard lk(engineMu_); + loadLocked(); +} + +void SupertonicModel::unload() { + std::lock_guard lk(engineMu_); + unloadLocked(); +} + +void SupertonicModel::reload() { + std::lock_guard lk(engineMu_); + unloadLocked(); + loadLocked(); +} + +void SupertonicModel::loadLocked() { + if (engine_) return; + try { + engine_ = std::make_shared(toEngineOptions(cfg_)); + } catch (const std::exception& e) { + engine_.reset(); + throw createTTSError( + TTSErrorCode::InitializationFailed, + std::string("SupertonicModel::load: ") + e.what()); + } + + backendName_ = engine_->backend_name(); + backendDevice_ = backendDeviceCode(engine_->backend_device()); + backendId_ = backendIdFromName(backendName_); +} + +void SupertonicModel::unloadLocked() { + engine_.reset(); +} + +void SupertonicModel::cancel() const { + cancelRequested_.store(true, std::memory_order_relaxed); + std::shared_ptr e; + { + std::lock_guard lk(engineMu_); + e = engine_; + } + if (e) e->cancel(); +} + +SupertonicModel::Output SupertonicModel::synthesize(const std::string& text) { + std::shared_ptr engine; + { + std::lock_guard lk(engineMu_); + engine = engine_; + } + if (!engine) { + throw createTTSError(TTSErrorCode::InitializationFailed, + "SupertonicModel::synthesize: engine not loaded"); + } + if (cancelRequested_.load(std::memory_order_relaxed)) { + throw createTTSError(TTSErrorCode::SynthesisFailed, + "synthesis cancelled before it started"); + } + + textLength_ = text.size(); + + const auto t0 = std::chrono::steady_clock::now(); + tts_cpp::supertonic::SynthesisResult result; + try { + result = engine->synthesize(text); + } catch (const std::exception& e) { + throw createTTSError(TTSErrorCode::SynthesisFailed, + std::string("supertonic.synthesize: ") + e.what()); + } + const auto t1 = std::chrono::steady_clock::now(); + + sampleRate_ = result.sample_rate; + 
totalSamples_ = static_cast(result.pcm.size()); + audioDurationMs_ = result.duration_s > 0.0f + ? result.duration_s * 1000.0 + : (sampleRate_ > 0 ? (static_cast(totalSamples_) * 1000.0 / + static_cast(sampleRate_)) + : 0.0); + totalTime_ = std::chrono::duration(t1 - t0).count(); + realTimeFactor_ = audioDurationMs_ > 0.0 + ? (totalTime_ * 1000.0) / audioDurationMs_ + : 0.0; + tokensPerSecond_ = totalTime_ > 0.0 + ? static_cast(textLength_) / totalTime_ + : 0.0; + + return pcmFloatToInt16(result.pcm.data(), result.pcm.size()); +} + +std::any SupertonicModel::process(const std::any& input) { + const auto* anyInput = std::any_cast(&input); + if (!anyInput) { + throw StatusError(general_error::InvalidArgument, + "SupertonicModel::process: input must be AnyInput"); + } + + bool expected = false; + if (!jobInProgress_.compare_exchange_strong(expected, true, + std::memory_order_acq_rel)) { + throw StatusError(general_error::InternalError, + "SupertonicModel::process: job already in progress"); + } + struct InProgressGuard { + std::atomic_bool& flag; + ~InProgressGuard() { flag.store(false, std::memory_order_release); } + } guard{jobInProgress_}; + + cancelRequested_.store(false, std::memory_order_relaxed); + return std::any(synthesize(anyInput->text)); +} + +qvac_lib_inference_addon_cpp::RuntimeStats SupertonicModel::runtimeStats() const { + qvac_lib_inference_addon_cpp::RuntimeStats stats; + stats.emplace_back("totalTime", totalTime_); + stats.emplace_back("tokensPerSecond", tokensPerSecond_); + stats.emplace_back("realTimeFactor", realTimeFactor_); + stats.emplace_back("audioDurationMs", audioDurationMs_); + stats.emplace_back("totalSamples", totalSamples_); + stats.emplace_back("backendDevice", static_cast(backendDevice_)); + stats.emplace_back("backendId", static_cast(backendId_)); + return stats; +} + +} diff --git a/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicModel.hpp b/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicModel.hpp 
new file mode 100644 index 0000000000..06f4c87e62 --- /dev/null +++ b/packages/tts-ggml/addon/src/model-interface/supertonic/SupertonicModel.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "inference-addon-cpp/ModelInterfaces.hpp" +#include "inference-addon-cpp/RuntimeStats.hpp" + +#include "model-interface/supertonic/SupertonicConfig.hpp" + +namespace tts_cpp::supertonic { +class Engine; +} + +namespace qvac::ttsggml::supertonic { + +class SupertonicModel + : public qvac_lib_inference_addon_cpp::model::IModel, + public qvac_lib_inference_addon_cpp::model::IModelCancel, + public qvac_lib_inference_addon_cpp::model::IModelAsyncLoad { +public: + using Input = std::string; + using Output = std::vector; + + struct AnyInput { + std::string text; + }; + + explicit SupertonicModel(SupertonicConfig config); + ~SupertonicModel() noexcept override; + + std::string getName() const override { return "SupertonicModel"; } + std::any process(const std::any& input) override; + qvac_lib_inference_addon_cpp::RuntimeStats runtimeStats() const override; + + void cancel() const override; + + void load(); + void unload(); + void reload(); + bool isLoaded() const { + std::lock_guard lk(engineMu_); + return static_cast(engine_); + } + + // IModelAsyncLoad — see the equivalent comment on ChatterboxModel. + // AddonCpp::activate() (wrapped in JsAsyncTask::run by + // addon_js::activate) calls this on a worker thread; load() is + // idempotent. 
+ void waitForLoadInitialization() override { load(); } + void setWeightsForFile( + const std::string&, + std::unique_ptr>&&) override {} + + void setConfig(SupertonicConfig config) { cfg_ = std::move(config); } + const SupertonicConfig& config() const { return cfg_; } + + int sampleRate() const { return sampleRate_; } + +private: + Output synthesize(const std::string& text); + static void validateConfig(const SupertonicConfig& cfg); + + void loadLocked(); + void unloadLocked(); + + SupertonicConfig cfg_; + + mutable std::mutex engineMu_; + std::shared_ptr engine_; + + std::atomic_bool jobInProgress_{false}; + + // Mirrors ChatterboxModel::cancelRequested_: a JS-side cancel issued + // between two run() calls (or before the first one) sets this flag; + // process() consumes it on entry so a stale cancel doesn't poison the + // next synthesis. cancel() also forwards to the underlying engine, + // but the per-process reset here is defence-in-depth against + // tts_cpp::supertonic::Engine ever growing a sticky cancel flag. + mutable std::atomic_bool cancelRequested_{false}; + + double totalTime_ = 0.0; + double audioDurationMs_ = 0.0; + int64_t totalSamples_ = 0; + double realTimeFactor_ = 0.0; + double tokensPerSecond_ = 0.0; + size_t textLength_ = 0; + int sampleRate_ = 44100; + + int backendDevice_ = 0; + int backendId_ = 0; + std::string backendName_ = "CPU"; +}; + +} diff --git a/packages/tts-ggml/addon/tests/AddonCppTest.cpp b/packages/tts-ggml/addon/tests/AddonCppTest.cpp new file mode 100644 index 0000000000..61fadcd4a0 --- /dev/null +++ b/packages/tts-ggml/addon/tests/AddonCppTest.cpp @@ -0,0 +1,174 @@ +// Addon-level integration tests for the tts-ggml shell. +// +// Constructing a real ChatterboxModel / SupertonicModel from C++ requires +// a real GGUF on disk (see test_chatterbox_config.cpp / +// test_supertonic_config.cpp). 
These tests instead drive `AddonCpp` +// with a fake `IModel` so we exercise: +// - the runJob -> outputCallback -> CppQueuedOutputHandler chain +// - busy rejection when a job is already in flight +// - cooperative cancel + restart +// +// Mirrors the pattern in qvac-lib-infer-parakeet/addon/tests/AddonCppTest.cpp. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "addon/AddonCpp.hpp" + +namespace { + +using qvac::ttsggml::AddonInstance; +using qvac::ttsggml::createInstance; + +// IModel that: +// - returns a fixed PCM buffer for "ok" inputs +// - blocks on a condvar for "blocking" inputs (so we can race a second +// runJob and assert it gets rejected) +// - throws "Job cancelled" when cancel() is invoked while blocked +class StubAudioModel + : public qvac_lib_inference_addon_cpp::model::IModel, + public qvac_lib_inference_addon_cpp::model::IModelCancel { +public: + std::string getName() const override { return "StubAudioModel"; } + + qvac_lib_inference_addon_cpp::RuntimeStats runtimeStats() const override { + qvac_lib_inference_addon_cpp::RuntimeStats stats; + stats.emplace_back("totalSamples", static_cast(lastSampleCount_)); + stats.emplace_back("audioDurationMs", + static_cast(lastSampleCount_) * 1000.0 / 24000.0); + stats.emplace_back("totalTime", 0.001); + return stats; + } + + std::any process(const std::any& input) override { + const auto& text = std::any_cast(input); + + if (text == "blocking") { + std::unique_lock lk(mu_); + blocked_ = true; + cv_.notify_all(); + cv_.wait(lk, [this] { return !blocked_ || cancelled_; }); + if (cancelled_) { + cancelled_ = false; + throw std::runtime_error("Job cancelled"); + } + } + + std::vector pcm(static_cast(text.size()) * 240, 0); + lastSampleCount_ = pcm.size(); + return std::any(std::move(pcm)); + } + + void cancel() const override { + std::lock_guard lk(mu_); + cancelled_ = true; + blocked_ = false; + 
cv_.notify_all(); + } + + void waitUntilBlocked() { + std::unique_lock lk(mu_); + ASSERT_TRUE(cv_.wait_for(lk, std::chrono::seconds(2), + [this] { return blocked_; })); + } + + void unblock() { + std::lock_guard lk(mu_); + blocked_ = false; + cv_.notify_all(); + } + +private: + mutable std::mutex mu_; + mutable std::condition_variable cv_; + mutable bool blocked_{false}; + mutable bool cancelled_{false}; + std::size_t lastSampleCount_{0}; +}; + +std::pair createStubAddon() { + auto model = std::make_unique(); + auto* modelPtr = model.get(); + auto instance = createInstance(std::move(model)); + return {std::move(instance), modelPtr}; +} + +} + +TEST(TtsGgmlAddonCpp, RunJobEmitsAudioAndStats) { + auto [instance, model] = createStubAddon(); + ASSERT_TRUE(instance.addon->runJob(std::any(std::string("ok-input")))); + + auto pcm = instance.audioOutput->tryPop(std::chrono::seconds(5)); + ASSERT_TRUE(pcm.has_value()); + EXPECT_FALSE(pcm->empty()); + + auto stats = instance.statsOutput->tryPop(std::chrono::seconds(5)); + ASSERT_TRUE(stats.has_value()); + bool sawTotalSamples = false; + bool sawAudioDurationMs = false; + for (const auto& [k, _v] : *stats) { + if (k == "totalSamples") sawTotalSamples = true; + if (k == "audioDurationMs") sawAudioDurationMs = true; + } + EXPECT_TRUE(sawTotalSamples); + EXPECT_TRUE(sawAudioDurationMs); +} + +TEST(TtsGgmlAddonCpp, RejectsSecondRunWhileBusy) { + auto [instance, model] = createStubAddon(); + + ASSERT_TRUE(instance.addon->runJob(std::any(std::string("blocking")))); + model->waitUntilBlocked(); + + EXPECT_FALSE(instance.addon->runJob(std::any(std::string("second")))); + + model->unblock(); + // Drain any remaining outputs so the JobRunner unwinds cleanly. 
+ instance.audioOutput->tryPop(std::chrono::seconds(2)); + instance.statsOutput->tryPop(std::chrono::seconds(2)); +} + +TEST(TtsGgmlAddonCpp, CancelInFlightAllowsNextRun) { + auto [instance, model] = createStubAddon(); + + ASSERT_TRUE(instance.addon->runJob(std::any(std::string("blocking")))); + model->waitUntilBlocked(); + + instance.addon->cancelJob(); + auto err = instance.errorOutput->tryPop(std::chrono::seconds(5)); + if (!err.has_value()) { + instance.audioOutput->tryPop(std::chrono::seconds(1)); + instance.statsOutput->tryPop(std::chrono::seconds(1)); + } + + bool accepted = false; + const auto deadline = + std::chrono::steady_clock::now() + std::chrono::seconds(5); + while (std::chrono::steady_clock::now() < deadline) { + if (instance.addon->runJob(std::any(std::string("ok-after-cancel")))) { + accepted = true; + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + ASSERT_TRUE(accepted); + + auto pcm = instance.audioOutput->tryPop(std::chrono::seconds(5)); + ASSERT_TRUE(pcm.has_value()); + EXPECT_FALSE(pcm->empty()); +} diff --git a/packages/tts-ggml/addon/tests/test_backend_utils.cpp b/packages/tts-ggml/addon/tests/test_backend_utils.cpp new file mode 100644 index 0000000000..b896b576ff --- /dev/null +++ b/packages/tts-ggml/addon/tests/test_backend_utils.cpp @@ -0,0 +1,88 @@ +#include + +#include "addon/TTSErrors.hpp" +#include "model-interface/BackendUtils.hpp" + +using qvac::ttsggml::backendDeviceCode; +using qvac::ttsggml::backendIdFromName; +using qvac_errors::createTTSError; +using qvac_errors::tts_error::TTSAddonId; +using qvac_errors::tts_error::TTSErrorCode; +using qvac_errors::tts_error::toString; + +TEST(BackendUtils, BackendIdCpu) { + EXPECT_EQ(backendIdFromName("CPU"), 0); +} + +TEST(BackendUtils, BackendIdMetalPrefix) { + EXPECT_EQ(backendIdFromName("Metal"), 1); + EXPECT_EQ(backendIdFromName("Metal-A17"), 1); + EXPECT_EQ(backendIdFromName("MTL"), 1); + EXPECT_EQ(backendIdFromName("MTL_M3_Ultra"), 1); +} + 
+TEST(BackendUtils, BackendIdCudaPrefix) { + EXPECT_EQ(backendIdFromName("CUDA"), 2); + EXPECT_EQ(backendIdFromName("CUDA0"), 2); + EXPECT_EQ(backendIdFromName("CUDA_RTX4090"), 2); +} + +TEST(BackendUtils, BackendIdVulkanPrefix) { + EXPECT_EQ(backendIdFromName("Vulkan"), 3); + EXPECT_EQ(backendIdFromName("Vulkan0"), 3); + EXPECT_EQ(backendIdFromName("Vulkan_AMD_RX_7600_XT"), 3); +} + +TEST(BackendUtils, BackendIdOpenClPrefix) { + EXPECT_EQ(backendIdFromName("OpenCL"), 4); + EXPECT_EQ(backendIdFromName("OpenCL_Adreno_750"), 4); +} + +TEST(BackendUtils, BackendIdUnknownReturnsSentinel) { + EXPECT_EQ(backendIdFromName(""), 99); + EXPECT_EQ(backendIdFromName("ZLUDA"), 99); + EXPECT_EQ(backendIdFromName("cpu"), 99) << "case-sensitive: lowercase 'cpu' is not the CPU backend"; + EXPECT_EQ(backendIdFromName("Metalorama"), 1) << "rfind(prefix, 0) only checks at start"; +} + +TEST(BackendUtils, BackendDeviceCodeMatchesGgmlEnum) { + EXPECT_EQ(backendDeviceCode(tts_cpp::BackendDevice::CPU), 0); + EXPECT_EQ(backendDeviceCode(tts_cpp::BackendDevice::GPU), 1); +} + +TEST(TTSErrors, ToStringCoversAllKnownCodes) { + EXPECT_EQ(toString(TTSErrorCode::OK), "OK"); + EXPECT_EQ(toString(TTSErrorCode::ModelNotLoaded), "ModelNotLoaded"); + EXPECT_EQ(toString(TTSErrorCode::ModelFileNotFound), "ModelFileNotFound"); + EXPECT_EQ(toString(TTSErrorCode::ConfigFileNotFound), "ConfigFileNotFound"); + EXPECT_EQ(toString(TTSErrorCode::InvalidAPI), "InvalidAPI"); + EXPECT_EQ(toString(TTSErrorCode::InitializationFailed), "InitializationFailed"); + EXPECT_EQ(toString(TTSErrorCode::SynthesisFailed), "SynthesisFailed"); +} + +TEST(TTSErrors, ToStringFallsBackForUnknownCodes) { + EXPECT_EQ(toString(7), "UnknownTTSError"); + EXPECT_EQ(toString(99), "UnknownTTSError"); + EXPECT_EQ(toString(0xDEADBEEF), "UnknownTTSError"); +} + +TEST(TTSErrors, CreateTTSErrorTagsErrorWithTTSAddonId) { + const auto err = createTTSError(TTSErrorCode::ModelFileNotFound, + "missing model.gguf"); + const std::string code = 
err.codeString(); + EXPECT_NE(code.find(std::string(TTSAddonId)), std::string::npos) + << "codeString should embed addonId; got: " << code; + EXPECT_NE(code.find("ModelFileNotFound"), std::string::npos) + << "codeString should embed local code; got: " << code; + EXPECT_NE(std::string(err.what()).find("missing model.gguf"), + std::string::npos); + EXPECT_FALSE(err.isJSError()); +} + +TEST(TTSErrors, CreateTTSErrorWithUnknownCodeFallsBack) { + const auto err = createTTSError(static_cast(123), "oops"); + const std::string code = err.codeString(); + EXPECT_NE(code.find("UnknownTTSError"), std::string::npos) + << "codeString should embed UnknownTTSError fallback; got: " << code; + EXPECT_NE(std::string(err.what()).find("oops"), std::string::npos); +} diff --git a/packages/tts-ggml/addon/tests/test_chatterbox_config.cpp b/packages/tts-ggml/addon/tests/test_chatterbox_config.cpp new file mode 100644 index 0000000000..da3ce9f68c --- /dev/null +++ b/packages/tts-ggml/addon/tests/test_chatterbox_config.cpp @@ -0,0 +1,205 @@ +// Constructor-validation tests for ChatterboxModel. +// +// `ChatterboxModel::validateConfig()` is private but the constructor calls +// it before `load()`, so any config that fails validation throws before the +// expensive (real-GGUF) load step. We exercise validateConfig indirectly +// by attempting construction with bad configs and asserting the throw +// path / error code. +// +// Real-GGUF tests (full construct + process round-trip) are gated behind +// QVAC_TEST_CHATTERBOX_T3_GGUF + QVAC_TEST_CHATTERBOX_S3GEN_GGUF env +// vars. When unset, the gated tests skip cleanly via GTEST_SKIP() so +// the suite stays green in environments without converted models. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include "model-interface/chatterbox/ChatterboxConfig.hpp" +#include "model-interface/chatterbox/ChatterboxModel.hpp" +#include "inference-addon-cpp/Errors.hpp" + +using qvac::ttsggml::chatterbox::ChatterboxConfig; +using qvac::ttsggml::chatterbox::ChatterboxModel; +using qvac_errors::StatusError; + +namespace { + +std::filesystem::path testTempDir() { + return std::filesystem::temp_directory_path() / "qvac-tts-ggml-chatterbox-tests"; +} + +std::filesystem::path tempPath(const std::string& suffix) { + auto dir = testTempDir(); + std::filesystem::create_directories(dir); + return dir / suffix; +} + +void writeStubFile(const std::filesystem::path& p, + const std::string& contents = "stub") { + std::ofstream(p, std::ios::binary) << contents; +} + +std::string envOrEmpty(const char* name) { + if (const char* v = std::getenv(name)) return v; + return ""; +} + +ChatterboxConfig minimallyValidStubConfig() { + ChatterboxConfig cfg; + cfg.t3ModelPath = tempPath("t3-stub.gguf").string(); + cfg.s3genModelPath = tempPath("s3gen-stub.gguf").string(); + writeStubFile(cfg.t3ModelPath); + writeStubFile(cfg.s3genModelPath); + return cfg; +} + +} + +TEST(ChatterboxValidate, EmptyT3PathRejected) { + ChatterboxConfig cfg; + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, EmptyS3genPathRejected) { + ChatterboxConfig cfg; + cfg.t3ModelPath = tempPath("t3.gguf").string(); + writeStubFile(cfg.t3ModelPath); + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, NonexistentT3PathRejected) { + ChatterboxConfig cfg; + cfg.t3ModelPath = "/definitely/does/not/exist/t3.gguf"; + cfg.s3genModelPath = "/definitely/does/not/exist/s3gen.gguf"; + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, NonexistentS3genPathRejected) { + ChatterboxConfig cfg; + cfg.t3ModelPath = tempPath("t3-only.gguf").string(); + writeStubFile(cfg.t3ModelPath); 
+ cfg.s3genModelPath = "/definitely/does/not/exist/s3gen.gguf"; + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, NonexistentReferenceAudioRejected) { + auto cfg = minimallyValidStubConfig(); + cfg.referenceAudio = "/definitely/does/not/exist/ref.wav"; + // Validation rejects before load, so we don't need a real GGUF to hit + // this branch. + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, NonexistentVoiceDirRejected) { + auto cfg = minimallyValidStubConfig(); + cfg.voiceDir = "/definitely/does/not/exist/voice/"; + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, VoiceDirPointingAtFileRejected) { + auto cfg = minimallyValidStubConfig(); + // Point at the t3 stub file (definitely a file, definitely not a dir). + cfg.voiceDir = cfg.t3ModelPath; + EXPECT_THROW(ChatterboxModel{cfg}, StatusError); +} + +TEST(ChatterboxValidate, ValidStubPathsConstructAndDeferLoad) { + auto cfg = minimallyValidStubConfig(); + // Stub files pass `std::filesystem::exists()` so validation succeeds. + // Construction now defers GGUF parsing to waitForLoadInitialization() + // (called by AddonCpp::activate() on a JsAsyncTask worker thread), so + // the stub-file InitializationFailed throw happens on load(), not in + // the constructor. This proves validation passes AND that load is + // truly deferred (otherwise this would still throw at construction). + std::unique_ptr m; + EXPECT_NO_THROW(m = std::make_unique(cfg)); + ASSERT_NE(m, nullptr); + EXPECT_FALSE(m->isLoaded()); + EXPECT_THROW(m->load(), StatusError); + EXPECT_FALSE(m->isLoaded()); +} + +TEST(ChatterboxValidate, WaitForLoadInitializationDelegatesToLoad) { + auto cfg = minimallyValidStubConfig(); + ChatterboxModel m(cfg); + EXPECT_FALSE(m.isLoaded()); + // waitForLoadInitialization() is the IModelAsyncLoad entry point + // AddonCpp::activate() ultimately calls; it should propagate the same + // load-failure as load() itself. 
+ EXPECT_THROW(m.waitForLoadInitialization(), StatusError); +} + +TEST(ChatterboxValidate, ConfigDefaultLanguageIsEnglish) { + ChatterboxConfig cfg; + EXPECT_EQ(cfg.language, "en"); +} + +TEST(ChatterboxValidate, ConfigUseGpuDefaultIsFalse) { + ChatterboxConfig cfg; + EXPECT_FALSE(cfg.useGpu.has_value()); + EXPECT_FALSE(cfg.seed.has_value()); + EXPECT_FALSE(cfg.threads.has_value()); + EXPECT_FALSE(cfg.nGpuLayers.has_value()); + EXPECT_FALSE(cfg.streamChunkTokens.has_value()); +} + +// ───────────────────────────────────────────────────────────────────── +// Real-GGUF round-trip (env-var gated). +// ───────────────────────────────────────────────────────────────────── + +TEST(ChatterboxRealGguf, ConstructAndUnloadIfAvailable) { + const auto t3 = envOrEmpty("QVAC_TEST_CHATTERBOX_T3_GGUF"); + const auto s3 = envOrEmpty("QVAC_TEST_CHATTERBOX_S3GEN_GGUF"); + if (t3.empty() || s3.empty()) { + GTEST_SKIP() << "Set QVAC_TEST_CHATTERBOX_T3_GGUF + " + "QVAC_TEST_CHATTERBOX_S3GEN_GGUF to enable."; + } + if (!std::filesystem::exists(t3) || !std::filesystem::exists(s3)) { + GTEST_SKIP() << "Configured GGUFs do not exist on disk."; + } + + ChatterboxConfig cfg; + cfg.t3ModelPath = t3; + cfg.s3genModelPath = s3; + cfg.useGpu = false; + + ChatterboxModel m(cfg); + EXPECT_FALSE(m.isLoaded()) << "load is now deferred until activate()/load()"; + EXPECT_EQ(m.getName(), "ChatterboxModel"); + EXPECT_NO_THROW(m.load()); + EXPECT_TRUE(m.isLoaded()); + EXPECT_NO_THROW(m.unload()); + EXPECT_FALSE(m.isLoaded()); +} + +TEST(ChatterboxRealGguf, ProcessRejectsWrongAnyInputType) { + const auto t3 = envOrEmpty("QVAC_TEST_CHATTERBOX_T3_GGUF"); + const auto s3 = envOrEmpty("QVAC_TEST_CHATTERBOX_S3GEN_GGUF"); + if (t3.empty() || s3.empty()) { + GTEST_SKIP() << "Set QVAC_TEST_CHATTERBOX_T3_GGUF + " + "QVAC_TEST_CHATTERBOX_S3GEN_GGUF to enable."; + } + if (!std::filesystem::exists(t3) || !std::filesystem::exists(s3)) { + GTEST_SKIP() << "Configured GGUFs do not exist on disk."; + } + + 
ChatterboxConfig cfg; + cfg.t3ModelPath = t3; + cfg.s3genModelPath = s3; + cfg.useGpu = false; + + ChatterboxModel m(cfg); + m.load(); // load is deferred since the constructor refactor; trigger it here + EXPECT_THROW(m.process(std::any{std::string{"raw string instead of AnyInput"}}), + StatusError); + EXPECT_THROW(m.process(std::any{int64_t{42}}), StatusError); + + ChatterboxModel::AnyInput emptyText{}; + EXPECT_THROW(m.process(std::any{emptyText}), StatusError); +} diff --git a/packages/tts-ggml/addon/tests/test_supertonic_config.cpp b/packages/tts-ggml/addon/tests/test_supertonic_config.cpp new file mode 100644 index 0000000000..0c19464848 --- /dev/null +++ b/packages/tts-ggml/addon/tests/test_supertonic_config.cpp @@ -0,0 +1,188 @@ +// Constructor-validation tests for SupertonicModel. Same shape as +// test_chatterbox_config.cpp: validateConfig is private so we drive it +// indirectly via the public constructor and assert the throw path. +// +// Real-GGUF round-trip is gated behind QVAC_TEST_SUPERTONIC_GGUF. 
+ +#include + +#include +#include +#include +#include +#include + +#include "model-interface/supertonic/SupertonicConfig.hpp" +#include "model-interface/supertonic/SupertonicModel.hpp" +#include "inference-addon-cpp/Errors.hpp" + +using qvac::ttsggml::supertonic::SupertonicConfig; +using qvac::ttsggml::supertonic::SupertonicModel; +using qvac_errors::StatusError; + +namespace { + +std::filesystem::path testTempDir() { + return std::filesystem::temp_directory_path() / "qvac-tts-ggml-supertonic-tests"; +} + +std::filesystem::path tempPath(const std::string& suffix) { + auto dir = testTempDir(); + std::filesystem::create_directories(dir); + return dir / suffix; +} + +void writeStubFile(const std::filesystem::path& p, + const std::string& contents = "stub") { + std::ofstream(p, std::ios::binary) << contents; +} + +std::string envOrEmpty(const char* name) { + if (const char* v = std::getenv(name)) return v; + return ""; +} + +SupertonicConfig minimallyValidStubConfig() { + SupertonicConfig cfg; + cfg.modelGgufPath = tempPath("supertonic-stub.gguf").string(); + writeStubFile(cfg.modelGgufPath); + return cfg; +} + +} + +TEST(SupertonicValidate, EmptyModelPathRejected) { + SupertonicConfig cfg; + EXPECT_THROW(SupertonicModel{cfg}, StatusError); +} + +TEST(SupertonicValidate, NonexistentModelPathRejected) { + SupertonicConfig cfg; + cfg.modelGgufPath = "/definitely/does/not/exist/supertonic.gguf"; + EXPECT_THROW(SupertonicModel{cfg}, StatusError); +} + +TEST(SupertonicValidate, NegativeStepsRejected) { + auto cfg = minimallyValidStubConfig(); + cfg.steps = -1; + EXPECT_THROW(SupertonicModel{cfg}, StatusError); +} + +TEST(SupertonicValidate, NegativeSpeedRejected) { + auto cfg = minimallyValidStubConfig(); + cfg.speed = -0.5f; + EXPECT_THROW(SupertonicModel{cfg}, StatusError); +} + +TEST(SupertonicValidate, NonexistentNoiseNpyRejected) { + auto cfg = minimallyValidStubConfig(); + cfg.noiseNpyPath = "/definitely/does/not/exist/noise.npy"; + EXPECT_THROW(SupertonicModel{cfg}, 
StatusError); +} + +TEST(SupertonicValidate, UseGpuTrueRejectedWithExplanation) { + auto cfg = minimallyValidStubConfig(); + cfg.useGpu = true; + bool threw = false; + try { + SupertonicModel m(cfg); + } catch (const StatusError& e) { + threw = true; + const std::string what = e.what(); + EXPECT_NE(what.find("GPU"), std::string::npos) + << "error should mention GPU; got: " << what; + EXPECT_NE(what.find("Supertonic"), std::string::npos) + << "error should mention Supertonic engine; got: " << what; + } + EXPECT_TRUE(threw); +} + +TEST(SupertonicValidate, NGpuLayersGreaterThanZeroRejected) { + auto cfg = minimallyValidStubConfig(); + cfg.nGpuLayers = 99; + EXPECT_THROW(SupertonicModel{cfg}, StatusError); +} + +TEST(SupertonicValidate, NGpuLayersZeroAcceptedAndDeferredLoad) { + auto cfg = minimallyValidStubConfig(); + cfg.nGpuLayers = 0; + // Validation passes (CPU-only path); the stub file then fails GGUF + // parsing on load() (not at construction — load is now deferred to + // waitForLoadInitialization). The eventual throw must NOT be the + // GPU-rejection branch. 
+ std::unique_ptr m; + EXPECT_NO_THROW(m = std::make_unique(cfg)); + ASSERT_NE(m, nullptr); + EXPECT_FALSE(m->isLoaded()); + bool threw = false; + try { + m->load(); + } catch (const StatusError& e) { + threw = true; + const std::string what = e.what(); + EXPECT_EQ(what.find("GPU"), std::string::npos) + << "nGpuLayers=0 should not trigger the GPU-rejection path; got: " << what; + } + EXPECT_TRUE(threw); + EXPECT_FALSE(m->isLoaded()); +} + +TEST(SupertonicValidate, WaitForLoadInitializationDelegatesToLoad) { + auto cfg = minimallyValidStubConfig(); + SupertonicModel m(cfg); + EXPECT_FALSE(m.isLoaded()); + EXPECT_THROW(m.waitForLoadInitialization(), StatusError); +} + +TEST(SupertonicValidate, ConfigDefaultsAreCpuFriendly) { + SupertonicConfig cfg; + EXPECT_EQ(cfg.language, "en"); + EXPECT_FALSE(cfg.useGpu.has_value()); + EXPECT_FALSE(cfg.nGpuLayers.has_value()); + EXPECT_FALSE(cfg.steps.has_value()); + EXPECT_FALSE(cfg.speed.has_value()); +} + +// ───────────────────────────────────────────────────────────────────── +// Real-GGUF round-trip (env-var gated). 
+// ───────────────────────────────────────────────────────────────────── + +TEST(SupertonicRealGguf, ConstructAndUnloadIfAvailable) { + const auto path = envOrEmpty("QVAC_TEST_SUPERTONIC_GGUF"); + if (path.empty() || !std::filesystem::exists(path)) { + GTEST_SKIP() << "Set QVAC_TEST_SUPERTONIC_GGUF to enable."; + } + + SupertonicConfig cfg; + cfg.modelGgufPath = path; + cfg.useGpu = false; + cfg.voice = "F1"; + + SupertonicModel m(cfg); + EXPECT_FALSE(m.isLoaded()) << "load is now deferred until activate()/load()"; + EXPECT_EQ(m.getName(), "SupertonicModel"); + EXPECT_NO_THROW(m.load()); + EXPECT_TRUE(m.isLoaded()); + EXPECT_GT(m.sampleRate(), 0); + EXPECT_NO_THROW(m.unload()); + EXPECT_FALSE(m.isLoaded()); +} + +TEST(SupertonicRealGguf, ProcessRejectsWrongAnyInputType) { + const auto path = envOrEmpty("QVAC_TEST_SUPERTONIC_GGUF"); + if (path.empty() || !std::filesystem::exists(path)) { + GTEST_SKIP() << "Set QVAC_TEST_SUPERTONIC_GGUF to enable."; + } + + SupertonicConfig cfg; + cfg.modelGgufPath = path; + cfg.useGpu = false; + + SupertonicModel m(cfg); + m.load(); // load is deferred since the constructor refactor; trigger it here + // Wrong AnyInput type is the only well-defined invariant SupertonicModel + // checks at the boundary; empty-text behaviour is delegated to the + // underlying tts_cpp::supertonic::Engine and intentionally left + // untested here to avoid coupling to engine-internal policy. 
+ EXPECT_THROW(m.process(std::any{int64_t{42}}), StatusError); +} diff --git a/packages/tts-ggml/addonLogging.d.ts b/packages/tts-ggml/addonLogging.d.ts new file mode 100644 index 0000000000..bd687d60bc --- /dev/null +++ b/packages/tts-ggml/addonLogging.d.ts @@ -0,0 +1,7 @@ +export interface AddonLogging { + setLogger(callback: (priority: number, message: string) => void): void + releaseLogger(): void +} + +declare const addonLogging: AddonLogging +export default addonLogging diff --git a/packages/tts-ggml/addonLogging.js b/packages/tts-ggml/addonLogging.js new file mode 100644 index 0000000000..f9e036afa6 --- /dev/null +++ b/packages/tts-ggml/addonLogging.js @@ -0,0 +1,8 @@ +'use strict' + +const binding = require('./binding') + +module.exports = { + setLogger: binding.setLogger, + releaseLogger: binding.releaseLogger +} diff --git a/packages/tts-ggml/binding.js b/packages/tts-ggml/binding.js new file mode 100644 index 0000000000..cea46308c0 --- /dev/null +++ b/packages/tts-ggml/binding.js @@ -0,0 +1 @@ +module.exports = require.addon() diff --git a/packages/tts-ggml/examples/chatterbox-chunk-stream-tts.js b/packages/tts-ggml/examples/chatterbox-chunk-stream-tts.js new file mode 100644 index 0000000000..8fce75219a --- /dev/null +++ b/packages/tts-ggml/examples/chatterbox-chunk-stream-tts.js @@ -0,0 +1,169 @@ +'use strict' + +/** + * Chatterbox (ggml) — sub-sentence chunk streaming. + * + * Input is a single string; the *C++ Engine* splits its own synthesis + * into fixed `streamChunkTokens`-size chunks (~25 T3 tokens ~= 1 s of + * audio) and emits each chunk's PCM to JS via `onUpdate` the moment + * it's produced. The Engine runs the chunked S3Gen+HiFT loop with + * phase-continuous `hift_cache_source` across chunks, so the seams are + * inaudible — listeners get sub-second audio latency inside a single + * utterance (first-audio-out typically ~280 ms of synthesis wall time + * after T3 finishes). 
+ * + * Contrast with `chatterbox-sentence-stream-tts.js`, which streams + * *sentences* in and emits *one audio chunk per sentence* out — that + * one mirrors the API of @qvac/tts-onnx and works on any backend; this + * one requires the Engine's streaming hook added in qvac-tts.cpp. + * + * Usage: + * bare examples/chatterbox-chunk-stream-tts.js [path/to/reference.wav] + */ + +const fs = require('bare-fs') +const path = require('bare-path') +const TTSGgml = require('../') +const { createWav } = require('./wav-helper') +const { setLogger, releaseLogger } = require('../addonLogging') +const { canPlayPcmChunks, createStreamingPlayer } = require('./pcm-chunk-player') + +const CHATTERBOX_SAMPLE_RATE = 24000 + +const argv = global.Bare ? global.Bare.argv : process.argv +const refAudioArg = argv[2] + +const pkgRoot = path.join(__dirname, '..') +const modelDir = path.join(pkgRoot, 'models') +const t3Model = path.join(modelDir, 'chatterbox-t3-turbo.gguf') +const s3genModel = path.join(modelDir, 'chatterbox-s3gen.gguf') + +for (const f of [t3Model, s3genModel]) { + if (!fs.existsSync(f)) { + console.error(`Missing model file: ${f}`) + console.error('Run "npm run setup-models" to set up the venv + convert the Resemble Chatterbox checkpoint to GGUF.') + if (global.Bare) global.Bare.exit(1) + else process.exit(1) + } +} + +if (refAudioArg && !fs.existsSync(refAudioArg)) { + console.error(`Reference audio not found: ${refAudioArg}`) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +} +if (!refAudioArg) { + console.log('No reference audio provided, using the voice baked into the S3Gen GGUF.') +} + +async function main () { + setLogger((priority, message) => { + if (priority > 1) return + const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' } + const name = names[priority] || 'UNKNOWN' + console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`) + }) + + const text = + 'Hello from native chatterbox streaming. 
This sentence should split into multiple chunks ' + + 'on the C++ side so audio starts flowing well before the full synthesis completes.' + + // streamChunkTokens activates the C++ chunked S3Gen+HiFT loop. + // streamFirstChunkTokens keeps first-audio-out low (small first chunk). + // cfmSteps 1 halves CFM cost with minor quality cost; + // 2 matches Python's meanflow default. + const model = new TTSGgml({ + files: { modelDir }, + ...(refAudioArg ? { referenceAudio: refAudioArg } : {}), + streamChunkTokens: 25, + streamFirstChunkTokens: 10, + cfmSteps: 1, + config: { language: 'en' }, + logger: console, + opts: { stats: true } + }) + + const outputFile = path.join(__dirname, 'chatterbox-chunk-stream-output.wav') + + try { + console.log('Loading Chatterbox TTS model (native streaming)...') + await model.load() + console.log('Model loaded.') + + const player = canPlayPcmChunks() + ? createStreamingPlayer({ sampleRate: CHATTERBOX_SAMPLE_RATE }) + : null + if (player) { + console.log(`Streaming playback via ${player.backend}: chunks flow to stdin as they arrive.`) + } else { + console.warn( + 'No supported player found (install ffmpeg / sox / alsa-utils). Chunks will be logged only.' + ) + } + + console.log(`\nSynthesizing: "${text.slice(0, 80)}${text.length > 80 ? 
'…' : ''}"\n`) + + const t0 = Date.now() + let firstChunkMs = -1 + let chunkCount = 0 + let pcmConcat = [] + + const response = await model.run({ input: text, type: 'text' }) + + await response + .onUpdate(data => { + if (data && data.outputArray) { + if (firstChunkMs < 0) firstChunkMs = Date.now() - t0 + chunkCount += 1 + const samples = Array.from(data.outputArray) + pcmConcat = pcmConcat.concat(samples) + const chunkMs = (samples.length / CHATTERBOX_SAMPLE_RATE) * 1000 + console.log( + `[native chunk ${chunkCount}] ${samples.length} samples (${chunkMs.toFixed(0)} ms of audio) at t+${Date.now() - t0} ms` + ) + if (player) player.write(samples) + } + }) + .await() + + const totalMs = Date.now() - t0 + const audioMs = (pcmConcat.length / CHATTERBOX_SAMPLE_RATE) * 1000 + console.log( + `\nSynthesis done: ${chunkCount} chunks, ${pcmConcat.length} samples (${audioMs.toFixed(0)} ms of audio), ` + + `first-audio-out ${firstChunkMs} ms, total ${totalMs} ms, RTF ${(totalMs / audioMs).toFixed(3)}` + ) + + if (player) { + console.log('Waiting for playback to finish...') + await player.end() + console.log('Playback finished!') + } + + if (response.stats) { + const s = response.stats + console.log( + `Stats: totalTime=${s.totalTime?.toFixed(2)}s rtf=${s.realTimeFactor?.toFixed(2)} ` + + `audio=${s.audioDurationMs}ms samples=${s.totalSamples}` + ) + } + + if (pcmConcat.length > 0) { + console.log(`\nWriting concatenated PCM to ${outputFile}`) + createWav(pcmConcat, CHATTERBOX_SAMPLE_RATE, outputFile) + } + } catch (err) { + console.error('Error during TTS processing:', err) + throw err + } finally { + console.log('Unloading model...') + await model.unload() + console.log('Model unloaded.') + releaseLogger() + } +} + +main().catch(err => { + console.error(err) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +}) diff --git a/packages/tts-ggml/examples/chatterbox-mtl-sweep-tts.js b/packages/tts-ggml/examples/chatterbox-mtl-sweep-tts.js new file mode 100644 index 
0000000000..86b68e6e37 --- /dev/null +++ b/packages/tts-ggml/examples/chatterbox-mtl-sweep-tts.js @@ -0,0 +1,140 @@ +'use strict' + +/** + * Chatterbox MULTILINGUAL TTS sweep demo for @qvac/tts-ggml. + * + * Loads the multilingual GGUFs (chatterbox-t3-mtl + chatterbox-s3gen-mtl) + * and synthesizes a short sweep of sentences across several languages + * back-to-back on the same engine instance, calling `model.reload({ language })` + * between sentences to flip the tokenizer language tag. Useful to spot + * regressions across the tier-1 set. + * + * For the recommended single-sentence entry point with automatic + * language detection, see chatterbox-mtl-tts.js (npm run example:chatterbox-mtl). + * + * Usage: + * bare examples/chatterbox-mtl-sweep-tts.js [path/to/reference.wav] + * + * Examples: + * bare examples/chatterbox-mtl-sweep-tts.js + * bare examples/chatterbox-mtl-sweep-tts.js ~/voices/me.wav + * + * Expects the multilingual GGUF files at: + * models/chatterbox-t3-mtl.gguf + * models/chatterbox-s3gen-mtl.gguf + * + * Convert models with `npm run setup-models`. The English turbo + * variant (chatterbox-t3-turbo + chatterbox-s3gen) lives in + * chatterbox-tts.js. + */ + +const fs = require('bare-fs') +const path = require('bare-path') +const TTSGgml = require('../') +const { createWav } = require('./wav-helper') +const { setLogger, releaseLogger } = require('../addonLogging') + +const CHATTERBOX_SAMPLE_RATE = 24000 + +const argv = global.Bare ? 
global.Bare.argv : process.argv +const refAudioArg = argv[2] + +const pkgRoot = path.join(__dirname, '..') +const modelDir = path.join(pkgRoot, 'models') +const t3Model = path.join(modelDir, 'chatterbox-t3-mtl.gguf') +const s3genModel = path.join(modelDir, 'chatterbox-s3gen-mtl.gguf') + +for (const f of [t3Model, s3genModel]) { + if (!fs.existsSync(f)) { + console.error(`Missing model file: ${f}`) + console.error('Run "npm run setup-models" to set up the venv and convert the multilingual Chatterbox checkpoint to GGUF.') + if (global.Bare) global.Bare.exit(1) + else process.exit(1) + } +} + +if (refAudioArg && !fs.existsSync(refAudioArg)) { + console.error(`Reference audio not found: ${refAudioArg}`) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +} + +if (!refAudioArg) { + console.log('No reference audio provided, using the voice baked into the S3Gen GGUF.') +} + +const SENTENCES = [ + { lang: 'en', text: 'Hello from the multilingual Chatterbox engine.' }, + { lang: 'es', text: 'El zorro marrón salta sobre el perro perezoso.' }, + { lang: 'fr', text: 'Le renard brun saute par-dessus le chien paresseux.' }, + { lang: 'de', text: 'Der braune Fuchs springt über den faulen Hund.' }, + { lang: 'pt', text: 'A raposa marrom pula sobre o cachorro preguiçoso.' } +] + +async function main () { + setLogger((priority, message) => { + if (priority > 1) return + const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' } + const name = names[priority] || 'UNKNOWN' + console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`) + }) + + const model = new TTSGgml({ + files: { t3Model, s3genModel }, + ...(refAudioArg ? 
{ referenceAudio: refAudioArg } : {}), + config: { language: SENTENCES[0].lang }, + logger: console, + opts: { stats: true } + }) + + try { + console.log('Loading Chatterbox MTL TTS model...') + await model.load() + console.log('Model loaded.\n') + + for (let i = 0; i < SENTENCES.length; i++) { + const { lang, text } = SENTENCES[i] + const preview = text.length > 80 ? `${text.slice(0, 80)}…` : text + console.log(`--- ${i + 1}/${SENTENCES.length} [${lang}] "${preview}"`) + + if (i > 0) { + await model.reload({ language: lang }) + } + + const response = await model.run({ input: text, type: 'text' }) + const buffer = [] + await response + .onUpdate(data => { + if (data && data.outputArray) { + for (const s of data.outputArray) buffer.push(s) + } + }) + .await() + + if (response.stats) { + const s = response.stats + console.log( + ` samples=${buffer.length} duration=${s.audioDurationMs}ms rtf=${s.realTimeFactor?.toFixed(3)} synth=${s.totalTime?.toFixed(2)}s` + ) + } + + const out = path.join(__dirname, `chatterbox-mtl-sweep-${lang}.wav`) + createWav(buffer, CHATTERBOX_SAMPLE_RATE, out) + console.log(` wrote ${path.relative(pkgRoot, out)}\n`) + } + } catch (err) { + console.error('Error during MTL TTS processing:', err) + throw err + } finally { + console.log('Unloading model...') + await model.unload() + console.log('Model unloaded.') + releaseLogger() + } +} + +main().catch(err => { + console.error(err) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +}) diff --git a/packages/tts-ggml/examples/chatterbox-mtl-tts.js b/packages/tts-ggml/examples/chatterbox-mtl-tts.js new file mode 100644 index 0000000000..2bdc810ddb --- /dev/null +++ b/packages/tts-ggml/examples/chatterbox-mtl-tts.js @@ -0,0 +1,182 @@ +'use strict' + +/** + * Chatterbox MULTILINGUAL TTS for @qvac/tts-ggml (auto language detect). 
+ * + * Loads the multilingual GGUFs (chatterbox-t3-mtl + chatterbox-s3gen-mtl) + * and synthesizes a single sentence whose language is auto-detected via + * @qvac/langdetect-text. Falls back to "en" with a warning when the + * detected code isn't in the MTL tier-1 set or when detection is + * undetermined. Mirrors the API surface of chatterbox-tts.js: pass the + * sentence on the command line, optionally followed by a reference wav. + * + * Usage: + * bare examples/chatterbox-mtl-tts.js "" [path/to/reference.wav] + * + * Examples: + * bare examples/chatterbox-mtl-tts.js "Hello from the multilingual Chatterbox engine." + * bare examples/chatterbox-mtl-tts.js "El zorro marron salta sobre el perro perezoso." + * bare examples/chatterbox-mtl-tts.js "Bonjour tout le monde." ~/voices/me.wav + * + * Expects the multilingual GGUF files at: + * models/chatterbox-t3-mtl.gguf + * models/chatterbox-s3gen-mtl.gguf + * + * Convert models with `npm run setup-models`. For a back-to-back sweep + * across the tier-1 set see chatterbox-mtl-sweep-tts.js; for the English + * Turbo variant see chatterbox-tts.js. + */ + +const fs = require('bare-fs') +const path = require('bare-path') +const { detectOne, detectMultiple } = require('@qvac/langdetect-text') +const TTSGgml = require('../') +const { createWav } = require('./wav-helper') +const { setLogger, releaseLogger } = require('../addonLogging') + +const CHATTERBOX_SAMPLE_RATE = 24000 + +const SUPPORTED_MTL_LANGUAGES = new Set([ + 'en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'pl', 'tr', + 'sv', 'da', 'fi', 'no', 'el', 'ms', 'sw', 'ar', 'ko' +]) + +const argv = global.Bare ? 
global.Bare.argv : process.argv +const textArg = argv[2] +const refAudioArg = argv[3] + +if (!textArg || typeof textArg !== 'string' || textArg.trim().length === 0) { + console.error('Usage: chatterbox-mtl-tts.js "" [path/to/reference.wav]') + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +} + +const pkgRoot = path.join(__dirname, '..') +const modelDir = path.join(pkgRoot, 'models') +const t3Model = path.join(modelDir, 'chatterbox-t3-mtl.gguf') +const s3genModel = path.join(modelDir, 'chatterbox-s3gen-mtl.gguf') + +for (const f of [t3Model, s3genModel]) { + if (!fs.existsSync(f)) { + console.error(`Missing model file: ${f}`) + console.error('Run "npm run setup-models" to set up the venv and convert the multilingual Chatterbox checkpoint to GGUF.') + if (global.Bare) global.Bare.exit(1) + else process.exit(1) + } +} + +if (refAudioArg && !fs.existsSync(refAudioArg)) { + console.error(`Reference audio not found: ${refAudioArg}`) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +} + +if (!refAudioArg) { + console.log('No reference audio provided, using the voice baked into the S3Gen GGUF.') +} + +function selectLanguage (text) { + const detected = detectOne(text) || {} + const rawCode = typeof detected.code === 'string' ? detected.code.toLowerCase() : 'und' + const detectedName = typeof detected.language === 'string' ? detected.language : 'unknown' + + if (SUPPORTED_MTL_LANGUAGES.has(rawCode)) { + return { code: rawCode, detectedCode: rawCode, detectedName, fallbackReason: '' } + } + + // Top-1 wasn't supported. tinyld often misclassifies short Romance + // sentences with brand terms as Latin / Undetermined — scan the + // top-K and pick the highest-ranked supported candidate before + // surrendering to English. + let topK = [] + try { topK = detectMultiple(text, 5) || [] } catch (_e) {} + for (const c of topK) { + const code = typeof c.code === 'string' ? 
c.code.toLowerCase() : '' + if (SUPPORTED_MTL_LANGUAGES.has(code)) { + return { + code, + detectedCode: rawCode, + detectedName, + fallbackReason: `top-1 "${rawCode}" not in tier-1 set; using highest-ranked supported candidate "${code}"` + } + } + } + + const fallbackReason = rawCode === 'und' + ? 'language detection was undetermined and no supported candidate found; falling back to English' + : `language "${rawCode}" is not in the MTL tier-1 set and no supported candidate found; falling back to English` + + return { code: 'en', detectedCode: rawCode, detectedName, fallbackReason } +} + +async function main () { + setLogger((priority, message) => { + if (priority > 1) return + const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' } + const name = names[priority] || 'UNKNOWN' + console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`) + }) + + const selection = selectLanguage(textArg) + const outputFile = path.join(__dirname, `chatterbox-mtl-${selection.code}.wav`) + + console.log(`Input text: "${textArg}"`) + console.log(`Detected language: ${selection.detectedName} (${selection.detectedCode})`) + console.log(`Effective TTS language: ${selection.code}`) + if (selection.fallbackReason) { + console.warn(`Language fallback: ${selection.fallbackReason}`) + } + console.log(`Output file: ${outputFile}\n`) + + const model = new TTSGgml({ + files: { t3Model, s3genModel }, + ...(refAudioArg ? 
{ referenceAudio: refAudioArg } : {}), + config: { language: selection.code }, + logger: console, + opts: { stats: true } + }) + + try { + console.log('Loading Chatterbox MTL TTS model...') + await model.load() + console.log('Model loaded.') + + console.log(`Running TTS on: "${textArg}"`) + const response = await model.run({ input: textArg, type: 'text' }) + + console.log('Waiting for TTS results...') + let buffer = [] + + await response + .onUpdate(data => { + if (data && data.outputArray) { + buffer = buffer.concat(Array.from(data.outputArray)) + } + }) + .await() + + console.log('TTS finished!') + if (response.stats) { + const s = response.stats + console.log(`Inference stats: totalTime=${s.totalTime.toFixed(2)}s, tokensPerSecond=${s.tokensPerSecond.toFixed(2)}, realTimeFactor=${s.realTimeFactor.toFixed(2)}, audioDuration=${s.audioDurationMs}ms, totalSamples=${s.totalSamples}`) + } + + console.log('\nWriting to .wav file...') + createWav(buffer, CHATTERBOX_SAMPLE_RATE, outputFile) + console.log(`Finished writing to ${outputFile}`) + } catch (err) { + console.error('Error during TTS processing:', err) + throw err + } finally { + console.log('Unloading model...') + await model.unload() + console.log('Model unloaded.') + releaseLogger() + } +} + +main().catch(err => { + console.error(err) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +}) diff --git a/packages/tts-ggml/examples/chatterbox-sentence-stream-tts.js b/packages/tts-ggml/examples/chatterbox-sentence-stream-tts.js new file mode 100644 index 0000000000..306080eee0 --- /dev/null +++ b/packages/tts-ggml/examples/chatterbox-sentence-stream-tts.js @@ -0,0 +1,185 @@ +'use strict' + +/** + * Chatterbox (ggml) — sentence-granularity streaming. + * + * Streams *sentences in* and emits *one audio chunk per sentence out*. 
+ * The chunking lives in the JS layer (TTSGgml.runStreaming): each + * yielded sentence triggers a full batch synthesize on the C++ side and + * the resulting PCM is published as a single `onUpdate` event. + * + * For sub-sentence native chunk streaming (one utterance split into + * many PCM events as the C++ engine produces them), see + * `chatterbox-chunk-stream-tts.js`. + * + * Usage: + * bare examples/chatterbox-sentence-stream-tts.js [path/to/reference.wav] + * + * Expects the two Chatterbox GGUF files at: + * models/chatterbox-t3-turbo.gguf + * models/chatterbox-s3gen.gguf + * + * Reference audio is optional; when omitted the built-in voice embedded + * in the S3Gen GGUF is used. + */ + +const fs = require('bare-fs') +const path = require('bare-path') +const TTSGgml = require('../') +const { createWav } = require('./wav-helper') +const { setLogger, releaseLogger } = require('../addonLogging') +const { canPlayPcmChunks, createStreamingPlayer } = require('./pcm-chunk-player') + +const CHATTERBOX_SAMPLE_RATE = 24000 +const BETWEEN_SENTENCE_MS = 200 + +function delay (ms) { + return new Promise(resolve => setTimeout(resolve, ms)) +} + +const argv = global.Bare ? 
global.Bare.argv : process.argv
+const refAudioArg = argv[2]
+
+const pkgRoot = path.join(__dirname, '..')
+const modelDir = path.join(pkgRoot, 'models')
+const t3Model = path.join(modelDir, 'chatterbox-t3-turbo.gguf')
+const s3genModel = path.join(modelDir, 'chatterbox-s3gen.gguf')
+
+// Fail fast with a setup hint when either Chatterbox GGUF is missing.
+for (const f of [t3Model, s3genModel]) {
+  if (!fs.existsSync(f)) {
+    console.error(`Missing model file: ${f}`)
+    console.error('Run "npm run setup-models" to set up the venv + convert the Resemble Chatterbox checkpoint to GGUF.')
+    if (global.Bare) global.Bare.exit(1)
+    else process.exit(1)
+  }
+}
+
+if (refAudioArg && !fs.existsSync(refAudioArg)) {
+  console.error(`Reference audio not found: ${refAudioArg}`)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+if (!refAudioArg) {
+  console.log('No reference audio provided, using the voice baked into the S3Gen GGUF.')
+}
+
+/**
+ * Streams three fixed sentences into `model.runStreaming`, forwards each
+ * synthesized PCM update to a streaming player when one is available,
+ * and writes the concatenated audio to a WAV file next to this script.
+ *
+ * Processing errors are logged and re-thrown; the model is always
+ * unloaded and the logger released in the `finally` block.
+ */
+async function main () {
+  // Only ERROR (0) and WARNING (1) native log lines are printed.
+  setLogger((priority, message) => {
+    if (priority > 1) return
+    const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' }
+    const name = names[priority] || 'UNKNOWN'
+    console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`)
+  })
+
+  const sentences = [
+    'First sentence of the script.',
+    'The second arrives after a short pause.',
+    'Audio output still streams in chunks on each update.'
+  ]
+
+  console.log(`Sentence-by-sentence input (${sentences.length} sentences), streaming PCM output.\n`)
+
+  const model = new TTSGgml({
+    files: { modelDir },
+    ...(refAudioArg ? { referenceAudio: refAudioArg } : {}),
+    config: { language: 'en' },
+    logger: console,
+    opts: { stats: true }
+  })
+
+  const outputFile = path.join(__dirname, 'chatterbox-sentence-stream-output.wav')
+
+  try {
+    console.log('Loading Chatterbox TTS model...')
+    await model.load()
+    console.log('Model loaded.')
+
+    const player = canPlayPcmChunks()
+      ? createStreamingPlayer({ sampleRate: CHATTERBOX_SAMPLE_RATE })
+      : null
+    if (player) {
+      console.log(`Streaming playback via ${player.backend}: chunks flow to stdin as they arrive.`)
+    } else {
+      console.warn(
+        'No supported player found (install ffmpeg / sox / alsa-utils). Chunks will be logged only.'
+      )
+    }
+
+    // Yields the sentences one at a time, pausing between them.
+    async function * sentencesOverTime () {
+      for (let i = 0; i < sentences.length; i++) {
+        if (i > 0) {
+          await delay(BETWEEN_SENTENCE_MS)
+        }
+        const s = sentences[i]
+        const preview = s.length > 60 ? `${s.slice(0, 60)}…` : s
+        console.log(`[stream in] sentence ${i}: "${preview}"`)
+        yield s
+      }
+    }
+
+    // All PCM received so far, kept for the final WAV. Samples are
+    // appended in place: rebuilding the array with concat() on every
+    // update would copy all previously received audio again.
+    const pcmConcat = []
+    let chunkCount = 0
+
+    const response = await model.runStreaming(sentencesOverTime(), {
+      flushAfterMs: 500
+    })
+
+    await response
+      .onUpdate(data => {
+        if (data && data.outputArray) {
+          const samples = Array.from(data.outputArray)
+          for (const sample of samples) pcmConcat.push(sample)
+          chunkCount += 1
+
+          const idx = data.chunkIndex
+          const preview =
+            typeof data.sentenceChunk === 'string'
+              ? data.sentenceChunk.slice(0, 80).replace(/\s+/g, ' ')
+              : ''
+          if (idx !== undefined) {
+            console.log(
+              `[stream out] synthesis ${idx}: ${samples.length} samples; accumulated text: "${preview}${preview.length >= 80 ? '…' : ''}"`
+            )
+          } else {
+            console.log(`Audio update: ${samples.length} samples (no chunk metadata)`)
+          }
+
+          if (player) player.write(samples)
+        }
+      })
+      .await()
+
+    console.log(`Inference finished! (${chunkCount} synthesis chunk(s))`)
+    if (player) {
+      console.log('Waiting for playback to finish...')
+      await player.end()
+      console.log('Playback finished!')
+    }
+
+    if (response.stats) {
+      const s = response.stats
+      console.log(`Inference stats: totalTime=${s.totalTime?.toFixed(2)}s, tokensPerSecond=${s.tokensPerSecond?.toFixed(2)}, realTimeFactor=${s.realTimeFactor?.toFixed(2)}, audioDuration=${s.audioDurationMs}ms, totalSamples=${s.totalSamples}`)
+    }
+
+    if (pcmConcat.length > 0) {
+      console.log(`\nWriting concatenated PCM to ${outputFile}`)
+      createWav(pcmConcat, CHATTERBOX_SAMPLE_RATE, outputFile)
+      console.log('Done.')
+    }
+  } catch (err) {
+    console.error('Error during TTS processing:', err)
+    throw err
+  } finally {
+    console.log('Unloading model...')
+    await model.unload()
+    console.log('Model unloaded.')
+    releaseLogger()
+  }
+}
+
+main().catch(err => {
+  console.error(err)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+})
diff --git a/packages/tts-ggml/examples/chatterbox-tts.js b/packages/tts-ggml/examples/chatterbox-tts.js
new file mode 100644
index 0000000000..8dcaec8697
--- /dev/null
+++ b/packages/tts-ggml/examples/chatterbox-tts.js
@@ -0,0 +1,131 @@
+'use strict'
+
+/**
+ * End-to-end Chatterbox TTS batch synthesis for @qvac/tts-ggml.
+ *
+ * Usage:
+ *   bare examples/chatterbox-tts.js "text to synthesize" [path/to/reference.wav]
+ *
+ * Examples:
+ *   bare examples/chatterbox-tts.js "Hello from qvac-tts ggml"
+ *   bare examples/chatterbox-tts.js "Quick brown fox" ~/voices/me.wav
+ *
+ * Expects the two Chatterbox turbo GGUF files at:
+ *   models/chatterbox-t3-turbo.gguf
+ *   models/chatterbox-s3gen.gguf
+ *
+ * For sentence-level streaming see chatterbox-sentence-stream-tts.js,
+ * for sub-sentence native streaming see chatterbox-chunk-stream-tts.js.
+ * Multilingual variant (chatterbox-t3-mtl + chatterbox-s3gen-mtl)
+ * lives in chatterbox-mtl-tts.js.
+ *
+ * Convert models with `npm run setup-models` (uses scripts/setup-venv.sh
+ * + scripts/convert-models.sh against the upstream Resemble Chatterbox
+ * checkpoints).
+ */
+
+const fs = require('bare-fs')
+const path = require('bare-path')
+const TTSGgml = require('../')
+const { createWav } = require('./wav-helper')
+const { setLogger, releaseLogger } = require('../addonLogging')
+
+const CHATTERBOX_SAMPLE_RATE = 24000
+
+const argv = global.Bare ? global.Bare.argv : process.argv
+const textArg = argv[2]
+const refAudioArg = argv[3]
+
+if (!textArg || typeof textArg !== 'string' || textArg.trim().length === 0) {
+  // Placeholder matches the usage line in the header comment of this file.
+  console.error('Usage: chatterbox-tts.js "text to synthesize" [path/to/reference.wav]')
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+const pkgRoot = path.join(__dirname, '..')
+const modelDir = path.join(pkgRoot, 'models')
+const t3Model = path.join(modelDir, 'chatterbox-t3-turbo.gguf')
+const s3genModel = path.join(modelDir, 'chatterbox-s3gen.gguf')
+
+// Fail fast with a setup hint when either Chatterbox GGUF is missing.
+for (const f of [t3Model, s3genModel]) {
+  if (!fs.existsSync(f)) {
+    console.error(`Missing model file: ${f}`)
+    console.error('Run "npm run setup-models" (sets up the venv and converts the upstream Resemble Chatterbox checkpoint).')
+    if (global.Bare) global.Bare.exit(1)
+    else process.exit(1)
+  }
+}
+
+if (refAudioArg && !fs.existsSync(refAudioArg)) {
+  console.error(`Reference audio not found: ${refAudioArg}`)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+if (!refAudioArg) {
+  console.log('No reference audio provided, using the voice baked into the S3Gen GGUF.')
+}
+
+/**
+ * Synthesizes `textArg` in one batch run, collects every PCM update and
+ * writes the result to `chatterbox-output.wav` next to this script.
+ *
+ * Processing errors are logged and re-thrown; the model is always
+ * unloaded and the logger released in the `finally` block.
+ */
+async function main () {
+  // Only ERROR (0) and WARNING (1) native log lines are printed.
+  setLogger((priority, message) => {
+    if (priority > 1) return
+    const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' }
+    const name = names[priority] || 'UNKNOWN'
+    console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`)
+  })
+
+  const outputFile = path.join(__dirname, 'chatterbox-output.wav')
+
+  const model = new TTSGgml({
+    files: { modelDir },
+    ...(refAudioArg ? { referenceAudio: refAudioArg } : {}),
+    config: { language: 'en' },
+    logger: console,
+    opts: { stats: true }
+  })
+
+  try {
+    console.log('Loading Chatterbox TTS model...')
+    await model.load()
+    console.log('Model loaded.')
+
+    console.log(`Running TTS on: "${textArg}"`)
+
+    const response = await model.run({ input: textArg, type: 'text' })
+
+    console.log('Waiting for TTS results...')
+    // Samples are appended in place; concat() per update would re-copy
+    // everything received so far.
+    const buffer = []
+
+    await response
+      .onUpdate(data => {
+        if (data && data.outputArray) {
+          for (const sample of data.outputArray) buffer.push(sample)
+        }
+      })
+      .await()
+
+    console.log('TTS finished!')
+    if (response.stats) {
+      const s = response.stats
+      // Optional chaining: a missing stats field must not throw here,
+      // otherwise the WAV below is never written.
+      console.log(`Inference stats: totalTime=${s.totalTime?.toFixed(2)}s, tokensPerSecond=${s.tokensPerSecond?.toFixed(2)}, realTimeFactor=${s.realTimeFactor?.toFixed(2)}, audioDuration=${s.audioDurationMs}ms, totalSamples=${s.totalSamples}`)
+    }
+
+    console.log('\nWriting to .wav file...')
+    createWav(buffer, CHATTERBOX_SAMPLE_RATE, outputFile)
+    console.log(`Finished writing to ${outputFile}`)
+  } catch (err) {
+    console.error('Error during TTS processing:', err)
+    throw err
+  } finally {
+    console.log('Unloading model...')
+    await model.unload()
+    console.log('Model unloaded.')
+    releaseLogger()
+  }
+}
+
+main().catch(err => {
+  console.error(err)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+})
diff --git a/packages/tts-ggml/examples/pcm-chunk-player.js b/packages/tts-ggml/examples/pcm-chunk-player.js
new file mode 100644
index 0000000000..289b66e9e2
--- /dev/null
+++ b/packages/tts-ggml/examples/pcm-chunk-player.js
@@ -0,0 +1,373 @@
+'use strict'
+
+/**
+ * Streaming audio player for raw int16 PCM chunks.
+ *
+ * Spawns a single long-running child process that reads raw `s16le` PCM
+ * @ 24 kHz mono from stdin and plays it to the default output.
Chunks
+ * are written to the process's stdin as they arrive, so there are no
+ * per-chunk startup gaps — playback is continuous across the whole
+ * synthesis.
+ *
+ * Contrast with the older per-chunk-afplay approach, which writes each
+ * chunk to a tmp wav, spawns `afplay`, waits for it to exit, deletes
+ * the wav, repeat: that path adds ~150-300 ms of dead air between every
+ * chunk and is unusable for sub-second chunks (breaks words mid-stream).
+ *
+ * Supported backends (picked in this order; on macOS `play` is tried
+ * before `ffplay`, see createStreamingPlayer):
+ *   1. `ffplay` (ffmpeg, any platform) — `-f s16le -i -`
+ *   2. `play` (sox, any platform) — `-t raw -r 24000 ...`
+ *   3. `aplay` (Linux ALSA) — `-t raw -f S16_LE ...`
+ *   4. Per-chunk `afplay` fallback (macOS only; has the gap problem)
+ *
+ * Old per-chunk helpers (`playInt16Chunk`, `playInt16ChunkSync`) are
+ * kept for back-compat but should not be used for native sub-second
+ * streaming.
+ */
+
+const fs = require('bare-fs')
+const os = require('bare-os')
+const path = require('bare-path')
+const { spawn, spawnSync } = require('bare-subprocess')
+const { createWav } = require('./wav-helper')
+
+// Counter used to build unique temp-file names.
+let _seq = 0
+// Memoized detection results; `undefined` means "not probed yet".
+let _hasFfplay
+let _hasPlay
+let _hasAplay
+
+/**
+ * Runs `cmd args` synchronously with all stdio ignored and reports
+ * whether it exited with status 0. Any thrown error counts as failure.
+ */
+function syncOk (cmd, args) {
+  try {
+    const r = spawnSync(cmd, args, { stdio: ['ignore', 'ignore', 'ignore'] })
+    // NOTE(review): `| 0` coerces a null/undefined status to 0, so a
+    // result without a numeric exit status is treated as success —
+    // confirm bare-subprocess always reports a numeric `status`.
+    return (r.status | 0) === 0
+  } catch {
+    return false
+  }
+}
+
+/** Returns (and caches) whether `ffplay` can be executed. */
+function detectFfplay () {
+  if (_hasFfplay !== undefined) return _hasFfplay
+  _hasFfplay = syncOk('ffplay', ['-hide_banner', '-version'])
+  return _hasFfplay
+}
+
+/** Returns (and caches) whether sox `play` can be executed. */
+function detectPlay () {
+  if (_hasPlay !== undefined) return _hasPlay
+  _hasPlay = syncOk('play', ['--version'])
+  return _hasPlay
+}
+
+/** Returns (and caches) whether `aplay` can be executed; Linux only. */
+function detectAplay () {
+  if (_hasAplay !== undefined) return _hasAplay
+  _hasAplay = os.platform() === 'linux' && syncOk('aplay', ['--version'])
+  return _hasAplay
+}
+
+/**
+ * True when some playback backend is usable: ffplay, sox or aplay, or
+ * the platform is macOS (where the afplay fallback is used).
+ */
+function canPlayPcmChunks () {
+  if (detectFfplay()) return true
+  if (detectPlay()) return true
+  if (detectAplay()) return true
+  if (os.platform() === 'darwin') return true
+  return false
+}
+
+/**
+ * Returns a Buffer view over the samples as an Int16Array. An existing
+ * Int16Array is viewed without copying; anything else is converted with
+ * `Int16Array.from` first.
+ */
+function toInt16Buffer (samples) {
+  const arr = samples instanceof Int16Array
+    ? samples
+    : Int16Array.from(samples)
+  return Buffer.from(arr.buffer, arr.byteOffset, arr.byteLength)
+}
+
+/** Deletes `p`, ignoring any error (best-effort temp-file cleanup). */
+function unlinkQuiet (p) {
+  try {
+    fs.unlinkSync(p)
+  } catch (_) {}
+}
+
+/**
+ * Spawns a child process and resolves with its exit code. Rejects when
+ * `spawn` throws or the child emits 'error'.
+ */
+function spawnAsync (cmd, args, opts) {
+  return new Promise((resolve, reject) => {
+    try {
+      const child = spawn(cmd, args, opts)
+      child.on('exit', (code) => resolve(code))
+      child.on('error', reject)
+    } catch (err) {
+      reject(err)
+    }
+  })
+}
+
+/**
+ * Open a streaming player pipe. Returns `{ write(samples), end() }`,
+ * or `null` when no backend is available.
+ * `write` buffers the chunk and passes it to the player immediately;
+ * `end` closes stdin and resolves after the player has drained and
+ * exited, so `await player.end()` guarantees every chunk has finished
+ * playing before it returns.
+ *
+ * `sampleRate` defaults to 24000 (Chatterbox native rate).
+ */
+function createStreamingPlayer ({ sampleRate = 24000, channels = 1 } = {}) {
+  // sox `play` is preferred on darwin: on some macOS builds ffplay's
+  // SDL output is silent for raw-piped audio; sox uses CoreAudio
+  // directly and works reliably. (qvac-tts.cpp's README documents
+  // the same caveat for its `--out -` CLI mode.)
+  const preferSox = os.platform() === 'darwin'
+  const order = preferSox
+    ? [trySox, tryFfplay, tryAplay]
+    : [tryFfplay, trySox, tryAplay]
+  for (const build of order) {
+    const p = build(sampleRate, channels)
+    if (p) return p
+  }
+  if (os.platform() === 'darwin') return createAfplayFallback(sampleRate)
+  return null
+}
+
+/** Starts a sox `play` streaming player, or returns null if unavailable. */
+function trySox (sampleRate, channels) {
+  if (!detectPlay()) return null
+  const args = [
+    '-q',
+    '-t', 'raw',
+    '-r', String(sampleRate),
+    '-b', '16',
+    '-e', 'signed',
+    '-c', String(channels),
+    '-'
+  ]
+  return spawnStreamingPlayer('play', args)
+}
+
+/** Starts an `ffplay` streaming player, or returns null if unavailable. */
+function tryFfplay (sampleRate, channels) {
+  if (!detectFfplay()) return null
+  const args = [
+    '-hide_banner',
+    '-loglevel', 'error',
+    '-nodisp',
+    '-autoexit',
+    '-f', 's16le',
+    '-ar', String(sampleRate),
+    '-ac', String(channels),
+    '-i', 'pipe:0'
+  ]
+  return spawnStreamingPlayer('ffplay', args)
+}
+
+/** Starts an `aplay` streaming player, or returns null if unavailable. */
+function tryAplay (sampleRate, channels) {
+  if (!detectAplay()) return null
+  const args = [
+    '-q',
+    '-t', 'raw',
+    '-f', 'S16_LE',
+    '-r', String(sampleRate),
+    '-c', String(channels)
+  ]
+  return spawnStreamingPlayer('aplay', args)
+}
+
+/**
+ * Spawns `cmd` with a piped stdin and wraps it as a player object
+ * `{ backend, write(samples), end() }`.
+ *
+ * Returns null when the process cannot be spawned, so the caller can
+ * try the next backend instead of crashing.
+ */
+function spawnStreamingPlayer (cmd, args) {
+  let child
+  try {
+    child = spawn(cmd, args, { stdio: ['pipe', 'ignore', 'pipe'] })
+  } catch (_) {
+    return null
+  }
+  let exited = false
+  let stdinBroken = false
+  let exitResolve
+  const exitPromise = new Promise((resolve) => { exitResolve = resolve })
+  child.on('exit', () => {
+    exited = true
+    exitResolve()
+  })
+  child.on('error', () => {
+    exited = true
+    exitResolve()
+  })
+  // Write failures on a pipe (e.g. the player died mid-stream) surface
+  // asynchronously as an 'error' event on stdin, which the try/catch in
+  // write() cannot intercept. Without a listener the error is unhandled.
+  if (child.stdin) {
+    child.stdin.on('error', () => { stdinBroken = true })
+  }
+  // stderr is piped, so it must be consumed: an unread pipe can fill up
+  // and stall the player. The output itself is discarded.
+  if (child.stderr) {
+    child.stderr.on('error', () => {})
+    child.stderr.on('data', () => {})
+  }
+  return {
+    backend: cmd,
+    write (samples) {
+      if (exited || stdinBroken) return
+      const buf = toInt16Buffer(samples)
+      if (buf.length === 0) return
+      try {
+        child.stdin.write(buf)
+      } catch (_) {
+        // stdin may be closed if the player died early; swallow.
+      }
+    },
+    async end () {
+      try {
+        child.stdin.end()
+      } catch (_) {}
+      await exitPromise
+    }
+  }
+}
+
+/**
+ * macOS-only fallback: one afplay per chunk. Keeps the last audio
+ * working when neither ffplay/sox/aplay is installed, but has
+ * per-chunk gaps (~150-300 ms) that break sub-second streaming.
A
+ * one-time warning is printed the first time it's used.
+ *
+ * Returns `{ backend, write(samples), end() }`. Chunks are queued and
+ * played one after another; `end()` resolves once the queue is empty.
+ * A chunk that fails to play is skipped so the queue keeps draining.
+ */
+let _afplayWarned = false
+function createAfplayFallback (sampleRate) {
+  if (!_afplayWarned) {
+    console.warn(
+      '[pcm-chunk-player] No ffplay/sox/aplay found; falling back to per-chunk afplay. ' +
+      'Install ffmpeg or sox for gapless streaming playback: `brew install ffmpeg` or `brew install sox`.'
+    )
+    _afplayWarned = true
+  }
+  const queue = []
+  let draining = false
+  let endResolve
+  const donePromise = new Promise((resolve) => { endResolve = resolve })
+  let ended = false
+
+  // Plays queued chunks sequentially. Never rejects: write() calls this
+  // without awaiting, and a rejection here used to leave `draining`
+  // stuck at true, so end() would wait forever.
+  async function drain () {
+    if (draining) return
+    draining = true
+    try {
+      while (queue.length > 0) {
+        const samples = queue.shift()
+        const id = `${Date.now()}-${++_seq}`
+        const tmpWav = path.join(os.tmpdir(), `qvac-tts-stream-${id}.wav`)
+        try {
+          createWav(Array.from(samples), sampleRate, tmpWav)
+          await spawnAsync('afplay', [tmpWav], { stdio: 'ignore' })
+        } catch (_) {
+          // Best-effort playback: skip this chunk and keep going.
+        } finally {
+          unlinkQuiet(tmpWav)
+        }
+      }
+    } finally {
+      draining = false
+      if (ended) endResolve()
+    }
+  }
+
+  return {
+    backend: 'afplay (per-chunk fallback)',
+    write (samples) {
+      const arr = samples instanceof Int16Array ? samples : Int16Array.from(samples)
+      if (arr.length === 0) return
+      queue.push(arr)
+      drain()
+    },
+    async end () {
+      ended = true
+      if (!draining && queue.length === 0) endResolve()
+      await donePromise
+    }
+  }
+}
+
+// ---------------------------------------------------------------------
+// Legacy per-chunk helpers (kept for back-compat with older examples).
+// ---------------------------------------------------------------------
+
+/**
+ * Plays one chunk synchronously through a temp file: afplay on macOS,
+ * otherwise ffplay, otherwise aplay. Does nothing for an empty chunk or
+ * when no backend is found. The temp file is removed even if playback
+ * throws.
+ */
+function playInt16ChunkSync (samples, sampleRate) {
+  const arr = samples instanceof Int16Array ? samples : Int16Array.from(samples)
+  if (arr.length === 0) return
+
+  const id = `${Date.now()}-${++_seq}`
+  const tmpDir = os.tmpdir()
+  const plat = os.platform()
+
+  if (plat === 'darwin') {
+    const tmpWav = path.join(tmpDir, `qvac-tts-stream-${id}.wav`)
+    try {
+      createWav(Array.from(arr), sampleRate, tmpWav)
+      spawnSync('afplay', [tmpWav], { stdio: 'ignore' })
+    } finally {
+      unlinkQuiet(tmpWav)
+    }
+    return
+  }
+  if (detectFfplay()) {
+    const tmpWav = path.join(tmpDir, `qvac-tts-stream-${id}.wav`)
+    try {
+      createWav(Array.from(arr), sampleRate, tmpWav)
+      spawnSync(
+        'ffplay',
+        ['-nodisp', '-autoexit', '-loglevel', 'error', '-i', tmpWav],
+        { stdio: 'ignore' }
+      )
+    } finally {
+      unlinkQuiet(tmpWav)
+    }
+    return
+  }
+  if (detectAplay()) {
+    const rawPath = path.join(tmpDir, `qvac-tts-stream-${id}.raw`)
+    try {
+      fs.writeFileSync(rawPath, toInt16Buffer(arr))
+      spawnSync(
+        'aplay',
+        ['-q', '-t', 'raw', '-f', 'S16_LE', '-r', String(sampleRate), '-c', '1', rawPath],
+        { stdio: 'ignore' }
+      )
+    } finally {
+      unlinkQuiet(rawPath)
+    }
+  }
+}
+
+/**
+ * Async variant of playInt16ChunkSync: resolves once the player process
+ * for this chunk has exited. The temp file is removed even if playback
+ * rejects.
+ */
+async function playInt16Chunk (samples, sampleRate) {
+  const arr = samples instanceof Int16Array ? samples : Int16Array.from(samples)
+  if (arr.length === 0) return
+
+  const id = `${Date.now()}-${++_seq}`
+  const tmpDir = os.tmpdir()
+  const plat = os.platform()
+
+  if (plat === 'darwin') {
+    const tmpWav = path.join(tmpDir, `qvac-tts-stream-${id}.wav`)
+    try {
+      createWav(Array.from(arr), sampleRate, tmpWav)
+      await spawnAsync('afplay', [tmpWav], { stdio: 'ignore' })
+    } finally {
+      unlinkQuiet(tmpWav)
+    }
+    return
+  }
+  if (detectFfplay()) {
+    const tmpWav = path.join(tmpDir, `qvac-tts-stream-${id}.wav`)
+    try {
+      createWav(Array.from(arr), sampleRate, tmpWav)
+      await spawnAsync(
+        'ffplay',
+        ['-nodisp', '-autoexit', '-loglevel', 'error', '-i', tmpWav],
+        { stdio: 'ignore' }
+      )
+    } finally {
+      unlinkQuiet(tmpWav)
+    }
+    return
+  }
+  if (detectAplay()) {
+    const rawPath = path.join(tmpDir, `qvac-tts-stream-${id}.raw`)
+    try {
+      fs.writeFileSync(rawPath, toInt16Buffer(arr))
+      await spawnAsync(
+        'aplay',
+        ['-q', '-t', 'raw', '-f', 'S16_LE', '-r', String(sampleRate), '-c', '1', rawPath],
+        { stdio: 'ignore' }
+      )
+    } finally {
+      unlinkQuiet(rawPath)
+    }
+  }
+}
+
+/**
+ * Minimal async queue: `push(item)` enqueues, `end()` marks the queue
+ * finished, and `drain()` is an async generator that yields items in
+ * order and returns once the queue is ended and empty. Only one pending
+ * `drain()` consumer is tracked at a time.
+ */
+function createChunkQueue () {
+  const queue = []
+  let waiter = null
+  let done = false
+
+  function push (item) {
+    queue.push(item)
+    if (waiter) {
+      waiter()
+      waiter = null
+    }
+  }
+
+  function end () {
+    done = true
+    if (waiter) {
+      waiter()
+      waiter = null
+    }
+  }
+
+  async function * drain () {
+    while (true) {
+      if (queue.length > 0) {
+        yield queue.shift()
+        continue
+      }
+      if (done) return
+      await new Promise((resolve) => { waiter = resolve })
+    }
+  }
+
+  return { push, end, drain }
+}
+
+module.exports = {
+  canPlayPcmChunks,
+  createStreamingPlayer,
+  playInt16ChunkSync,
+  playInt16Chunk,
+  createChunkQueue
+}
diff --git a/packages/tts-ggml/examples/supertonic-mtl-sweep-tts.js b/packages/tts-ggml/examples/supertonic-mtl-sweep-tts.js
new file mode 100644
index 0000000000..fae6bfc4da
--- /dev/null
+++ b/packages/tts-ggml/examples/supertonic-mtl-sweep-tts.js
@@ -0,0 +1,137 @@
+'use strict'
+
+/**
+ * Supertonic MULTILINGUAL TTS sweep demo for @qvac/tts-ggml.
+ *
+ * Loads the multilingual Supertonic-2 GGUF (models/supertonic2.gguf) and
+ * synthesizes one canonical sentence per Supertonic-supported language
+ * back-to-back on the same engine instance, calling
+ * `model.reload({ language })` between sentences to flip the
+ * tokenizer / language-wrap mode. Useful to spot regressions across
+ * the (small) tier-1 set.
+ *
+ * Supertonic supports en/ko/es/pt/fr today (gated by tts-cpp's
+ * supertonic_preprocess.cpp::is_supported_language).
+ *
+ * For the recommended single-sentence entry point with automatic
+ * language detection, see supertonic-mtl-tts.js
+ * (npm run example:supertonic-mtl).
+ *
+ * Usage:
+ *   bare examples/supertonic-mtl-sweep-tts.js [voice]
+ *
+ * Examples:
+ *   bare examples/supertonic-mtl-sweep-tts.js
+ *   bare examples/supertonic-mtl-sweep-tts.js M1
+ *
+ * Expects the multilingual Supertonic GGUF at:
+ *   models/supertonic2.gguf
+ *
+ * Convert with `npm run setup-models` (or
+ * `bash scripts/convert-models.sh -t supertonic-mtl`). The
+ * English-pinned single-sentence entry point lives in supertonic-tts.js.
+ *
+ * NOTE: Supertonic is CPU-only in tts-cpp today. This example sets
+ * useGPU=false explicitly to match.
+ */
+
+const fs = require('bare-fs')
+const path = require('bare-path')
+const TTSGgml = require('../')
+const { createWav } = require('./wav-helper')
+const { setLogger, releaseLogger } = require('../addonLogging')
+
+const SUPERTONIC_SAMPLE_RATE = 44100
+
+const argv = global.Bare ? global.Bare.argv : process.argv
+const voiceArg = argv[2]
+
+const pkgRoot = path.join(__dirname, '..')
+const modelDir = path.join(pkgRoot, 'models')
+const supertonicModel = path.join(modelDir, 'supertonic2.gguf')
+
+// Abort early, with conversion instructions, if the GGUF is not there.
+if (!fs.existsSync(supertonicModel)) {
+  console.error(`Missing model file: ${supertonicModel}`)
+  console.error('Run "npm run setup-models" (or "bash scripts/convert-models.sh -t supertonic-mtl") to convert the Supertone Supertonic-2 ONNX bundle to GGUF.')
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+// One sentence per language; the first entry's language is used to
+// construct the model, later entries are reached via model.reload().
+const SENTENCES = [
+  { lang: 'en', text: 'Hello from the multilingual Supertonic engine.' },
+  { lang: 'es', text: 'El zorro marrón salta sobre el perro perezoso.' },
+  { lang: 'fr', text: 'Le renard brun saute par-dessus le chien paresseux.' },
+  { lang: 'pt', text: 'A raposa marrom pula sobre o cachorro preguiçoso.' },
+  { lang: 'ko', text: '안녕하세요, 다국어 슈퍼토닉 엔진입니다.' }
+]
+
+/**
+ * Runs the sweep: loads the model once, then for every entry in
+ * SENTENCES synthesizes the text and writes
+ * `supertonic-mtl-sweep-<lang>.wav` next to this script. Errors are
+ * logged and re-thrown; the model is unloaded and the logger released
+ * in all cases.
+ */
+async function main () {
+  // Print native log lines only for priorities 0 (ERROR) and 1 (WARNING).
+  setLogger((priority, message) => {
+    if (priority > 1) return
+    const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' }
+    const label = names[priority] || 'UNKNOWN'
+    console.log(`[${new Date().toISOString()}] [C++ log] [${label}]: ${message}`)
+  })
+
+  const voice = voiceArg || 'F1'
+
+  const model = new TTSGgml({
+    engine: TTSGgml.ENGINE_SUPERTONIC,
+    files: { supertonicModel },
+    voice,
+    config: { language: SENTENCES[0].lang, useGPU: false },
+    logger: console,
+    opts: { stats: true }
+  })
+
+  try {
+    console.log('Loading Supertonic MTL TTS model...')
+    await model.load()
+    console.log(`Model loaded. Voice=${voice}.\n`)
+
+    for (const [i, { lang, text }] of SENTENCES.entries()) {
+      let preview = text
+      if (text.length > 80) preview = `${text.slice(0, 80)}…`
+      console.log(`--- ${i + 1}/${SENTENCES.length} [${lang}] "${preview}"`)
+
+      // The first sentence uses the language the model was built with.
+      if (i > 0) {
+        await model.reload({ language: lang })
+      }
+
+      const response = await model.run({ input: text, type: 'text' })
+      const pcm = []
+      await response
+        .onUpdate(data => {
+          if (!data || !data.outputArray) return
+          for (const sample of data.outputArray) pcm.push(sample)
+        })
+        .await()
+
+      if (response.stats) {
+        const stats = response.stats
+        console.log(
+          ` samples=${pcm.length} duration=${stats.audioDurationMs}ms rtf=${stats.realTimeFactor?.toFixed(3)} synth=${stats.totalTime?.toFixed(2)}s`
+        )
+      }
+
+      const wavPath = path.join(__dirname, `supertonic-mtl-sweep-${lang}.wav`)
+      createWav(pcm, SUPERTONIC_SAMPLE_RATE, wavPath)
+      console.log(` wrote ${path.relative(pkgRoot, wavPath)}\n`)
+    }
+  } catch (err) {
+    console.error('Error during MTL TTS processing:', err)
+    throw err
+  } finally {
+    console.log('Unloading model...')
+    await model.unload()
+    console.log('Model unloaded.')
+    releaseLogger()
+  }
+}
+
+main().catch(err => {
+  console.error(err)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+})
diff --git a/packages/tts-ggml/examples/supertonic-mtl-tts.js b/packages/tts-ggml/examples/supertonic-mtl-tts.js
new file mode 100644
index 0000000000..80a1b6e9e2
--- /dev/null
+++ b/packages/tts-ggml/examples/supertonic-mtl-tts.js
@@ -0,0 +1,183 @@
+'use strict'
+
+/**
+ * Supertonic MULTILINGUAL TTS for @qvac/tts-ggml (auto language detect).
+ *
+ * Loads the multilingual Supertonic-2 GGUF (models/supertonic2.gguf,
+ * produced by `npm run setup-models` via
+ * convert-supertonic2-to-gguf.py --arch supertonic2) and synthesizes a
+ * single sentence whose language is auto-detected via
+ * @qvac/langdetect-text. Falls back to "en" with a warning when the
+ * detected code isn't in the Supertonic tier-1 set or when detection is
+ * undetermined. Mirrors the API surface of supertonic-tts.js: pass the
+ * sentence on the command line, optionally followed by a voice name.
+ *
+ * The English-only supertonic.gguf (Supertone/supertonic) is used by
+ * the simpler supertonic-tts.js example; this MTL example uses
+ * supertonic2.gguf instead.
+ *
+ * Supertonic supports a much smaller language set than Chatterbox MTL:
+ *   en, ko, es, pt, fr
+ * (gated by tts-cpp's supertonic_preprocess.cpp::is_supported_language).
+ *
+ * Usage:
+ *   bare examples/supertonic-mtl-tts.js "text to synthesize" [voice]
+ *
+ * Examples:
+ *   bare examples/supertonic-mtl-tts.js "Hello from supertonic multilingual."
+ *   bare examples/supertonic-mtl-tts.js "Hola desde supertonic." F1
+ *   bare examples/supertonic-mtl-tts.js "Bonjour tout le monde." M1
+ *
+ * Expects the multilingual Supertonic GGUF at:
+ *   models/supertonic2.gguf
+ *
+ * Convert with `npm run setup-models` (which now produces both
+ * supertonic.gguf for English and supertonic2.gguf for multilingual).
+ * For a back-to-back sweep across the tier-1 set see
+ * supertonic-mtl-sweep-tts.js; for the simpler English-pinned entry
+ * point see supertonic-tts.js.
+ *
+ * NOTE: Supertonic is CPU-only in tts-cpp today. This example sets
+ * useGPU=false explicitly to match.
+ */
+
+const fs = require('bare-fs')
+const path = require('bare-path')
+const { detectOne, detectMultiple } = require('@qvac/langdetect-text')
+const TTSGgml = require('../')
+const { createWav } = require('./wav-helper')
+const { setLogger, releaseLogger } = require('../addonLogging')
+
+const SUPERTONIC_SAMPLE_RATE = 44100
+
+const SUPPORTED_SUPERTONIC_LANGUAGES = new Set([
+  'en', 'ko', 'es', 'pt', 'fr'
+])
+
+const argv = global.Bare ? global.Bare.argv : process.argv
+const textArg = argv[2]
+const voiceArg = argv[3]
+
+if (!textArg || typeof textArg !== 'string' || textArg.trim().length === 0) {
+  // Show a real placeholder instead of an empty pair of quotes.
+  console.error('Usage: supertonic-mtl-tts.js "text to synthesize" [voice]')
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+const pkgRoot = path.join(__dirname, '..')
+const modelDir = path.join(pkgRoot, 'models')
+const supertonicModel = path.join(modelDir, 'supertonic2.gguf')
+
+if (!fs.existsSync(supertonicModel)) {
+  console.error(`Missing model file: ${supertonicModel}`)
+  console.error('Run "npm run setup-models" (or "bash scripts/convert-models.sh -t supertonic-mtl") to convert the Supertone Supertonic-2 ONNX bundle to GGUF.')
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+/**
+ * Picks the TTS language for `text`.
+ *
+ * Returns `{ code, detectedCode, detectedName, fallbackReason }` where
+ * `code` is the language to synthesize with, `detectedCode` and
+ * `detectedName` describe the top-1 detection ('und' / 'unknown' when
+ * the detector gave no string), and `fallbackReason` is '' when the
+ * top-1 detection was used directly.
+ */
+function selectLanguage (text) {
+  const detected = detectOne(text) || {}
+  const rawCode = typeof detected.code === 'string' ? detected.code.toLowerCase() : 'und'
+  const detectedName = typeof detected.language === 'string' ? detected.language : 'unknown'
+
+  if (SUPPORTED_SUPERTONIC_LANGUAGES.has(rawCode)) {
+    return { code: rawCode, detectedCode: rawCode, detectedName, fallbackReason: '' }
+  }
+
+  // Top-1 wasn't supported. tinyld often misclassifies short Romance
+  // sentences with brand terms as Latin / Undetermined — scan the
+  // top-K and pick the highest-ranked supported candidate before
+  // surrendering to English.
+  let topK = []
+  // Deliberately best-effort: a detector failure just means no candidates.
+  try { topK = detectMultiple(text, 5) || [] } catch (_e) {}
+  for (const c of topK) {
+    const code = typeof c.code === 'string' ? c.code.toLowerCase() : ''
+    if (SUPPORTED_SUPERTONIC_LANGUAGES.has(code)) {
+      return {
+        code,
+        detectedCode: rawCode,
+        detectedName,
+        fallbackReason: `top-1 "${rawCode}" not in tier-1 set; using highest-ranked supported candidate "${code}"`
+      }
+    }
+  }
+
+  const fallbackReason = rawCode === 'und'
+    ? 'language detection was undetermined and no supported candidate found; falling back to English'
+    : `language "${rawCode}" is not in the Supertonic tier-1 set (en/ko/es/pt/fr) and no supported candidate found; falling back to English`
+
+  return { code: 'en', detectedCode: rawCode, detectedName, fallbackReason }
+}
+
+/**
+ * Detects the language of `textArg`, synthesizes it in one batch run
+ * and writes `supertonic-mtl-<code>.wav` next to this script.
+ *
+ * Processing errors are logged and re-thrown; the model is always
+ * unloaded and the logger released in the `finally` block.
+ */
+async function main () {
+  // Only ERROR (0) and WARNING (1) native log lines are printed.
+  setLogger((priority, message) => {
+    if (priority > 1) return
+    const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' }
+    const name = names[priority] || 'UNKNOWN'
+    console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`)
+  })
+
+  const selection = selectLanguage(textArg)
+  const voice = voiceArg || 'F1'
+  const outputFile = path.join(__dirname, `supertonic-mtl-${selection.code}.wav`)
+
+  console.log(`Input text: "${textArg}"`)
+  console.log(`Detected language: ${selection.detectedName} (${selection.detectedCode})`)
+  console.log(`Effective TTS language: ${selection.code}`)
+  if (selection.fallbackReason) {
+    console.warn(`Language fallback: ${selection.fallbackReason}`)
+  }
+  console.log(`Voice: ${voice}`)
+  console.log(`Output file: ${outputFile}\n`)
+
+  const model = new TTSGgml({
+    engine: TTSGgml.ENGINE_SUPERTONIC,
+    files: { supertonicModel },
+    voice,
+    config: { language: selection.code, useGPU: false },
+    logger: console,
+    opts: { stats: true }
+  })
+
+  try {
+    console.log('Loading Supertonic MTL TTS model...')
+    await model.load()
+    console.log('Model loaded.')
+
+    console.log(`Running TTS on: "${textArg}" (voice=${voice})`)
+    const response = await model.run({ input: textArg, type: 'text' })
+
+    // Samples are appended in place; concat() per update would re-copy
+    // everything received so far.
+    const buffer = []
+    await response
+      .onUpdate(data => {
+        if (data && data.outputArray) {
+          for (const sample of data.outputArray) buffer.push(sample)
+        }
+      })
+      .await()
+
+    console.log('TTS finished!')
+    if (response.stats) {
+      const s = response.stats
+      // Optional chaining: a missing stats field must not throw here,
+      // otherwise the WAV below is never written.
+      console.log(`Inference stats: totalTime=${s.totalTime?.toFixed(2)}s, tokensPerSecond=${s.tokensPerSecond?.toFixed(2)}, realTimeFactor=${s.realTimeFactor?.toFixed(3)}, audioDuration=${s.audioDurationMs}ms, totalSamples=${s.totalSamples}`)
+    }
+
+    console.log('\nWriting to .wav file...')
+    createWav(buffer, SUPERTONIC_SAMPLE_RATE, outputFile)
+    console.log(`Finished writing to ${outputFile}`)
+  } catch (err) {
+    console.error('Error during TTS processing:', err)
+    throw err
+  } finally {
+    console.log('Unloading model...')
+    await model.unload()
+    console.log('Model unloaded.')
+    releaseLogger()
+  }
+}
+
+main().catch(err => {
+  console.error(err)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+})
diff --git a/packages/tts-ggml/examples/supertonic-sentence-stream-tts.js b/packages/tts-ggml/examples/supertonic-sentence-stream-tts.js
new file mode 100644
index 0000000000..0817e8a9f2
--- /dev/null
+++ b/packages/tts-ggml/examples/supertonic-sentence-stream-tts.js
@@ -0,0 +1,175 @@
+'use strict'
+
+/**
+ * Supertonic — sentence-granularity streaming.
+ *
+ * Streams *sentences in* (async iterator) and emits *one audio chunk
+ * per sentence out* via `runStreaming`. Same engine-agnostic JS-layer
+ * orchestrator that chatterbox-sentence-stream-tts.js uses; the addon
+ * dispatches each `runJob` call to whichever engine the model was
+ * constructed with.
+ *
+ * Sub-sentence native streaming (`streamChunkTokens`) is Chatterbox-
+ * only at the C++ engine level; the constructor rejects those knobs
+ * for Supertonic with a clear error. Use this sentence-level path
+ * for low-latency Supertonic streaming.
+ *
+ * Usage:
+ *   bare examples/supertonic-sentence-stream-tts.js [voice]
+ *
+ * Expects the Supertonic GGUF at:
+ *   models/supertonic.gguf
+ *
+ * NOTE: Supertonic is CPU-only in tts-cpp today; this example sets
+ * useGPU=false explicitly. See supertonic-tts.js for the full
+ * limitation context.
+ */
+
+const fs = require('bare-fs')
+const path = require('bare-path')
+const TTSGgml = require('../')
+const { createWav } = require('./wav-helper')
+const { setLogger, releaseLogger } = require('../addonLogging')
+const { canPlayPcmChunks, createStreamingPlayer } = require('./pcm-chunk-player')
+
+const SUPERTONIC_SAMPLE_RATE = 44100
+const BETWEEN_SENTENCE_MS = 200
+
+/** Resolves after `ms` milliseconds. */
+function delay (ms) {
+  return new Promise(resolve => setTimeout(resolve, ms))
+}
+
+const argv = global.Bare ? global.Bare.argv : process.argv
+const voiceArg = argv[2]
+
+const pkgRoot = path.join(__dirname, '..')
+const modelDir = path.join(pkgRoot, 'models')
+const supertonicModel = path.join(modelDir, 'supertonic.gguf')
+
+if (!fs.existsSync(supertonicModel)) {
+  console.error(`Missing model file: ${supertonicModel}`)
+  console.error('Run "npm run setup-models" to set up the venv and convert the Supertone Supertonic-2 ONNX bundle to GGUF.')
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+}
+
+/**
+ * Streams three fixed sentences into `model.runStreaming`, forwards each
+ * synthesized PCM update to a streaming player when one is available,
+ * and writes the concatenated audio to a WAV file next to this script.
+ *
+ * Processing errors are logged and re-thrown; the model is always
+ * unloaded and the logger released in the `finally` block.
+ */
+async function main () {
+  // Only ERROR (0) and WARNING (1) native log lines are printed.
+  setLogger((priority, message) => {
+    if (priority > 1) return
+    const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' }
+    const name = names[priority] || 'UNKNOWN'
+    console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`)
+  })
+
+  const sentences = [
+    'First sentence of the supertonic stream.',
+    'The second arrives after a short pause.',
+    'Audio output streams in chunks on each update, one chunk per sentence.'
+  ]
+
+  console.log(`Sentence-by-sentence input (${sentences.length} sentences), streaming PCM output.\n`)
+
+  const model = new TTSGgml({
+    engine: TTSGgml.ENGINE_SUPERTONIC,
+    files: { supertonicModel },
+    voice: voiceArg || 'F1',
+    config: { language: 'en', useGPU: false },
+    logger: console,
+    opts: { stats: true }
+  })
+
+  const outputFile = path.join(__dirname, 'supertonic-sentence-stream-output.wav')
+
+  try {
+    console.log('Loading Supertonic TTS model...')
+    await model.load()
+    console.log('Model loaded.')
+
+    const player = canPlayPcmChunks()
+      ? createStreamingPlayer({ sampleRate: SUPERTONIC_SAMPLE_RATE })
+      : null
+    if (player) {
+      console.log(`Streaming playback via ${player.backend}: chunks flow to stdin as they arrive.`)
+    } else {
+      console.warn(
+        'No supported player found (install ffmpeg / sox / alsa-utils). Chunks will be logged only.'
+      )
+    }
+
+    // Yields the sentences one at a time, pausing between them.
+    async function * sentencesOverTime () {
+      for (let i = 0; i < sentences.length; i++) {
+        if (i > 0) {
+          await delay(BETWEEN_SENTENCE_MS)
+        }
+        const s = sentences[i]
+        const preview = s.length > 60 ? `${s.slice(0, 60)}…` : s
+        console.log(`[stream in] sentence ${i}: "${preview}"`)
+        yield s
+      }
+    }
+
+    // All PCM received so far, kept for the final WAV. Samples are
+    // appended in place: rebuilding the array with concat() on every
+    // update would copy all previously received audio again.
+    const pcmConcat = []
+    let chunkCount = 0
+
+    const response = await model.runStreaming(sentencesOverTime(), {
+      flushAfterMs: 500
+    })
+
+    await response
+      .onUpdate(data => {
+        if (data && data.outputArray) {
+          const samples = Array.from(data.outputArray)
+          for (const sample of samples) pcmConcat.push(sample)
+          chunkCount += 1
+
+          const idx = data.chunkIndex
+          const preview =
+            typeof data.sentenceChunk === 'string'
+              ? data.sentenceChunk.slice(0, 80).replace(/\s+/g, ' ')
+              : ''
+          if (idx !== undefined) {
+            console.log(
+              `[stream out] synthesis ${idx}: ${samples.length} samples; sentence: "${preview}${preview.length >= 80 ? '…' : ''}"`
+            )
+          } else {
+            console.log(`Audio update: ${samples.length} samples (no chunk metadata)`)
+          }
+
+          if (player) player.write(samples)
+        }
+      })
+      .await()
+
+    console.log(`Inference finished! (${chunkCount} synthesis chunk(s))`)
+    if (player) {
+      console.log('Waiting for playback to finish...')
+      await player.end()
+      console.log('Playback finished!')
+    }
+
+    if (response.stats) {
+      const s = response.stats
+      console.log(`Inference stats: totalTime=${s.totalTime?.toFixed(2)}s, tokensPerSecond=${s.tokensPerSecond?.toFixed(2)}, realTimeFactor=${s.realTimeFactor?.toFixed(3)}, audioDuration=${s.audioDurationMs}ms, totalSamples=${s.totalSamples}`)
+    }
+
+    if (pcmConcat.length > 0) {
+      console.log(`\nWriting concatenated PCM to ${outputFile}`)
+      createWav(pcmConcat, SUPERTONIC_SAMPLE_RATE, outputFile)
+      console.log('Done.')
+    }
+  } catch (err) {
+    console.error('Error during TTS processing:', err)
+    throw err
+  } finally {
+    console.log('Unloading model...')
+    await model.unload()
+    console.log('Model unloaded.')
+    releaseLogger()
+  }
+}
+
+main().catch(err => {
+  console.error(err)
+  if (global.Bare) global.Bare.exit(1)
+  else process.exit(1)
+})
diff --git a/packages/tts-ggml/examples/supertonic-tts.js b/packages/tts-ggml/examples/supertonic-tts.js
new file mode 100644
index 0000000000..bc46c14d81
--- /dev/null
+++ b/packages/tts-ggml/examples/supertonic-tts.js
@@ -0,0 +1,129 @@
+'use strict'
+
+/**
+ * Supertonic TTS batch synthesis for @qvac/tts-ggml.
+ *
+ * Loads the English-only Supertone/supertonic GGUF and synthesizes a
+ * single utterance. Supertonic is a very fast, batch-only engine that
+ * emits native 44.1 kHz audio (no reference-audio voice cloning; voices
+ * are baked into the model under names like 'F1', 'F2', 'M1' ... — see
+ * the GGUF metadata for the full list).
+ * + * For multilingual synthesis (en/ko/es/pt/fr) load the Supertonic-2 + * GGUF instead, via supertonic-mtl-tts.js (auto language detect) or + * supertonic-mtl-sweep-tts.js (back-to-back sweep). + * + * Usage: + * bare examples/supertonic-tts.js "text to synthesize" [voice] + * + * Examples: + * bare examples/supertonic-tts.js "Hello from supertonic" + * bare examples/supertonic-tts.js "Hello there" M1 + * + * Expects the English Supertonic GGUF at: + * models/supertonic.gguf + * + * Convert with `npm run setup-models` (or + * `bash scripts/convert-models.sh -t supertonic-en`); the Python + * pipeline pulls Supertone/supertonic from Hugging Face and packs the + * ONNX bundle into a single .gguf via + * scripts/convert-supertonic2-to-gguf.py --arch supertonic. + * + * NOTE: Supertonic is CPU-only in tts-cpp today (engine docstring at + * include/tts-cpp/supertonic/engine.h: "CPU only today"). Passing + * useGPU=true throws at construction with a message pointing at the + * limitation; the example explicitly sets useGPU=false. Chatterbox + * (turbo + MTL) keeps GPU enabled by default. + */ + +const fs = require('bare-fs') +const path = require('bare-path') +const TTSGgml = require('../') +const { createWav } = require('./wav-helper') +const { setLogger, releaseLogger } = require('../addonLogging') + +const SUPERTONIC_SAMPLE_RATE = 44100 + +const argv = global.Bare ? 
global.Bare.argv : process.argv +const textArg = argv[2] +const voiceArg = argv[3] + +if (!textArg || typeof textArg !== 'string' || textArg.trim().length === 0) { + console.error('Usage: supertonic-tts.js "" [voice]') + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +} + +const pkgRoot = path.join(__dirname, '..') +const modelDir = path.join(pkgRoot, 'models') +const supertonicModel = path.join(modelDir, 'supertonic.gguf') + +if (!fs.existsSync(supertonicModel)) { + console.error(`Missing model file: ${supertonicModel}`) + console.error('Run "npm run setup-models" to set up the venv and convert the Supertone Supertonic-2 ONNX bundle to GGUF.') + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +} + +async function main () { + setLogger((priority, message) => { + if (priority > 1) return + const names = { 0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG', 4: 'OFF' } + const name = names[priority] || 'UNKNOWN' + console.log(`[${new Date().toISOString()}] [C++ log] [${name}]: ${message}`) + }) + + const outputFile = path.join(__dirname, 'supertonic-output.wav') + + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel }, + voice: voiceArg || 'F1', + config: { language: 'en', useGPU: false }, + logger: console, + opts: { stats: true } + }) + + try { + console.log('Loading Supertonic TTS model...') + await model.load() + console.log('Model loaded.') + + console.log(`Running TTS on: "${textArg}" (voice=${voiceArg || 'F1'})`) + + const response = await model.run({ input: textArg, type: 'text' }) + + let buffer = [] + await response + .onUpdate(data => { + if (data && data.outputArray) { + buffer = buffer.concat(Array.from(data.outputArray)) + } + }) + .await() + + console.log('TTS finished!') + if (response.stats) { + const s = response.stats + console.log(`Inference stats: totalTime=${s.totalTime.toFixed(2)}s, tokensPerSecond=${s.tokensPerSecond.toFixed(2)}, realTimeFactor=${s.realTimeFactor.toFixed(3)}, 
audioDuration=${s.audioDurationMs}ms, totalSamples=${s.totalSamples}`) + } + + console.log('\nWriting to .wav file...') + createWav(buffer, SUPERTONIC_SAMPLE_RATE, outputFile) + console.log(`Finished writing to ${outputFile}`) + } catch (err) { + console.error('Error during TTS processing:', err) + throw err + } finally { + console.log('Unloading model...') + await model.unload() + console.log('Model unloaded.') + releaseLogger() + } +} + +main().catch(err => { + console.error(err) + if (global.Bare) global.Bare.exit(1) + else process.exit(1) +}) diff --git a/packages/tts-ggml/examples/wav-helper.js b/packages/tts-ggml/examples/wav-helper.js new file mode 100644 index 0000000000..809f254bd4 --- /dev/null +++ b/packages/tts-ggml/examples/wav-helper.js @@ -0,0 +1,194 @@ +const fs = require('bare-fs') + +// Read a WAV file and return { samples, sampleRate, numChannels }; samples is a mono Float32Array nominally in [-1, 1]. 
+// Supports 8/16/24-bit PCM and 32-bit IEEE float; multi-channel input keeps only the first (left) channel of each frame. +function readWavAsFloat32 (wavPath) { + const buf = fs.readFileSync(wavPath) + if (buf.length < 44) throw new Error('WAV file too small') + + // Create DataView - handle both Buffer and Uint8Array + let arrayBuffer, byteOffset + if (buf.buffer && buf.byteOffset !== undefined) { + arrayBuffer = buf.buffer + byteOffset = buf.byteOffset + } else { + // Copy to ArrayBuffer if needed + arrayBuffer = new ArrayBuffer(buf.length) + new Uint8Array(arrayBuffer).set(buf) + byteOffset = 0 + } + const view = new DataView(arrayBuffer, byteOffset, buf.length) + + // Verify RIFF header + const riff = String.fromCharCode(buf[0], buf[1], buf[2], buf[3]) + const wave = String.fromCharCode(buf[8], buf[9], buf[10], buf[11]) + if (riff !== 'RIFF') throw new Error('Not a RIFF file') + if (wave !== 'WAVE') throw new Error('Not WAVE format') + + // Parse chunks + let fmtChunk = null + let dataChunk = null + let offset = 12 + + while (offset + 8 <= buf.length) { + const chunkId = String.fromCharCode(buf[offset], buf[offset + 1], 
buf[offset + 2], buf[offset + 3]) + const chunkSize = view.getUint32(offset + 4, true) + + if (chunkId === 'fmt ') { + fmtChunk = { offset: offset + 8, size: chunkSize } + } else if (chunkId === 'data') { + dataChunk = { offset: offset + 8, size: chunkSize } + } + + // Move to next chunk (chunks are word-aligned, so add padding if size is odd) + offset += 8 + chunkSize + if (chunkSize % 2 === 1 && offset < buf.length) { + offset += 1 // padding byte + } + } + + if (!fmtChunk) throw new Error('WAV missing fmt chunk') + if (!dataChunk) throw new Error('WAV missing data chunk') + + // Parse fmt chunk + const fmtOff = fmtChunk.offset + if (fmtOff + 16 > buf.length) throw new Error('fmt chunk truncated') + + const audioFormat = view.getUint16(fmtOff, true) + const numChannels = view.getUint16(fmtOff + 2, true) + const sampleRate = view.getUint32(fmtOff + 4, true) + const bitsPerSample = view.getUint16(fmtOff + 14, true) + + // Validate format: 1 = PCM, 3 = IEEE float + if (audioFormat !== 1 && audioFormat !== 3) { + throw new Error('Unsupported WAV audio format: ' + audioFormat + ' (only PCM=1 and IEEE_FLOAT=3 supported)') + } + + const dataOff = dataChunk.offset + const dataLen = Math.min(dataChunk.size, buf.length - dataOff) // clamp to actual buffer + + let samples + if (audioFormat === 1 && bitsPerSample === 16) { + // 16-bit PCM + const bytesPerSample = 2 + const numSamples = Math.floor(dataLen / bytesPerSample) + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? i * 2 : i * numChannels * 2) + if (idx + 2 > buf.length) break + const s = view.getInt16(idx, true) + samples[i] = s / 32768 + } + } else if (audioFormat === 1 && bitsPerSample === 24) { + // 24-bit PCM + const bytesPerSample = 3 + const numSamples = Math.floor(dataLen / bytesPerSample) + const numFrames = numChannels === 1 ? 
numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? i * 3 : i * numChannels * 3) + if (idx + 3 > buf.length) break + // Read 24-bit signed little-endian + const lo = buf[idx] + const mid = buf[idx + 1] + const hi = buf[idx + 2] + let s = (hi << 16) | (mid << 8) | lo + if (s >= 0x800000) s -= 0x1000000 // sign extend + samples[i] = s / 8388608 + } + } else if (audioFormat === 3 && bitsPerSample === 32) { + // 32-bit IEEE float + const bytesPerSample = 4 + const numSamples = Math.floor(dataLen / bytesPerSample) + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? i * 4 : i * numChannels * 4) + if (idx + 4 > buf.length) break + samples[i] = view.getFloat32(idx, true) + } + } else if (audioFormat === 1 && bitsPerSample === 8) { + // 8-bit PCM (unsigned) + const numSamples = dataLen + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? 
i : i * numChannels) + if (idx >= buf.length) break + samples[i] = (buf[idx] - 128) / 128 + } + } else { + throw new Error('Unsupported WAV format: audioFormat=' + audioFormat + ', bitsPerSample=' + bitsPerSample) + } + + return { samples, sampleRate, numChannels } +} + +// Helper: write a little-endian integer +function writeIntLE (buffer, value, offset, byteLength) { + for (let i = 0; i < byteLength; i++) { + buffer[offset + i] = value & 0xff + value >>= 8 + } +} + +// Generate WAV file (16-bit PCM mono) +function createWav (samples, sampleRate = 16000, outputPath = 'test.wav') { + const numChannels = 1 + const bytesPerSample = 2 // 16-bit PCM + const blockAlign = numChannels * bytesPerSample + const byteRate = sampleRate * blockAlign + const dataSize = samples.length * bytesPerSample + const buffer = new Uint8Array(44 + dataSize) + + // RIFF header + buffer.set([0x52, 0x49, 0x46, 0x46], 0) // "RIFF" + writeIntLE(buffer, 36 + dataSize, 4, 4) // file size - 8 + buffer.set([0x57, 0x41, 0x56, 0x45], 8) // "WAVE" + + // fmt chunk + buffer.set([0x66, 0x6d, 0x74, 0x20], 12) // "fmt " + writeIntLE(buffer, 16, 16, 4) // Subchunk1Size + writeIntLE(buffer, 1, 20, 2) // AudioFormat = PCM + writeIntLE(buffer, numChannels, 22, 2) + writeIntLE(buffer, sampleRate, 24, 4) + writeIntLE(buffer, byteRate, 28, 4) + writeIntLE(buffer, blockAlign, 32, 2) + writeIntLE(buffer, bytesPerSample * 8, 34, 2) // bits per sample + + // data chunk + buffer.set([0x64, 0x61, 0x74, 0x61], 36) // "data" + writeIntLE(buffer, dataSize, 40, 4) + + // write PCM samples - samples are already int16 values from the TTS output + for (let i = 0; i < samples.length; i++) { + // Clamp the int16 value to valid range and write as little-endian + const sample = Math.max(-32768, Math.min(32767, samples[i])) + // Convert to unsigned for proper bit manipulation + const unsignedSample = sample < 0 ? 
sample + 65536 : sample + writeIntLE(buffer, unsignedSample, 44 + i * 2, 2) + } + + fs.writeFileSync(outputPath, buffer) +} +function resampleLinear (samples, fromRate, toRate) { + if (fromRate === toRate) return samples + const ratio = fromRate / toRate + const outputLen = Math.round(samples.length / ratio) + const output = new Float32Array(outputLen) + for (let i = 0; i < outputLen; i++) { + const srcIdx = i * ratio + const lo = Math.floor(srcIdx) + const hi = Math.min(lo + 1, samples.length - 1) + const frac = srcIdx - lo + output[i] = samples[lo] * (1 - frac) + samples[hi] * frac + } + return output +} + +module.exports = { + createWav, + readWavAsFloat32, + resampleLinear +} diff --git a/packages/tts-ggml/index.d.ts b/packages/tts-ggml/index.d.ts new file mode 100644 index 0000000000..3aa6e9a46e --- /dev/null +++ b/packages/tts-ggml/index.d.ts @@ -0,0 +1,212 @@ +import type QvacResponse from '@qvac/infer-base/src/QvacResponse' + +/** + * Model file paths for the GGML TTS backend. Engine is auto-detected + * from these fields (chatterbox vs supertonic) unless overridden via + * `TTSGgmlOptions.engine`. All paths must be absolute (passed through + * to the native layer as-is). + */ +declare interface TTSGgmlFiles { + /** + * Bundle root. For Chatterbox, expected to contain + * `chatterbox-t3-turbo.gguf` + `chatterbox-s3gen.gguf` (turbo) or + * `chatterbox-t3-mtl.gguf` + `chatterbox-s3gen-mtl.gguf` (multilingual). + * For Supertonic, expected to contain `supertonic.gguf`. + */ + modelDir?: string + /** Chatterbox T3 (text -> speech tokens) GGUF path. Overrides `modelDir`. */ + t3Model?: string + t3ModelPath?: string + t3?: string + /** Chatterbox S3Gen + HiFT (speech tokens -> 24 kHz wav) GGUF path. Overrides `modelDir`. */ + s3genModel?: string + s3genModelPath?: string + s3gen?: string + /** Supertonic single-file GGUF path. Overrides `modelDir`. 
*/ + supertonicModel?: string + supertonicModelPath?: string + supertonic?: string + /** Optional directory containing baked Chatterbox voice profiles. */ + voicesDir?: string +} + +declare interface TTSGgmlRuntimeConfig { + /** Language code; default "en". Chatterbox MTL accepts es/fr/de/pt/it/zh/ja/ko/... */ + language?: string + /** Route inference through a GPU backend (Metal / Vulkan / CUDA) if available. Chatterbox: defaults true. Supertonic: rejected at construction time (engine is CPU-only today). */ + useGPU?: boolean + /** Resample the engine's native rate (24 kHz Chatterbox, 44.1 kHz Supertonic) to this rate before emitting (8000-192000 Hz). */ + outputSampleRate?: number +} + +declare interface TTSGgmlOptions { + files?: TTSGgmlFiles + config?: TTSGgmlRuntimeConfig + logger?: object + lazySessionLoading?: boolean + /** Explicit engine selection ('chatterbox' | 'supertonic'). Auto-detected from `files` when omitted. */ + engine?: 'chatterbox' | 'supertonic' + /** Chatterbox: voice-cloning reference audio path (wav). */ + referenceAudio?: string + /** Chatterbox: directory of baked voice-conditioning tensors. */ + voiceDir?: string + /** RNG seed for CFM initial noise + SineGen excitation (Chatterbox) / vector-estimator latent (Supertonic). */ + seed?: number + /** Move N layers to the GPU backend. Chatterbox: pass 99 to move everything. Supertonic: must be 0 / unset (engine is CPU-only today). */ + nGpuLayers?: number + /** Override `std::thread::hardware_concurrency()`. */ + threads?: number + /** Chatterbox-only: speech tokens per native streaming chunk (25 ~= 1 s of audio). 0 disables. */ + streamChunkTokens?: number + /** Chatterbox-only: smaller first chunk for low first-audio-out latency. */ + streamFirstChunkTokens?: number + /** Chatterbox-only: CFM Euler step count (1 halves cost; 2 matches Python meanflow). */ + cfmSteps?: number + /** Supertonic: voice id baked into the GGUF (e.g. 'F1', 'F2', 'M1', 'M2'). 
*/ + voice?: string + /** Alias for `voice` (cross-compat with `@qvac/tts-onnx`). */ + voiceName?: string + /** Supertonic: number of vector-estimator (CFM) steps. 0 -> GGUF default. */ + steps?: number + /** Alias for `steps` (cross-compat with `@qvac/tts-onnx`). */ + numInferenceSteps?: number + /** Supertonic: speech-rate factor. 0 -> GGUF default. */ + speed?: number + /** Supertonic: optional path to a .npy initial-noise tensor (byte-exact reference reproduction). */ + noiseNpyPath?: string + opts?: object + exclusiveRun?: boolean +} + +/** + * GGML-backed TTS via the `tts-cpp` library. Wraps both + * `tts_cpp::chatterbox::Engine` and `tts_cpp::supertonic::Engine` behind + * a single engine-agnostic JS surface. Engine type is auto-detected + * from `files` (chatterbox-* gguf vs supertonic.gguf) or set explicitly + * via the `engine` option. + * + * Owns a persistent native Engine: model weights and any voice- + * conditioning tensors are loaded once at `load()` and reused across + * every `run()` / `runStream()` / `runStreaming()` call. + */ +declare class TTSGgml { + constructor(options?: TTSGgmlOptions) + + static readonly ENGINE_CHATTERBOX: 'chatterbox' + static readonly ENGINE_SUPERTONIC: 'supertonic' + + load(...args: unknown[]): Promise + unload(): Promise + destroy(): Promise + reload(newConfig?: Record): Promise + cancel(): Promise + getApiDefinition(): string + getState(): { configLoaded: boolean; weightsLoaded: boolean; destroyed: boolean } + getEngineType(): 'chatterbox' | 'supertonic' + + opts: object + exclusiveRun: boolean + logger: object + state: { configLoaded: boolean; weightsLoaded: boolean; destroyed: boolean } + addon: unknown + + /** + * Run text-to-speech. With `{ streamOutput: true }`, splits `input` into chunks and emits PCM on `onUpdate` per chunk. 
+ */ + run( + input: TTSGgml.TTSRunInput & { streamOutput: true }, + ): Promise> + + run(input: TTSGgml.TTSRunInput): Promise> + + /** + * Chunked streaming synthesis: forwards to `run({ input: text, streamOutput: true, ... })`. + */ + runStream( + text: string, + options?: TTSGgml.SentenceStreamOptions, + ): Promise> + + /** + * Streaming text in, streaming audio out. Each flushed string is one native job; PCM on `onUpdate`. + * For `AsyncIterable` inputs, `accumulateSentences` defaults true (coalesce small streamed fragments). + */ + runStreaming( + textStream: TTSGgml.TextStreamInput, + options?: TTSGgml.RunStreamingOptions, + ): Promise> +} + +declare namespace TTSGgml { + export interface RuntimeStats { + totalTime: number + tokensPerSecond: number + realTimeFactor: number + audioDurationMs: number + totalSamples: number + /** Active compute device after the load-time backend cascade. 0 = CPU, 1 = GPU. */ + backendDevice?: number + /** Stable numeric code for the active backend. 0=CPU, 1=Metal, 2=CUDA, 3=Vulkan, 4=OpenCL, 99=other-GPU. */ + backendId?: number + } + + export interface TTSOutputChunk { + outputArray: ArrayBuffer + /** Native engine sample rate (24000 for Chatterbox, 44100 for Supertonic). */ + sampleRate?: number + } + + export interface SentenceStreamChunkMeta { + chunkIndex?: number + sentenceChunk?: string + /** True on the final chunk of a pre-chunked synthesis (`runStream` / `run({ streamOutput: true })`). Undefined for async-iterator streaming where the count isn't known up-front. */ + isLast?: boolean + } + + export interface SentenceStreamOptions { + /** BCP-47 locale for Intl.Segmenter when available. */ + locale?: string + /** Max graphemes per chunk (defaults: 300, or 120 when language is ko). */ + maxChunkScalars?: number + } + + /** Input accepted by `runStreaming`. 
*/ + export type TextStreamInput = + | string + | string[] + | Iterable + | AsyncIterable + + export interface RunStreamingOptions { + accumulateSentences?: boolean + sentenceDelimiter?: RegExp + sentenceDelimiterPreset?: 'latin' | 'cjk' | 'multilingual' + maxBufferScalars?: number + flushAfterMs?: number + } + + export type TTSRunInput = { + type?: string + input: string + streamOutput?: boolean + locale?: string + maxChunkScalars?: number + outputSampleRate?: number + } + + export { + TTSGgml as default, + TTSGgmlFiles, + TTSGgmlOptions, + TTSGgmlRuntimeConfig, + RuntimeStats, + SentenceStreamChunkMeta, + SentenceStreamOptions, + RunStreamingOptions, + TextStreamInput, + TTSOutputChunk, + TTSRunInput + } +} + +export = TTSGgml diff --git a/packages/tts-ggml/index.js b/packages/tts-ggml/index.js new file mode 100644 index 0000000000..639b92646e --- /dev/null +++ b/packages/tts-ggml/index.js @@ -0,0 +1,967 @@ +'use strict' + +const { platform } = require('bare-os') +const path = require('bare-path') +const fs = require('bare-fs') +const QvacLogger = require('@qvac/logging') +const { + createJobHandler, + exclusiveRunQueue, + getApiDefinition: inferGetApiDefinition +} = require('@qvac/infer-base') +const { TTSInterface } = require('./tts') +const { QvacErrorAddonTTSGgml, ERR_CODES } = require('./lib/error') +const { splitTtsText } = require('./lib/textChunker') +const { accumulateTextStream } = require('./lib/textStreamAccumulator') + +const ENGINE_CHATTERBOX = 'chatterbox' +const ENGINE_SUPERTONIC = 'supertonic' + +const CHATTERBOX_T3_TURBO = 'chatterbox-t3-turbo.gguf' +const CHATTERBOX_T3_MTL = 'chatterbox-t3-mtl.gguf' +const CHATTERBOX_S3GEN_DEFAULT = 'chatterbox-s3gen.gguf' +const CHATTERBOX_S3GEN_MTL = 'chatterbox-s3gen-mtl.gguf' +const SUPERTONIC_DEFAULT = 'supertonic.gguf' +const SUPERTONIC_MTL = 'supertonic2.gguf' + +function firstNonEmpty (...candidates) { + for (let i = 0; i < candidates.length; i++) { + const v = candidates[i] + if (v != null && v !== '') 
return v + } + return undefined +} + +function fileExistsSafe (p) { + if (!p) return false + try { + return fs.existsSync(p) + } catch (_e) { + return false + } +} + +/** + * Normalize the `files` map: collapse alias keys (e.g. `t3ModelPath`, `t3`) into one canonical key each. + * Accepts: + * - Chatterbox: explicit `t3Model`/`s3genModel`, or a `modelDir` that + * contains either the turbo (`chatterbox-t3-turbo.gguf` + + * `chatterbox-s3gen.gguf`) or multilingual + * (`chatterbox-t3-mtl.gguf` + `chatterbox-s3gen-mtl.gguf`) GGUFs. + * - Supertonic: explicit `supertonicModel`, or a `modelDir` that + * contains `supertonic.gguf` or `supertonic2.gguf`. + * + * @param {Record} files + */ +function normalizeGgmlFiles (files) { + if (files == null || typeof files !== 'object') { + return {} + } + const f = files + return { + modelDir: firstNonEmpty(f.modelDir), + t3Model: firstNonEmpty(f.t3Model, f.t3ModelPath, f.t3), + s3genModel: firstNonEmpty(f.s3genModel, f.s3genModelPath, f.s3gen), + supertonicModel: firstNonEmpty( + f.supertonicModel, + f.supertonicModelPath, + f.supertonic + ), + voicesDir: firstNonEmpty(f.voicesDir) + } +} + +/** + * Decide which engine the constructor should drive. Order of precedence: + * 1. Explicit `engine` option ('chatterbox' | 'supertonic'; any other non-empty value throws). + * 2. An explicit file path: Chatterbox (`t3Model` / `s3genModel`) is checked first, then Supertonic. + * 3. A `modelDir` probed on disk: a Chatterbox T3 GGUF wins, otherwise a Supertonic GGUF. + * 4. Default → Chatterbox (turbo or MTL is decided later inside the + * Chatterbox path resolver based on which T3 file is present). 
+ */ +function detectEngineType (engine, normalizedFiles) { + if (engine === ENGINE_CHATTERBOX || engine === ENGINE_SUPERTONIC) { + return engine + } + if (engine != null && engine !== '') { + throw new Error( + "tts-ggml: 'engine' option must be 'chatterbox' or 'supertonic' " + + "(got '" + engine + "')" + ) + } + if (normalizedFiles.t3Model || normalizedFiles.s3genModel) return ENGINE_CHATTERBOX + if (normalizedFiles.supertonicModel) return ENGINE_SUPERTONIC + if (normalizedFiles.modelDir) { + const turboT3 = path.join(normalizedFiles.modelDir, CHATTERBOX_T3_TURBO) + const mtlT3 = path.join(normalizedFiles.modelDir, CHATTERBOX_T3_MTL) + const supertonicEn = path.join(normalizedFiles.modelDir, SUPERTONIC_DEFAULT) + const supertonicMtl = path.join(normalizedFiles.modelDir, SUPERTONIC_MTL) + const hasChatterbox = fileExistsSafe(turboT3) || fileExistsSafe(mtlT3) + const hasSupertonic = fileExistsSafe(supertonicEn) || fileExistsSafe(supertonicMtl) + if (hasChatterbox) return ENGINE_CHATTERBOX + if (hasSupertonic) return ENGINE_SUPERTONIC + } + return ENGINE_CHATTERBOX +} + +/** + * Pick the right Supertonic GGUF inside `modelDir`. + * Mirrors the chatterbox resolver: prefer the English-only build when + * present (smaller, single-language), only fall back to the multilingual + * build when English isn't on disk. Callers that explicitly want the + * multilingual variant should pass `files.supertonicModel` directly. + */ +function resolveSupertonicModelDirPath (modelDir) { + const supertonicEn = path.join(modelDir, SUPERTONIC_DEFAULT) + const supertonicMtl = path.join(modelDir, SUPERTONIC_MTL) + if (fileExistsSafe(supertonicEn)) return supertonicEn + if (fileExistsSafe(supertonicMtl)) return supertonicMtl + return supertonicEn +} + +/** + * Pick the right Chatterbox T3 + S3Gen file names inside `modelDir`. + * The multilingual T3 is picked only when it exists and the turbo T3 does + * not (its S3Gen falls back to the default file if the MTL one is missing). 
+ * Otherwise fall back to the turbo English layout. + */ +function resolveChatterboxModelDirPaths (modelDir) { + const turboT3 = path.join(modelDir, CHATTERBOX_T3_TURBO) + const mtlT3 = path.join(modelDir, CHATTERBOX_T3_MTL) + const defaultS3 = path.join(modelDir, CHATTERBOX_S3GEN_DEFAULT) + const mtlS3 = path.join(modelDir, CHATTERBOX_S3GEN_MTL) + + const hasTurbo = fileExistsSafe(turboT3) + const hasMtl = fileExistsSafe(mtlT3) + if (hasMtl && !hasTurbo) { + return { + t3: mtlT3, + s3: fileExistsSafe(mtlS3) ? mtlS3 : defaultS3 + } + } + return { t3: turboT3, s3: defaultS3 } +} + +/** + * Default `accumulateSentences` for `runStreaming`: true only for native `AsyncIterable` + * (e.g. incremental text from an upstream async source), not for strings, arrays, or sync-only iterables. + * @param {unknown} textStream + * @returns {boolean} + */ +function defaultAccumulateSentencesForStreamInput (textStream) { + if (textStream == null) return false + if (typeof textStream === 'string') return false + if (Array.isArray(textStream)) return false + if (typeof textStream[Symbol.asyncIterator] === 'function') return true + return false +} + +function ttsOutputDebugString (data) { + if (!data) return '' + if (typeof data !== 'object') return data.toString() + // Skip the heavy fields (outputArray = Int16Array of 24 kHz PCM + // samples; for native chunk streaming each event carries thousands of + // samples and JSON.stringify becomes the dominant cost on the + // outputCallback fast path). Surface only the summary fields so + // logger.debug stays useful. 
+ const summary = {} + if (data.sampleRate != null) summary.sampleRate = data.sampleRate + if (data.chunkIndex != null) summary.chunkIndex = data.chunkIndex + if (data.isLast != null) summary.isLast = data.isLast + if (data.sentenceChunk != null) summary.sentenceChunk = data.sentenceChunk + if (data.outputArray && typeof data.outputArray.length === 'number') { + summary.outputArrayLen = data.outputArray.length + } + return JSON.stringify(summary) +} + +/** + * GGML-backed Chatterbox TTS (via the `tts-cpp` / qvac-tts.cpp library). + * + * Owns a persistent native engine — T3, S3Gen, and any voice-conditioning + * tensors are loaded once at `load()` and reused across every `run()` / + * `runStream()` / `runStreaming()` call. Exposes batch synthesis + * (`run({ input })`), sentence-granularity streaming (`runStreaming()` over + * an async iterator of sentences), and sub-sentence native chunk streaming + * (set `streamChunkTokens` on the constructor; the C++ Engine then emits + * PCM per chunk as it's produced). See README.md for usage. + */ +class TTSGgml { + constructor (options = {}) { + const { + files: filesInput = {}, + config = {}, + logger, + lazySessionLoading, + engine, + referenceAudio, + voiceDir, + seed, + nGpuLayers, + threads, + streamChunkTokens, + streamFirstChunkTokens, + cfmSteps, + voice, + voiceName, + steps, + numInferenceSteps, + speed, + noiseNpyPath, + opts, + exclusiveRun + } = options + + this.opts = opts || {} + this.exclusiveRun = !!exclusiveRun + this.logger = new QvacLogger(logger) + this.state = { + configLoaded: false, + weightsLoaded: false, + destroyed: false + } + this.addon = null + this._sentenceStreamCtx = null + /** Serializes `run({ streamOutput: true })`, `runStream`, and `runStreaming` until each response settles (Whisper-style). */ + this._ttsInferenceQueueWaiter = Promise.resolve() + this._job = createJobHandler({ + cancel: () => { + const a = this.addon + return a ? 
a.cancel() : undefined + } + }) + this._runExclusive = this.exclusiveRun + ? exclusiveRunQueue() + : async function runNow (fn) { + return fn() + } + + const normalizedFiles = normalizeGgmlFiles(filesInput) + this._config = { ...config } + + this._lazySessionLoading = lazySessionLoading != null + ? lazySessionLoading + : (platform() === 'ios' || platform() === 'android') + + const outputSampleRate = this._config.outputSampleRate + if (outputSampleRate != null && (outputSampleRate < 8000 || outputSampleRate > 192000)) { + throw new Error('outputSampleRate must be between 8000 and 192000, got ' + outputSampleRate) + } + this._outputSampleRate = outputSampleRate || null + + this._engineType = detectEngineType(engine, normalizedFiles) + this._voicesDir = normalizedFiles.voicesDir + + if (this._engineType === ENGINE_SUPERTONIC) { + const root = normalizedFiles.modelDir + this._supertonicModelPath = firstNonEmpty( + normalizedFiles.supertonicModel, + root ? resolveSupertonicModelDirPath(root) : undefined + ) + this._t3ModelPath = undefined + this._s3genModelPath = undefined + } else { + const root = normalizedFiles.modelDir + if (root) { + const resolved = resolveChatterboxModelDirPaths(root) + this._t3ModelPath = firstNonEmpty( + normalizedFiles.t3Model, + resolved.t3 + ) + this._s3genModelPath = firstNonEmpty( + normalizedFiles.s3genModel, + resolved.s3 + ) + } else { + this._t3ModelPath = normalizedFiles.t3Model + this._s3genModelPath = normalizedFiles.s3genModel + } + this._supertonicModelPath = undefined + } + + this._referenceAudio = referenceAudio + this._voiceDir = voiceDir + this._seed = seed + this._nGpuLayers = nGpuLayers + this._threads = threads + this._streamChunkTokens = streamChunkTokens + this._streamFirstChunkTokens = streamFirstChunkTokens + this._cfmSteps = cfmSteps + this._voice = firstNonEmpty(voice, voiceName) + this._steps = firstNonEmpty(steps, numInferenceSteps) + this._speed = speed + this._noiseNpyPath = noiseNpyPath + + // Run the conflict 
check before any engine-specific GPU policy so a + // caller passing { useGPU:false, nGpuLayers:99 } gets the precise + // conflict message instead of, e.g., the Supertonic "GPU not + // supported" branch firing on `nGpuLayers > 0` and confusing them. + // `layers != 0` (rather than `layers > 0`) so a future llama.cpp- + // style `nGpuLayers: -1` ("offload all layers") doesn't falsely + // pass through as "wants CPU" against an explicit useGPU:true. + if ( + typeof this._config.useGPU === 'boolean' && + this._nGpuLayers != null + ) { + const layersWantGpu = this._nGpuLayers !== 0 + if (this._config.useGPU !== layersWantGpu) { + throw new Error( + 'tts-ggml: useGPU=' + this._config.useGPU + + ' conflicts with nGpuLayers=' + this._nGpuLayers + '. ' + + 'Either drop one of the two, or make them agree ' + + '(useGPU:true + nGpuLayers!=0, or useGPU:false + nGpuLayers=0).' + ) + } + } + + if (this._engineType === ENGINE_SUPERTONIC) { + if (this._streamChunkTokens != null || this._streamFirstChunkTokens != null) { + throw new Error( + 'tts-ggml: streamChunkTokens / streamFirstChunkTokens are Chatterbox-only ' + + 'options (sub-sentence native streaming via the chatterbox::Engine ' + + 'streaming chunked S3Gen+HiFT loop). Supertonic does not support sub-' + + 'sentence native streaming; use sentence-level streaming via the engine-' + + 'agnostic runStream() / runStreaming() / run({ streamOutput: true }) APIs.' + ) + } + const wantsGpu = + this._config.useGPU === true || + (this._nGpuLayers != null && this._nGpuLayers !== 0) + if (wantsGpu) { + throw new Error( + 'tts-ggml: GPU execution is not supported by the Supertonic engine yet ' + + '(see tts-cpp include/tts-cpp/supertonic/engine.h: "CPU only today"). ' + + 'GPU output is currently silently wrong (~4x quieter, slightly truncated) ' + + 'because the Vulkan path of the supertonic vector-estimator + vocoder is ' + + 'not yet validated. 
Pass config: { useGPU: false } (and leave nGpuLayers ' + + 'unset, or set it to 0) when constructing a Supertonic model. ' + + 'Chatterbox engine remains GPU-enabled by default.' + ) + } + if (this._config.useGPU === undefined) { + this._config.useGPU = false + } + } else if (this._config.useGPU === undefined && this._nGpuLayers == null) { + this._config.useGPU = true + } + } + + getEngineType () { + return this._engineType + } + + getApiDefinition () { + const api = inferGetApiDefinition() + this.logger.debug( + `Using API definition: ${api} for platform: ${platform()}` + ) + return api + } + + getState () { + return this.state + } + + async load (..._args) { + if (this.state.destroyed) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_LOAD, + adds: 'instance was destroyed' + }) + } + if (this.state.configLoaded || this.state.weightsLoaded) { + this.logger.info('Reload requested - unloading existing model first') + await this.unload() + } + await this._load() + this.state.configLoaded = true + this.state.weightsLoaded = true + } + + /** + * Run text-to-speech. Set `streamOutput: true` to split `input` into sentence + * chunks and emit PCM on `response.onUpdate` as each chunk completes (same + * behavior as `runStream`). 
+ * + * @param {Object} input + * @param {string} input.input - Text to synthesize + * @param {boolean} [input.streamOutput=false] - Chunked streaming output + * @param {string} [input.locale] - BCP-47 locale for chunking when `streamOutput` + * @param {number} [input.maxChunkScalars] - Max graphemes per chunk when `streamOutput` + */ + async run (input) { + if (input && typeof input === 'object' && input.streamOutput === true) { + if (typeof input.input !== 'string' || input.input.trim().length === 0) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_APPEND, + adds: 'run with streamOutput: non-empty string `input` is required' + }) + } + const streamOpts = { + locale: input.locale, + maxChunkScalars: input.maxChunkScalars + } + if (this.exclusiveRun) { + return await this._enqueueExclusiveTtsResponse(() => + this._runStreamOrchestrator(input.input, streamOpts) + ) + } + return this._runStreamOrchestrator(input.input, streamOpts) + } + return this._runExclusive(() => this._runInternal(input)) + } + + /** + * Serialize streaming runs until the returned {@link QvacResponse} settles. + */ + async _enqueueExclusiveTtsResponse (runFn) { + const prev = this._ttsInferenceQueueWaiter || Promise.resolve() + let releaseSlot + this._ttsInferenceQueueWaiter = new Promise(resolve => { + releaseSlot = resolve + }) + await prev + let response + try { + response = await runFn() + } catch (err) { + releaseSlot() + throw err + } + response.await().finally(() => { releaseSlot() }).catch(() => {}) + return response + } + + /** + * Chunk long text by sentence (see {@link splitTtsText}), synthesize each chunk + * in order, and emit PCM on `response.onUpdate` as each chunk completes. + * Equivalent to `run({ input: text, streamOutput: true, ...options })`. + * + * @param {string} text + * @param {{ locale?: string, maxChunkScalars?: number }} [options] + */ + async runStream (text, options = {}) { + const opts = options == null || typeof options !== 'object' ? 
{} : options + return this.run({ + input: text, + streamOutput: true, + locale: opts.locale, + maxChunkScalars: opts.maxChunkScalars + }) + } + + /** + * Streaming input + streaming output: each flushed string is one synthesis job; + * PCM is emitted on `response.onUpdate` per job. Same chunk metadata shape as + * `runStream`. + * + * For **AsyncIterable** inputs, **`accumulateSentences` defaults to true**: + * fragments are concatenated until a sentence end (see + * `sentenceDelimiterPreset`), max buffer size (`maxBufferScalars`), or + * `flushAfterMs` idle after the last fragment. Strings and arrays default to + * one job per yield (`accumulateSentences` false). + * + * @param {AsyncIterable|Iterable|string} textStream + * @param {Object} [options] + * @param {boolean} [options.accumulateSentences] - Default: true for `AsyncIterable` inputs only. + * @param {'latin'|'cjk'|'multilingual'} [options.sentenceDelimiterPreset] + * @param {RegExp} [options.sentenceDelimiter] - Overrides preset when set (tested against full buffer). + * @param {number} [options.maxBufferScalars] - Max graphemes before hard flush (default by language). + * @param {number} [options.flushAfterMs] - Idle flush after last fragment (default 500). 
+ */ + async runStreaming (textStream, options = {}) { + const streamOpts = this._resolveRunStreamingOptions(textStream, options) + let normalized = this._normalizeTextStream(textStream) + if (streamOpts.accumulateSentences) { + normalized = accumulateTextStream(normalized, { + sentenceDelimiterPreset: streamOpts.sentenceDelimiterPreset, + maxBufferScalars: streamOpts.maxBufferScalars, + flushAfterMs: streamOpts.flushAfterMs, + sentenceDelimiter: streamOpts.sentenceDelimiter, + language: this._config?.language + }) + } + if (this.exclusiveRun) { + return await this._enqueueExclusiveTtsResponse(() => + this._runTextStreamOrchestrator(normalized) + ) + } + return this._runTextStreamOrchestrator(normalized) + } + + _resolveRunStreamingOptions (textStream, options) { + const o = options == null || typeof options !== 'object' ? {} : options + let accumulateSentences = o.accumulateSentences + if (accumulateSentences === undefined) { + accumulateSentences = defaultAccumulateSentencesForStreamInput(textStream) + } + const rawPreset = o.sentenceDelimiterPreset + const sentenceDelimiterPreset = + rawPreset === 'latin' || rawPreset === 'cjk' || rawPreset === 'multilingual' + ? rawPreset + : 'multilingual' + const maxBufferScalars = o.maxBufferScalars + const flushAfterMs = o.flushAfterMs != null ? o.flushAfterMs : 500 + const sentenceDelimiter = + o.sentenceDelimiter instanceof RegExp ? 
o.sentenceDelimiter : undefined + return { + accumulateSentences: !!accumulateSentences, + sentenceDelimiterPreset, + maxBufferScalars, + flushAfterMs, + sentenceDelimiter + } + } + + _normalizeTextStream (textStream) { + if (textStream == null) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_APPEND, + adds: 'runStreaming: text stream is required' + }) + } + if (typeof textStream === 'string') { + async function * oneString () { + yield textStream + } + return oneString() + } + if (typeof textStream[Symbol.asyncIterator] === 'function') { + return textStream + } + if (Array.isArray(textStream)) { + async function * fromArray () { + for (let i = 0; i < textStream.length; i++) { + yield textStream[i] + } + } + return fromArray() + } + if (typeof textStream[Symbol.iterator] === 'function') { + async function * fromIterable () { + for (const x of textStream) { + yield x + } + } + return fromIterable() + } + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_APPEND, + adds: 'runStreaming: expected string, array of strings, Iterable, or AsyncIterable' + }) + } + + _runTextStreamOrchestrator (asyncTextSource) { + const response = this._job.start() + this._sentenceStreamCtx = { + textStreamMode: true, + asyncTextSource, + chunks: [], + chunkIdx: 0, + acc: { + totalTime: 0, + audioDurationMs: 0, + totalSamples: 0 + }, + chunkResolver: null + } + + this._sentenceStreamTextIterableDrive().catch((err) => { + if (this._sentenceStreamCtx && this._sentenceStreamCtx.chunkResolver) { + const rej = this._sentenceStreamCtx.chunkResolver.reject + this._sentenceStreamCtx.chunkResolver = null + rej(err) + } + this._sentenceStreamCtx = null + this._job.fail(err) + }) + + return response + } + + async _sentenceStreamTextIterableDrive () { + const ctx = this._sentenceStreamCtx + if (!ctx || !ctx.textStreamMode) return + try { + for await (const piece of ctx.asyncTextSource) { + const s = String(piece).trim() + if (s.length === 0) continue + ctx.chunks.push(s) + 
ctx.chunkIdx = ctx.chunks.length - 1 + const donePromise = new Promise((resolve, reject) => { + ctx.chunkResolver = { resolve, reject } + }) + await this.addon.runJob({ + type: 'text', + input: s + }) + await donePromise + } + } catch (err) { + if (this._sentenceStreamCtx && this._sentenceStreamCtx.chunkResolver) { + const rej = this._sentenceStreamCtx.chunkResolver.reject + this._sentenceStreamCtx.chunkResolver = null + rej(err) + } + this._sentenceStreamCtx = null + this._job.fail(err) + return + } + + const chunks = this._sentenceStreamCtx ? this._sentenceStreamCtx.chunks : [] + const acc = this._sentenceStreamCtx + ? this._sentenceStreamCtx.acc + : { totalTime: 0, audioDurationMs: 0, totalSamples: 0 } + this._sentenceStreamCtx = null + + if (chunks.length === 0) { + if (this.opts?.stats) { + this._job.end({ + totalTime: 0, + tokensPerSecond: 0, + realTimeFactor: 0, + audioDurationMs: 0, + totalSamples: 0 + }) + } else { + this._job.end() + } + return + } + + const totalChars = chunks.join('').length + const merged = { ...acc } + merged.tokensPerSecond = acc.totalTime > 0 ? totalChars / acc.totalTime : 0 + merged.realTimeFactor = + acc.audioDurationMs > 0 ? 
(acc.totalTime * 1000.0) / acc.audioDurationMs : 0 + if (this.opts?.stats) { + this._job.end(merged) + } else { + this._job.end() + } + } + + _runStreamOrchestrator (text, options) { + const chunks = splitTtsText(String(text), { + language: this._config?.language, + locale: options.locale, + maxScalars: options.maxChunkScalars + }) + if (chunks.length === 0) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_APPEND, + adds: 'chunked synthesis: text produced no chunks after split' + }) + } + + const response = this._job.start() + this._sentenceStreamCtx = { + chunks, + chunkIdx: 0, + acc: { + totalTime: 0, + audioDurationMs: 0, + totalSamples: 0 + }, + chunkResolver: null + } + + this._sentenceStreamDriveBody().catch((err) => { + if (this._sentenceStreamCtx && this._sentenceStreamCtx.chunkResolver) { + const rej = this._sentenceStreamCtx.chunkResolver.reject + this._sentenceStreamCtx.chunkResolver = null + rej(err) + } + this._sentenceStreamCtx = null + this._job.fail(err) + }) + + return response + } + + async _sentenceStreamDriveBody () { + const ctx = this._sentenceStreamCtx + if (!ctx || ctx.textStreamMode) return + for (let i = 0; i < ctx.chunks.length; i++) { + ctx.chunkIdx = i + const donePromise = new Promise((resolve, reject) => { + ctx.chunkResolver = { resolve, reject } + }) + await this.addon.runJob({ + type: 'text', + input: ctx.chunks[i] + }) + await donePromise + } + this._sentenceStreamCtx = null + } + + async _load () { + this.logger.info('[TTSGgml] Language:', this._config?.language || 'en') + + const ttsParams = this._buildTtsParams() + + this.addon = this._createAddon(ttsParams, this._addonOutputCallback.bind(this)) + await this.addon.activate() + } + + _buildTtsParams () { + if (this._engineType === ENGINE_SUPERTONIC) { + return this._buildSupertonicParams() + } + return this._buildChatterboxParams() + } + + _buildChatterboxParams () { + const params = { + engineType: ENGINE_CHATTERBOX, + t3ModelPath: this._t3ModelPath || '', + 
s3genModelPath: this._s3genModelPath || '', + language: this._config?.language || 'en' + } + if (this._referenceAudio != null) { + params.referenceAudio = this._referenceAudio + } + if (this._voiceDir != null) { + params.voiceDir = this._voiceDir + } + if (this._seed != null) params.seed = this._seed | 0 + if (this._nGpuLayers != null) params.nGpuLayers = this._nGpuLayers | 0 + if (this._threads != null) params.threads = this._threads | 0 + if (this._streamChunkTokens != null) params.streamChunkTokens = this._streamChunkTokens | 0 + if (this._streamFirstChunkTokens != null) { + params.streamFirstChunkTokens = this._streamFirstChunkTokens | 0 + } + if (this._cfmSteps != null) params.cfmSteps = this._cfmSteps | 0 + if (this._outputSampleRate != null) { + params.outputSampleRate = this._outputSampleRate | 0 + } + if (this._config?.useGPU != null) { + params.useGPU = !!this._config.useGPU + } + return params + } + + _buildSupertonicParams () { + const params = { + engineType: ENGINE_SUPERTONIC, + supertonicModelPath: this._supertonicModelPath || '', + language: this._config?.language || 'en' + } + if (this._voice) params.voice = this._voice + if (this._steps != null) params.steps = this._steps | 0 + if (this._speed != null) params.speed = Number(this._speed) + if (this._seed != null) params.seed = this._seed | 0 + if (this._threads != null) params.threads = this._threads | 0 + if (this._nGpuLayers != null) params.nGpuLayers = this._nGpuLayers | 0 + if (this._outputSampleRate != null) { + params.outputSampleRate = this._outputSampleRate | 0 + } + if (this._config?.useGPU != null) { + params.useGPU = !!this._config.useGPU + } + if (this._noiseNpyPath) params.noiseNpyPath = this._noiseNpyPath + return params + } + + /** + * Instantiate the native addon with the given parameters. 
+ * @param {Object} configurationParams + * @param {Function} outputCb + * @returns {TTSInterface} + */ + _createAddon (configurationParams, outputCb) { + const binding = require('./binding') + return new TTSInterface(binding, configurationParams, outputCb) + } + + async unload () { + await this.cancel() + this._failAndClearActiveResponse('Model was unloaded') + if (this.addon) { + await this.addon.destroyInstance() + } + this.state.configLoaded = false + this.state.weightsLoaded = false + } + + async destroy () { + await this.unload() + this.state.destroyed = true + } + + async _runInternal (input) { + const response = this._job.start() + try { + // Per-request overrides (e.g. input.outputSampleRate) are not + // honoured by the native engine today — all synthesis knobs are + // resolved at construction / reload. Route those through + // `model.reload({...})` instead when the engine exposes them. + const jobData = { + type: input.type || 'text', + input: input.input + } + + await this.addon.runJob(jobData) + } catch (error) { + this._job.fail(error) + throw error + } + + return response + } + + _mergeSentenceStreamStats (acc, data) { + const t = typeof data.totalTime === 'number' ? data.totalTime : 0 + const a = typeof data.audioDurationMs === 'number' ? data.audioDurationMs : 0 + const s = typeof data.totalSamples === 'number' ? 
data.totalSamples : 0 + acc.totalTime += t + acc.audioDurationMs += a + acc.totalSamples += s + } + + _addonOutputCallback (addon, event, data, error) { + if (typeof error === 'string' && error.length > 0) { + this.logger.error(`TTS job failed with error: ${error}`) + if (this._sentenceStreamCtx && this._sentenceStreamCtx.chunkResolver) { + const rej = this._sentenceStreamCtx.chunkResolver.reject + this._sentenceStreamCtx.chunkResolver = null + rej(new Error(error)) + } + this._job.fail(error) + return + } + + if (data && typeof data === 'object' && data.outputArray) { + try { + this.logger.debug(`TTS job produced output: ${ttsOutputDebugString(data)}`) + } catch (err) { + if (err instanceof RangeError) { + this.logger.debug('TTS job produced output: [data too large]') + } else { + throw err + } + } + if (this._sentenceStreamCtx) { + const ctx = this._sentenceStreamCtx + const idx = ctx.chunkIdx + const sentenceChunk = ctx.chunks[idx] || '' + const enriched = { + outputArray: data.outputArray, + chunkIndex: idx, + sentenceChunk + } + if (data.sampleRate != null) enriched.sampleRate = data.sampleRate + if (!ctx.textStreamMode) { + enriched.isLast = idx >= ctx.chunks.length - 1 + } + this._job.output(enriched) + } else { + this._job.output(data) + } + return + } + + if ( + data && + typeof data === 'object' && + ('totalTime' in data || 'audioDurationMs' in data || 'totalSamples' in data) + ) { + this.logger.info(`TTS job completed. Stats: ${JSON.stringify(data)}`) + if (this._sentenceStreamCtx) { + const ctx = this._sentenceStreamCtx + this._mergeSentenceStreamStats(ctx.acc, data) + if (ctx.chunkResolver) { + ctx.chunkResolver.resolve() + ctx.chunkResolver = null + } + if (ctx.textStreamMode) { + return + } + const isLast = ctx.chunkIdx >= ctx.chunks.length - 1 + if (isLast) { + const totalChars = ctx.chunks.join('').length + const merged = { ...ctx.acc } + merged.tokensPerSecond = + ctx.acc.totalTime > 0 ? 
totalChars / ctx.acc.totalTime : 0 + merged.realTimeFactor = + ctx.acc.audioDurationMs > 0 + ? (ctx.acc.totalTime * 1000.0) / ctx.acc.audioDurationMs + : 0 + if (this.opts?.stats) { + this._job.end(merged) + } else { + this._job.end() + } + } + return + } + if (this.opts?.stats) { + this._job.end(data) + } else { + this._job.end() + } + return + } + + this.logger.debug(`Received TTS event: ${event}`) + } + + async cancel () { + if (this.addon?.cancel) { + await this.addon.cancel() + } + } + + _failAndClearActiveResponse (reason) { + if (this._sentenceStreamCtx && this._sentenceStreamCtx.chunkResolver) { + this._sentenceStreamCtx.chunkResolver.reject( + reason instanceof Error ? reason : new Error(String(reason)) + ) + this._sentenceStreamCtx.chunkResolver = null + } + this._sentenceStreamCtx = null + this._job.fail(reason) + } + + /** + * Reload the addon with new configuration parameters. + * @param {Object} newConfig + * @param {string} [newConfig.language] + * @param {boolean} [newConfig.useGPU] + * @param {number} [newConfig.outputSampleRate] + */ + async reload (newConfig = {}) { + this.logger.debug('Reloading addon with new configuration', newConfig) + + if (newConfig.language !== undefined) this._config.language = newConfig.language + if (newConfig.useGPU !== undefined) this._config.useGPU = newConfig.useGPU + if (newConfig.outputSampleRate !== undefined) this._outputSampleRate = newConfig.outputSampleRate + + const ttsParams = this._buildTtsParams() + + await this.cancel() + this._failAndClearActiveResponse('Model was reloaded') + + if (this.addon) { + await this.addon.destroyInstance() + } + this.addon = this._createAddon(ttsParams, this._addonOutputCallback.bind(this)) + await this.addon.activate() + } + + static inferenceManagerConfig = { + noAdditionalDownload: true + } + + static getModelKey (params) { + return 'tts-ggml' + } + + static ENGINE_CHATTERBOX = ENGINE_CHATTERBOX + static ENGINE_SUPERTONIC = ENGINE_SUPERTONIC +} + +module.exports = TTSGgml 
+module.exports.ENGINE_CHATTERBOX = ENGINE_CHATTERBOX +module.exports.ENGINE_SUPERTONIC = ENGINE_SUPERTONIC diff --git a/packages/tts-ggml/lib/error.js b/packages/tts-ggml/lib/error.js new file mode 100644 index 0000000000..768cb98008 --- /dev/null +++ b/packages/tts-ggml/lib/error.js @@ -0,0 +1,88 @@ +'use strict' + +const { QvacErrorBase, addCodes } = require('@qvac/error') + +class QvacErrorAddonTTSGgml extends QvacErrorBase { } + +const { name, version } = require('../package.json') + +// This library has error code range from 13001 to 14000. +// (7001-7999 is owned by @qvac/tts-onnx; a separate range keeps +// the global addCodes() registry conflict-free when both packages +// coexist in the same Bare process.) +// +// Reserved-but-not-thrown today (kept for stable code numbering and +// covered by tts.error.test.js so accidental renumbering breaks loudly): +// - FAILED_TO_PAUSE / FAILED_TO_STOP — pause/stop intentionally not +// implemented in addon-cpp 1.x; cancel() is the only path. +// - JOB_ALREADY_RUNNING — JobRunner already serialises on the C++ side +// and rejects via runJob() returning false; no JS code path throws +// this today. Will be wired in once JS surfaces busy state. 
+const ERR_CODES = Object.freeze({ + FAILED_TO_ACTIVATE: 13001, + FAILED_TO_APPEND: 13002, + FAILED_TO_GET_STATUS: 13003, + FAILED_TO_PAUSE: 13004, + FAILED_TO_CANCEL: 13005, + FAILED_TO_DESTROY: 13006, + FAILED_TO_UNLOAD: 13007, + FAILED_TO_LOAD: 13008, + FAILED_TO_RELOAD: 13009, + FAILED_TO_STOP: 13010, + JOB_ALREADY_RUNNING: 13011 +}) + +addCodes({ + [ERR_CODES.FAILED_TO_ACTIVATE]: { + name: 'FAILED_TO_ACTIVATE', + message: (message) => `Failed to activate model, error: ${message}` + }, + [ERR_CODES.FAILED_TO_APPEND]: { + name: 'FAILED_TO_APPEND', + message: (message) => `Failed to append data to processing queue, error: ${message}` + }, + [ERR_CODES.FAILED_TO_GET_STATUS]: { + name: 'FAILED_TO_GET_STATUS', + message: (message) => `Failed to get addon status, error: ${message}` + }, + [ERR_CODES.FAILED_TO_PAUSE]: { + name: 'FAILED_TO_PAUSE', + message: (message) => `Failed to pause inference, error: ${message}` + }, + [ERR_CODES.FAILED_TO_CANCEL]: { + name: 'FAILED_TO_CANCEL', + message: (message) => `Failed to cancel inference, error: ${message}` + }, + [ERR_CODES.FAILED_TO_DESTROY]: { + name: 'FAILED_TO_DESTROY', + message: (message) => `Failed to destroy instance, error: ${message}` + }, + [ERR_CODES.FAILED_TO_UNLOAD]: { + name: 'FAILED_TO_UNLOAD', + message: (message) => `Failed to unload model, error: ${message}` + }, + [ERR_CODES.FAILED_TO_LOAD]: { + name: 'FAILED_TO_LOAD', + message: (message) => `Failed to load model, error: ${message}` + }, + [ERR_CODES.FAILED_TO_RELOAD]: { + name: 'FAILED_TO_RELOAD', + message: (message) => `Failed to reload model, error: ${message}` + }, + [ERR_CODES.FAILED_TO_STOP]: { + name: 'FAILED_TO_STOP', + message: (message) => `Failed to stop inference, error: ${message}` + }, + [ERR_CODES.JOB_ALREADY_RUNNING]: { + name: 'JOB_ALREADY_RUNNING', + message: () => 'Cannot set new job: a job is already set or being processed' + } +}, { + name, + version +}) + +module.exports = { + ERR_CODES, + QvacErrorAddonTTSGgml +} diff --git 
a/packages/tts-ggml/lib/textChunker.js b/packages/tts-ggml/lib/textChunker.js new file mode 100644 index 0000000000..d09cf55e37 --- /dev/null +++ b/packages/tts-ggml/lib/textChunker.js @@ -0,0 +1,224 @@ +'use strict' + +/** + * Text chunking for sentence-stream TTS. + * + * `Intl.Segmenter` is used when present (typical Bun/Node). The Bare worker + * used by the SDK may not define `Intl.Segmenter`; in that case splitting falls + * back to punctuation rules and max-length chunking only. + */ + +/** + * Whether `Intl.Segmenter` exists (Bun/Node). Bare may omit it; callers + * should always use {@link splitTtsText} which falls back to punctuation + * and max-length chunking. + */ +function intlSentenceSegmentationAvailable () { + return ( + typeof Intl !== 'undefined' && + typeof Intl.Segmenter === 'function' + ) +} + +/** + * @param {string} text + * @param {string} [locale] + * @returns {string[]|null} + */ +function splitByIntlSentences (text, locale) { + if (!intlSentenceSegmentationAvailable()) return null + const trimmed = text.trim() + if (!trimmed) return null + try { + const seg = new Intl.Segmenter(locale || 'en', { granularity: 'sentence' }) + const out = [] + for (const s of seg.segment(trimmed)) { + const part = s.segment.trim() + if (part.length > 0) out.push(part) + } + if (out.length === 0) return null + return out + } catch { + return null + } +} + +const SENTENCE_TERMINATORS = /([.!?。!?؟])(\s*)/gu + +/** + * @param {string} text + * @returns {string[]} + */ +function splitByAsciiAndCjkPunctuation (text) { + const parts = [] + let lastIndex = 0 + let m + while ((m = SENTENCE_TERMINATORS.exec(text)) !== null) { + const end = m.index + m[1].length + const slice = text.slice(lastIndex, end).trim() + if (slice.length > 0) parts.push(slice) + lastIndex = m.index + m[0].length + } + const tail = text.slice(lastIndex).trim() + if (tail.length > 0) parts.push(tail) + return parts +} + +/** + * @param {string} text + * @returns {string[]} + */ +function 
splitByParagraphs (text) { + return text.split(/\n\s*\n/).map(p => p.trim()).filter(p => p.length > 0) +} + +const MIN_CHUNK_GRAPHEMES = 10 + +/** + * @param {string[]} chunks + * @returns {string[]} + */ +function mergeShortChunks (chunks) { + const merged = [] + let buffer = '' + + for (const chunk of chunks) { + if (buffer.length === 0) { + buffer = chunk + continue + } + + const graphemeCount = [...buffer].length + if (graphemeCount < MIN_CHUNK_GRAPHEMES) { + buffer = buffer + ' ' + chunk + } else { + merged.push(buffer) + buffer = chunk + } + } + + if (buffer.length > 0) { + merged.push(buffer) + } + + return merged +} + +/** + * @param {string} s + * @returns {number} + */ +function countScalars (s) { + return [...s].length +} + +/** + * @param {string} text + * @param {number} maxScalars + * @returns {string[]} + */ +function hardSplitByMaxScalars (text, maxScalars) { + if (maxScalars < 10) maxScalars = 10 + const g = [...text] + if (g.length <= maxScalars) return [text] + const out = [] + let i = 0 + while (i < g.length) { + const slice = g.slice(i, i + maxScalars).join('') + out.push(slice) + i += maxScalars + } + return out +} + +/** + * Merge adjacent pieces until each is at most maxScalars (grapheme count). + * @param {string[]} pieces + * @param {number} maxScalars + * @returns {string[]} + */ +function mergeUpToMaxScalars (pieces, maxScalars) { + const out = [] + let current = '' + + for (const p of pieces) { + const piece = p.trim() + if (!piece) continue + const trial = current.length ? `${current} ${piece}` : piece + if (countScalars(trial) <= maxScalars) { + current = trial + } else { + if (current.length > 0) { + out.push(...hardSplitByMaxScalars(current, maxScalars)) + } + current = piece + } + } + if (current.length > 0) { + out.push(...hardSplitByMaxScalars(current, maxScalars)) + } + return out.filter(s => s.trim().length > 0) +} + +/** + * Split long text into synthesis-sized chunks for sentence streaming. 
+ * + * @param {string} text + * @param {object} [options] + * @param {string} [options.language] BCP-47 / model language (e.g. en, ko) + * @param {string} [options.locale] Optional override for Intl.Segmenter + * @param {number} [options.maxScalars] Max graphemes per chunk (default aligns with Supertonic) + * @param {boolean} [options.mergeToMaxScalars] When false, return sentence-level pieces only (no + * mergeUpToMaxScalars pass). Default true. Useful for test harnesses that synthesize per sentence. + * @returns {string[]} + */ +function splitTtsText (text, options = {}) { + const mergeToMaxScalars = options.mergeToMaxScalars !== false + const language = (options.language || 'en').toLowerCase() + const maxScalars = + options.maxScalars != null + ? options.maxScalars + : language === 'ko' + ? 120 + : 300 + + const raw = text.trim() + if (!raw) return [] + + const locale = options.locale || language + + let sentences = splitByIntlSentences(raw, locale) + if (!sentences || sentences.length === 0) { + const paras = splitByParagraphs(raw) + const blocks = paras.length > 0 ? paras : [raw] + sentences = [] + for (const para of blocks) { + const sents = splitByAsciiAndCjkPunctuation(para) + const mergedShort = mergeShortChunks(sents.length > 0 ? 
sents : [para]) + for (const m of mergedShort) { + if (m.trim()) sentences.push(m.trim()) + } + } + } + + if (sentences.length === 0) { + if (!mergeToMaxScalars) { + return [raw] + } + return mergeUpToMaxScalars([raw], maxScalars) + } + + if (!mergeToMaxScalars) { + return sentences + } + + return mergeUpToMaxScalars(sentences, maxScalars) +} + +module.exports = { + splitTtsText, + intlSentenceSegmentationAvailable, + splitByIntlSentences, + splitByAsciiAndCjkPunctuation, + countScalars +} diff --git a/packages/tts-ggml/lib/textStreamAccumulator.js b/packages/tts-ggml/lib/textStreamAccumulator.js new file mode 100644 index 0000000000..9ce18b5f90 --- /dev/null +++ b/packages/tts-ggml/lib/textStreamAccumulator.js @@ -0,0 +1,173 @@ +'use strict' + +const { countScalars } = require('./textChunker') + +/** + * @param {string} s + * @param {number} n + * @returns {{ head: string, rest: string }} + */ +function splitGraphemeHead (s, n) { + const g = [...s] + if (g.length <= n) { + return { head: s, rest: '' } + } + return { + head: g.slice(0, n).join(''), + rest: g.slice(n).join('') + } +} + +/** + * @param {{ sentenceDelimiter?: RegExp, sentenceDelimiterPreset?: string }} opts + * @returns {(buffer: string) => boolean} + */ +function buildSentenceEndTester (opts) { + if (opts.sentenceDelimiter instanceof RegExp) { + const re = opts.sentenceDelimiter + return function testCustom (buffer) { + re.lastIndex = 0 + return re.test(buffer) + } + } + const preset = opts.sentenceDelimiterPreset || 'multilingual' + const patterns = { + latin: /[.!?…]\s*$/u, + cjk: /[。!?…]\s*$/u, + multilingual: /(?:[.!?…؟]|[。!?…])\s*$/u + } + const re = patterns[preset] || patterns.multilingual + return function testPreset (buffer) { + return re.test(buffer) + } +} + +/** + * Default `maxBufferScalars` aligned with `splitTtsText` when `maxScalars` is unset. 
+ * @param {string} [language] Model language; treated as `en` when omitted. + * @returns {number} 120 for `ko`, 300 for every other language. + */ +function defaultMaxBufferScalars (language) { + const lang = (language || 'en').toLowerCase() + return lang === 'ko' ? 120 : 300 +} + +/** + * Coalesces small text fragments from a streaming source into TTS-sized strings: flush when the + * buffer ends with a sentence delimiter, when its code-point count (see `countScalars`) reaches + * `maxBufferScalars`, or after `flushAfterMs` idle (timer reset on each fragment). Always flushes + * non-whitespace remainder when the source ends. + * + * @param {AsyncIterable} source + * @param {object} opts + * @param {RegExp} [opts.sentenceDelimiter] - If set, overrides `sentenceDelimiterPreset`. + * @param {'latin'|'cjk'|'multilingual'} [opts.sentenceDelimiterPreset] + * @param {number} [opts.maxBufferScalars] - Falls back to `defaultMaxBufferScalars(opts.language)` + * when unset, non-finite, or not greater than 0. + * @param {number} [opts.flushAfterMs] - Idle flush delay in milliseconds (default 500). + * @param {string} [opts.language] - Used only to pick the default `maxBufferScalars`. + * @returns {AsyncGenerator} + */ +async function * accumulateTextStream (source, opts) { + const flushAfterMs = opts.flushAfterMs != null ? opts.flushAfterMs : 500 + const defaultMax = defaultMaxBufferScalars(opts.language) + let maxScalars + if (opts.maxBufferScalars == null) { + maxScalars = defaultMax + } else { + const n = Number(opts.maxBufferScalars) + maxScalars = Number.isFinite(n) && n > 0 ? 
n : defaultMax + } + const testEnd = buildSentenceEndTester(opts) + + const queue = [] + let notify = null + + function push (item) { + queue.push(item) + if (notify) { + const n = notify + notify = null + n() + } + } + + ;(async function pump () { + let buffer = '' + let idleTimer = null + + function clearIdle () { + if (idleTimer) { + clearTimeout(idleTimer) + idleTimer = null + } + } + + function armIdle () { + clearIdle() + idleTimer = setTimeout(() => { + idleTimer = null + const t = buffer.trim() + if (t) { + buffer = '' + push({ kind: 'chunk', text: t }) + } + }, flushAfterMs) + } + + try { + for await (const fragment of source) { + clearIdle() + buffer += String(fragment) + + while (countScalars(buffer) >= maxScalars) { + const { head, rest } = splitGraphemeHead(buffer, maxScalars) + buffer = rest + if (head.length > 0) { + push({ kind: 'chunk', text: head }) + } + } + + if (testEnd(buffer)) { + const t = buffer.trim() + buffer = '' + if (t) { + push({ kind: 'chunk', text: t }) + } + } + + armIdle() + } + + clearIdle() + const tail = buffer.trim() + if (tail) { + push({ kind: 'chunk', text: tail }) + } + push({ kind: 'done' }) + } catch (error) { + clearIdle() + push({ kind: 'err', error }) + } + })() + + while (true) { + while (queue.length === 0) { + await new Promise(resolve => { + notify = resolve + }) + } + const item = queue.shift() + if (item.kind === 'done') { + return + } + if (item.kind === 'err') { + throw item.error + } + yield item.text + } +} + +module.exports = { + accumulateTextStream, + defaultMaxBufferScalars, + buildSentenceEndTester, + splitGraphemeHead +} diff --git a/packages/tts-ggml/package.json b/packages/tts-ggml/package.json new file mode 100644 index 0000000000..027254ee1b --- /dev/null +++ b/packages/tts-ggml/package.json @@ -0,0 +1,116 @@ +{ + "name": "@qvac/tts-ggml", + "version": "0.1.0", + "description": "Text to Speech (TTS) addon for qvac (ggml backend, wrapping the chatterbox + supertonic engines from tts-cpp)", + 
"addon": true, + "engines": { + "bare": ">=1.19.0" + }, + "scripts": { + "build": "bare-make generate && bare-make build && bare-make install", + "build:pack": "mkdir -p dist && npm pack --pack-destination dist", + "example": "bare examples/chatterbox-tts.js \"Hello from qvac tts ggml.\"", + "example:chatterbox": "bare examples/chatterbox-tts.js \"Hello from qvac tts ggml.\"", + "example:chatterbox-mtl": "bare examples/chatterbox-mtl-tts.js \"Hello from the multilingual Chatterbox engine.\"", + "example:chatterbox-mtl-sweep": "bare examples/chatterbox-mtl-sweep-tts.js", + "example:chatterbox-sentence-stream": "bare examples/chatterbox-sentence-stream-tts.js", + "example:chatterbox-chunk-stream": "bare examples/chatterbox-chunk-stream-tts.js", + "example:supertonic": "bare examples/supertonic-tts.js \"Hello from supertonic.\"", + "example:supertonic-mtl": "bare examples/supertonic-mtl-tts.js \"Hello from the multilingual Supertonic engine.\"", + "example:supertonic-mtl-sweep": "bare examples/supertonic-mtl-sweep-tts.js", + "example:supertonic-sentence-stream": "bare examples/supertonic-sentence-stream-tts.js", + "lint": "standard \"test/**/*.js\" \"*.js\" \"lib/*.js\"", + "lint:fix": "standard --fix \"test/**/*.js\" \"*.js\" \"lib/*.js\"", + "test:unit": "brittle-bare test/unit/**/*.test.js", + "test": "npm run test:unit && npm run test:integration", + "test:integration": "brittle-bare test/integration/**/*.test.js", + "test:mobile:generate": "node scripts/generate-mobile-integration-tests.js", + "test:mobile:validate": "node scripts/validate-mobile-tests.js", + "test:cpp:build": "bare-make generate -D BUILD_TESTING=ON && bare-make build --target tts_ggml_tests && bare-make install", + "test:cpp:run": "cd build && LLVM_PROFILE_FILE=default.profraw ./tts_ggml_tests --gtest_output=xml:cpp-test-results.xml", + "test:cpp": "npm run test:cpp:build && npm run test:cpp:run", + "coverage:cpp:build": "bare-make generate -D BUILD_TESTING=ON -D ENABLE_COVERAGE=ON && bare-make 
build --target tts_ggml_tests", + "coverage:cpp:summary": "cd build && llvm-cov report ./tts_ggml_tests --instr-profile=coverage.profdata -ignore-filename-regex='(tests|build|node_modules|gtest|gmock|\\.vcpkg|/usr)/' > coverage-summary.txt", + "coverage:cpp:report": "cd build && ls -lha && llvm-profdata merge -sparse default.profraw -o coverage.profdata && llvm-cov show ./tts_ggml_tests -instr-profile=coverage.profdata -format=html -output-dir=coverage-html -ignore-filename-regex='(tests|build|node_modules|gtest|gmock|\\.vcpkg|/usr)/' && llvm-cov export ./tts_ggml_tests -instr-profile=coverage.profdata -format=lcov -ignore-filename-regex='(tests|build|node_modules|gtest|gmock|\\.vcpkg|/usr)/' > lcov.info && npm run coverage:cpp:summary", + "coverage:cpp": "npm run coverage:cpp:build && npm run test:cpp:run && npm run coverage:cpp:report", + "test:dts": "tsc index.d.ts addonLogging.d.ts --noEmit --lib es2018 --esModuleInterop --skipLibCheck", + "setup:venv": "bash scripts/setup-venv.sh", + "convert-models": "bash scripts/convert-models.sh", + "setup-models": "bash scripts/setup-venv.sh && bash scripts/convert-models.sh" + }, + "files": [ + "addonLogging.js", + "addonLogging.d.ts", + "binding.js", + "index.js", + "lib", + "prebuilds", + "index.d.ts", + "tts.js", + "test/integration", + "test/mobile", + "test/reference-audio", + "test/utils/runTTS.js", + "test/utils/runChatterboxTTS.js", + "test/utils/runSupertonicTTS.js", + "test/utils/downloadModel.js", + "test/utils/wav-helper.js", + "test/utils/loader.fake.js", + "test/utils/pcmConcatenator.js", + "test/utils/runWhisper.js", + "test/data/sentences-medium.js", + "test/data/sentences-long.js", + "LICENSE" + ], + "exports": { + "./package": "./package.json", + ".": "./index.js", + "./text-chunker": "./lib/textChunker.js", + "./addonLogging": { + "types": "./addonLogging.d.ts", + "default": "./addonLogging.js" + }, + "./addonLogging.js": "./addonLogging.js", + "./lib/textStreamAccumulator.js": 
"./lib/textStreamAccumulator.js" + }, + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "git+https://github.com/tetherto/qvac.git", + "directory": "packages/tts-ggml" + }, + "bugs": "https://github.com/tetherto/qvac/issues", + "homepage": "https://github.com/tetherto/qvac/tree/main/packages/tts-ggml#readme", + "devDependencies": { + "@qvac/langdetect-text": "^0.1.2", + "@qvac/transcription-whispercpp": "^0.5.5", + "bare-buffer": "^3.4.2", + "bare-node-stream": "^1.0.0", + "bare-process": "^4.2.2", + "bare-subprocess": "^5.2.1", + "bare-tty": "^5.0.3", + "brittle": "^3.17.0", + "cmake-bare": "^1.7.5", + "cmake-vcpkg": "^1.1.0", + "fflate": "^0.8.2", + "fs": "npm:bare-fs", + "husky": "^9.1.7", + "os": "npm:bare-os@^3.6.2", + "process": "npm:bare-process", + "sinon": "^21.0.0", + "standard": "^17.0.0", + "stream": "npm:bare-node-stream", + "tty": "npm:bare-node-tty", + "typescript": "^6.0.3", + "util": "npm:bare-utils@^1.5.1" + }, + "dependencies": { + "@qvac/error": "^0.1.0", + "@qvac/infer-base": "^0.4.0", + "@qvac/logging": "^0.1.0", + "bare-fs": "^4.5.6", + "bare-os": "^3.8.0", + "bare-path": "^3.0.0" + }, + "types": "index.d.ts" +} diff --git a/packages/tts-ggml/scripts/convert-models.sh b/packages/tts-ggml/scripts/convert-models.sh new file mode 100644 index 0000000000..86c9560dc0 --- /dev/null +++ b/packages/tts-ggml/scripts/convert-models.sh @@ -0,0 +1,302 @@ +#!/usr/bin/env bash +# +# Download upstream Resemble Chatterbox + Supertone Supertonic checkpoints +# and convert them into the single-file .gguf format the ggml backend +# consumes. Wraps the in-tree converters under scripts/ (vendored from +# the standalone chatterbox.cpp repo; see each .py file's header +# comment). +# +# The converters fetch their source weights through huggingface_hub at +# convert time; there is no separate "download" step. Models land in +# ./models/ by default. 
#
# Requirements:
#   - A Python venv at ./venv with `gguf`, `numpy`, `torch`,
#     `safetensors`, `huggingface_hub`, `onnx` installed. The Chatterbox
#     S3Gen converter additionally imports `librosa`. Run
#     `./scripts/setup-venv.sh` (or `npm run setup:venv`) once.
#
# Usage:
#   ./scripts/convert-models.sh [flags]
#
# Flags:
#   --type, -t <turbo|multilingual|supertonic|supertonic-en|supertonic-mtl|all>
#                        Which model family (default: all).
#                        supertonic = both supertonic-en + supertonic-mtl.
#   --quant, -q <f32|f16|q8_0|q5_0|q4_0>
#                        Chatterbox quant tier (default: f16).
#                        Mapped to --ftype for Supertonic, which
#                        only accepts f32 | f16 | q8_0; q5_0 / q4_0
#                        fall back to f16 there.
#   --python <path>      Python interpreter (default: $PYTHON,
#                        then ./venv/bin/python, then
#                        ./venv/Scripts/python.exe, then python3)
#   --output, -o <dir>   GGUF output dir (default: ./models)
#   --hf-token <token>   HuggingFace auth token, forwarded to
#                        each converter's --hf-token. Optional;
#                        only needed for gated repos.
#   --force, -f          Re-convert even if the .gguf already
#                        exists at the target path.
#   --help, -h           Show this help.
#
# Examples:
#   ./scripts/convert-models.sh                          # all variants, f16
#   ./scripts/convert-models.sh -t turbo -q q8_0         # Turbo only, q8_0
#   ./scripts/convert-models.sh -t multilingual -q q4_0  # Multilingual q4_0
#   ./scripts/convert-models.sh -t supertonic            # Supertonic only

set -euo pipefail

TYPE="all"
QUANT="f16"
PYTHON_BIN="${PYTHON:-}"
OUTPUT_DIR="./models"
HF_TOKEN=""
FORCE=0

# Print the "Usage:" .. "Examples:" part of this file's header comment to
# stderr. Only the "# " marker is stripped so the column alignment of the
# flag descriptions survives.
print_usage() {
  sed -n '/^# Usage:/,/^set -euo/p' "$0" | sed -e '/^set -euo/d' -e 's/^# \{0,1\}//' >&2
}

# Called as `require_value "$@"` from the flag parser: aborts with a readable
# message when a value-taking flag is the last argument. Without this guard
# `set -u` turns the missing "$2" into a bare "unbound variable" error.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: $1 requires a value" >&2
    print_usage
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --type|-t)   require_value "$@"; TYPE="$2"; shift 2;;
    --quant|-q)  require_value "$@"; QUANT="$2"; shift 2;;
    --python)    require_value "$@"; PYTHON_BIN="$2"; shift 2;;
    --output|-o) require_value "$@"; OUTPUT_DIR="$2"; shift 2;;
    --hf-token)  require_value "$@"; HF_TOKEN="$2"; shift 2;;
    --force|-f)  FORCE=1; shift;;
    # Explicitly requested help is normal output, so send it to stdout.
    --help|-h)   print_usage 2>&1; exit 0;;
    *) echo "Unknown flag: $1" >&2; print_usage; exit 2;;
  esac
done

case "$TYPE" in
  turbo|multilingual|supertonic|supertonic-en|supertonic-mtl|all) ;;
  *) echo "Error: --type must be turbo|multilingual|supertonic|supertonic-en|supertonic-mtl|all" >&2; exit 2;;
esac
case "$QUANT" in
  f32|f16|q8_0|q5_0|q4_0) ;;
  *) echo "Error: --quant must be f32|f16|q8_0|q5_0|q4_0" >&2; exit 2;;
esac

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PKG_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Interpreter resolution order: --python / $PYTHON, then the package venv
# (POSIX layout, then Windows layout), then whatever python3 is on PATH.
if [[ -z "$PYTHON_BIN" ]]; then
  if [[ -x "$PKG_DIR/venv/bin/python" ]]; then
    PYTHON_BIN="$PKG_DIR/venv/bin/python"
  elif [[ -x "$PKG_DIR/venv/Scripts/python.exe" ]]; then
    PYTHON_BIN="$PKG_DIR/venv/Scripts/python.exe"
  else
    PYTHON_BIN="python3"
  fi
fi

if ! command -v "$PYTHON_BIN" >/dev/null 2>&1 && [[ ! -x "$PYTHON_BIN" ]]; then
  echo "Error: python interpreter not found: $PYTHON_BIN" >&2
  echo " run \`npm run setup:venv\` or pass --python <path>." >&2
  exit 1
fi

# Sanity-check the python env has the modules the converters need.
# Failing fast here is better than a cryptic ModuleNotFoundError dump
# halfway through a multi-GB download.
required_modules=(gguf numpy torch safetensors huggingface_hub onnx)
case "$TYPE" in
  # convert-s3gen-to-gguf.py imports librosa only after the checkpoint has
  # been downloaded and loaded, so check it up front for Chatterbox types.
  turbo|multilingual|all) required_modules+=(librosa);;
esac

missing_modules=$("$PYTHON_BIN" -c '
import sys
missing = []
for m in sys.argv[1:]:
    try:
        __import__(m)
    except ImportError:
        missing.append(m)
print(",".join(missing))
' "${required_modules[@]}" 2>/dev/null || echo 'PYTHON_BROKEN')

if [[ "$missing_modules" == "PYTHON_BROKEN" ]]; then
  echo "Error: python interpreter $PYTHON_BIN failed to start." >&2
  exit 1
fi
if [[ -n "$missing_modules" ]]; then
  echo "Error: python at $PYTHON_BIN is missing required module(s): ${missing_modules//,/, }" >&2
  echo " run \`npm run setup:venv\` to provision ./venv with scripts/requirements.txt," >&2
  echo " or pass --python /path/to/venv/bin/python with those modules installed." >&2
  exit 1
fi

# Format a byte count ($1, default 0) as "N B", "N.NN MiB" or "N.NN GiB".
bytes_human() {
  local b=${1:-0}
  awk -v b="$b" 'BEGIN {
    if (b >= 1073741824) printf "%.2f GiB", b / 1073741824
    else if (b >= 1048576) printf "%.2f MiB", b / 1048576
    else printf "%d B", b
  }'
}

# Print the size of file $1 in bytes. Tries the BSD/macOS stat syntax first,
# then the GNU syntax, and prints 0 when neither works.
file_size() {
  stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null || echo 0
}

mkdir -p "$OUTPUT_DIR"

# Maps the unified --quant flag onto Supertonic's --ftype which only
# accepts f32 | f16 | q8_0. Quantisation tiers below q8 fall back to
# f16; the Supertonic CFM diffusion is sensitive enough that lower
# tiers degrade audibly.
supertonic_ftype() {
  case "$QUANT" in
    f32)  echo "f32";;
    q8_0) echo "q8_0";;
    *)    echo "f16";;  # f16, q5_0, q4_0 -> f16
  esac
}

is_skip() {
  # Returns 0 (true) if --force is off and the target file is non-empty.
  local path="$1"
  if [[ "$FORCE" -eq 1 ]]; then
    return 1
  fi
  if [[ -s "$path" ]]; then
    local sz; sz=$(file_size "$path")
    echo " - $(basename "$path"): already converted ($(bytes_human "$sz")) -- pass --force to redo"
    return 0
  fi
  return 1
}

run_converter() {
  # $1 = label, $2 = converter script, rest = args forwarded to the converter.
  # The value following --out (if any) is remembered so a failed or empty
  # output file can be removed instead of being mistaken for a finished one.
  local label="$1"; shift
  local converter="$1"; shift
  local out=""
  local i next
  for ((i = 1; i < $#; i++)); do
    # i stops one short of $# so that ${!next} is always a set positional.
    if [[ "${!i}" == "--out" ]]; then
      next=$((i + 1))
      out="${!next}"
    fi
  done

  echo " > ${label}: converting"
  if ! "$PYTHON_BIN" "$SCRIPT_DIR/$converter" "$@"; then
    echo " x ${label}: conversion failed (see python traceback above)" >&2
    [[ -n "$out" ]] && rm -f "$out"
    return 1
  fi
  if [[ -n "$out" ]] && [[ ! -s "$out" ]]; then
    echo " x ${label}: produced empty output -- removing" >&2
    rm -f "$out"
    return 1
  fi
  if [[ -n "$out" ]]; then
    local sz; sz=$(file_size "$out")
    echo " - ${label}: $(basename "$out") ($(bytes_human "$sz"))"
  fi
}

# NOTE on `${hf_args[@]+"${hf_args[@]}"}` below:
# Bash 3.2 (the system bash on macOS runners) treats `"${arr[@]}"` as
# unset-variable access when the array is empty AND `set -u` (nounset)
# is in effect, which `set -euo pipefail` at the top of this script
# enables. CI on darwin-arm64 hits this with HF_TOKEN unset:
#   scripts/convert-models.sh: line 200: hf_args[@]: unbound variable
# The `${arr[@]+"${arr[@]}"}` idiom expands to the array if it's
# defined and to nothing otherwise — works under nounset on bash 3.2+.
# Don't simplify this back to `"${hf_args[@]}"` without testing on
# macOS bash 3.2 with HF_TOKEN unset.

convert_turbo() {
  local t3_out="$OUTPUT_DIR/chatterbox-t3-turbo.gguf"
  local s3_out="$OUTPUT_DIR/chatterbox-s3gen.gguf"
  local hf_args=()
  [[ -n "$HF_TOKEN" ]] && hf_args=(--hf-token "$HF_TOKEN")

  if ! is_skip "$t3_out"; then
    run_converter "Turbo T3" convert-t3-turbo-to-gguf.py \
      --out "$t3_out" --quant "$QUANT" ${hf_args[@]+"${hf_args[@]}"} || return 1
  fi
  if ! is_skip "$s3_out"; then
    run_converter "Turbo S3Gen" convert-s3gen-to-gguf.py \
      --variant turbo --out "$s3_out" --quant "$QUANT" ${hf_args[@]+"${hf_args[@]}"} || return 1
  fi
}

convert_multilingual() {
  local t3_out="$OUTPUT_DIR/chatterbox-t3-mtl.gguf"
  local s3_out="$OUTPUT_DIR/chatterbox-s3gen-mtl.gguf"
  local hf_args=()
  [[ -n "$HF_TOKEN" ]] && hf_args=(--hf-token "$HF_TOKEN")

  if ! is_skip "$t3_out"; then
    run_converter "MTL T3" convert-t3-mtl-to-gguf.py \
      --out "$t3_out" --quant "$QUANT" ${hf_args[@]+"${hf_args[@]}"} || return 1
  fi
  if ! is_skip "$s3_out"; then
    run_converter "MTL S3Gen" convert-s3gen-to-gguf.py \
      --variant mtl --out "$s3_out" --quant "$QUANT" ${hf_args[@]+"${hf_args[@]}"} || return 1
  fi
}

convert_supertonic_en() {
  # Pulls the English-only Supertone/supertonic checkpoint. Cheaper /
  # smaller than supertonic-2 and is the default the addon's Supertonic
  # examples use. Output file: models/supertonic.gguf.
  local out="$OUTPUT_DIR/supertonic.gguf"
  local ftype; ftype=$(supertonic_ftype)
  local hf_args=()
  [[ -n "$HF_TOKEN" ]] && hf_args=(--hf-token "$HF_TOKEN")

  if ! is_skip "$out"; then
    run_converter "Supertonic (English)" convert-supertonic2-to-gguf.py \
      --arch supertonic --out "$out" --ftype "$ftype" ${hf_args[@]+"${hf_args[@]}"} || return 1
  fi
}

convert_supertonic_mtl() {
  # Pulls the multilingual Supertone/supertonic-2 checkpoint. Output
  # file: models/supertonic2.gguf. Supports en/ko/es/pt/fr today via
  # tts-cpp's supertonic_preprocess.cpp::is_supported_language.
  local out="$OUTPUT_DIR/supertonic2.gguf"
  local ftype; ftype=$(supertonic_ftype)
  local hf_args=()
  [[ -n "$HF_TOKEN" ]] && hf_args=(--hf-token "$HF_TOKEN")

  if ! is_skip "$out"; then
    run_converter "Supertonic (multilingual)" convert-supertonic2-to-gguf.py \
      --arch supertonic2 --out "$out" --ftype "$ftype" ${hf_args[@]+"${hf_args[@]}"} || return 1
  fi
}

convert_supertonic() {
  # Bundle: convert both English + multilingual checkpoints, mirroring how
  # the chatterbox `multilingual` group converts both T3 + S3Gen. Either
  # leg can be requested individually via --type supertonic-en /
  # --type supertonic-mtl. The second leg still runs when the first fails;
  # the return status is the number of failed legs.
  local rc=0
  convert_supertonic_en || rc=$((rc + 1))
  convert_supertonic_mtl || rc=$((rc + 1))
  return $rc
}

echo "Converting upstream sources -> .gguf -- type=${TYPE} quant=${QUANT}"
echo "Python: ${PYTHON_BIN}"
echo "Output: ${OUTPUT_DIR}"
echo

# Each requested family counts as at most one failure; with --type all the
# remaining families are still attempted after one of them fails.
failures=0
case "$TYPE" in
  turbo)          convert_turbo          || failures=$((failures + 1));;
  multilingual)   convert_multilingual   || failures=$((failures + 1));;
  supertonic-en)  convert_supertonic_en  || failures=$((failures + 1));;
  supertonic-mtl) convert_supertonic_mtl || failures=$((failures + 1));;
  supertonic)     convert_supertonic     || failures=$((failures + 1));;
  all)
    convert_turbo        || failures=$((failures + 1))
    convert_multilingual || failures=$((failures + 1))
    convert_supertonic   || failures=$((failures + 1))
    ;;
esac

echo
if [[ "$failures" -gt 0 ]]; then
  echo "${failures} conversion(s) failed -- see warnings above." >&2
  exit 1
fi
echo "All conversions complete. Try:"
echo " bare examples/chatterbox-tts.js \"Hello from qvac tts ggml.\""
# Hugging Face repositories the two S3Gen checkpoints are downloaded from.
TURBO_REPO_ID = "ResembleAI/chatterbox-turbo"
MTL_REPO_ID = "ResembleAI/chatterbox"

# Per-variant conversion settings, selected with --variant. Keys as consumed
# by main():
#   repo_id / allow_patterns  passed to snapshot_download() when --ckpt-dir
#                             is not given
#   ckpt_filename             checkpoint file looked up inside the
#                             checkpoint directory
#   loader                    "safetensors" -> safetensors load_file(),
#                             "torch" -> torch.load(..., weights_only=True)
#   gguf_name / gguf_description
#                             written as the GGUF name / description
#   meanflow / n_timesteps / cfg_rate
#                             written as the s3gen.meanflow,
#                             s3gen.n_timesteps and s3gen.cfg_rate metadata
VARIANTS = {
    "turbo": {
        "repo_id": TURBO_REPO_ID,
        "allow_patterns": ["*.safetensors", "*.json", "*.txt", "*.pt", "*.model"],
        "ckpt_filename": "s3gen_meanflow.safetensors",
        "loader": "safetensors",
        "gguf_name": "Chatterbox Turbo S3Gen",
        "gguf_description": "S3Gen flow + mel2wav (HiFT) for ggml port.",
        "meanflow": True,
        "n_timesteps": 2,
        "cfg_rate": 0.0,
    },
    "mtl": {
        "repo_id": MTL_REPO_ID,
        "allow_patterns": ["ve.pt", "t3_mtl23ls_v2.safetensors", "s3gen.pt",
                           "grapheme_mtl_merged_expanded_v1.json", "conds.pt", "Cangjie5_TC.json"],
        "ckpt_filename": "s3gen.pt",
        "loader": "torch",
        "gguf_name": "Chatterbox Multilingual S3Gen",
        "gguf_description": "S3Gen standard-CFM (10-step Euler, CFG) + HiFT vocoder for ggml port.",
        "meanflow": False,
        "n_timesteps": 10,
        "cfg_rate": 0.7,
    },
}


# Accepted values for --quant (used as the argparse `choices`).
QUANT_CHOICES = ("f32", "f16", "q8_0", "q5_0", "q4_0")
def _load_requantize_policy():
    """Load ``should_quantize`` and ``_QUANT_TYPE`` from requantize-gguf.py.

    requantize-gguf.py lives next to this script and is the single source of
    truth for which tensors may be block-quantized. Its file name contains a
    hyphen, so it is loaded by path rather than with an ``import`` statement.

    Returns:
        The tuple ``(should_quantize, _QUANT_TYPE)`` taken from that module.

    Exits the process with status 1 (after printing to stderr) when the
    policy file cannot be located or read.
    """
    path = Path(__file__).resolve().parent / "requantize-gguf.py"
    spec = importlib.util.spec_from_file_location("_chatterbox_requantize_policy", path)
    if spec is None or spec.loader is None:
        print(f"error: could not load quant policy from {path}", file=sys.stderr)
        sys.exit(1)
    mod = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(mod)
    except OSError as exc:
        # spec_from_file_location() does not open the file, so a missing or
        # unreadable policy file only surfaces here. Report it the same way
        # as the spec failure above instead of dumping a traceback.
        print(f"error: could not load quant policy from {path}: {exc}", file=sys.stderr)
        sys.exit(1)
    return mod.should_quantize, mod._QUANT_TYPE


# Resolved once at import time; used by add_tensor_maybe_q().
_SHOULD_QUANTIZE, _RQ_QUANT_TYPE = _load_requantize_policy()


def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Returns:
        The parsed namespace. ``out`` is always populated: when ``--out`` is
        omitted it defaults to ``models/chatterbox-s3gen-mtl.gguf`` for the
        ``mtl`` variant and ``models/chatterbox-s3gen.gguf`` otherwise.
    """
    ap = argparse.ArgumentParser(description="Convert Chatterbox S3Gen weights to GGUF.")
    ap.add_argument("--variant", choices=list(VARIANTS.keys()), default="turbo",
                    help="Which S3Gen checkpoint to convert. 'turbo' = meanflow (2-step),"
                         " 'mtl' = standard CFM (10-step + CFG).")
    ap.add_argument("--ckpt-dir", type=Path, help="Local checkpoint dir (downloads from HF if omitted).")
    ap.add_argument("--out", type=Path, default=None,
                    help="Defaults to models/chatterbox-s3gen.gguf (turbo) or "
                         "models/chatterbox-s3gen-mtl.gguf (mtl).")
    ap.add_argument("--hf-token", default=None, help="Optional Hugging Face token.")
    ap.add_argument(
        "--quant",
        choices=QUANT_CHOICES,
        default="f16",
        # The previous help text promised an F16 fallback for tensors that
        # cannot be block-quantized; this converter writes such tensors
        # unchanged, so the text now says exactly that.
        help=(
            "Block-quant target for the large 2-D weights selected by the "
            "policy in scripts/requantize-gguf.py (flow / cfm / hift only). "
            "Tensors that policy rejects (per its deny-list: "
            "flow/input_embedding, campplus, s3tokv2, builtins, mel "
            "filterbanks, norms/biases, plus shapes that cannot hold the "
            "requested block quant) are written unquantized. "
            "f32 and f16 (the default) skip quantization entirely and write "
            "every tensor unquantized (the pre-multilingual baseline)."
        ),
    )
    args = ap.parse_args()
    if args.out is None:
        args.out = Path("models/chatterbox-s3gen-mtl.gguf") if args.variant == "mtl" \
            else Path("models/chatterbox-s3gen.gguf")
    return args
def as_numpy(tensor: torch.Tensor, *, dtype=None) -> np.ndarray:
    """Return ``tensor`` as a C-contiguous NumPy array.

    Args:
        tensor: Source tensor; it is detached and moved to the CPU first.
        dtype: Optional torch dtype to cast to before conversion. When
            ``None`` the tensor's own dtype is kept.
    """
    if dtype is not None:
        tensor = tensor.to(dtype)
    return np.ascontiguousarray(tensor.detach().cpu().numpy())


def resolve_weight_norm(state: dict[str, torch.Tensor], prefix: str) -> torch.Tensor:
    """Fuse one weight_norm parametrization into a plain weight tensor.

    PyTorch weight_norm stores ``original0`` (g, the magnitudes) and
    ``original1`` (v, the direction); the effective weight is
    ``g * v / ||v||_2``.

    Args:
        state: Checkpoint state dict.
        prefix: Module path that owns the parametrization, i.e. the part of
            the key before ``.parametrizations.weight.original{0,1}``.

    Returns:
        The fused weight, with the same shape as ``v``.

    Raises:
        KeyError: If either ``original0`` or ``original1`` is missing.
    """
    g = state[f"{prefix}.parametrizations.weight.original0"]
    v = state[f"{prefix}.parametrizations.weight.original1"]
    # ||v|| is taken over every dim except 0 (the output-channel dim), then
    # reshaped to (out, 1, ..., 1) so it broadcasts against v.
    norm = v.flatten(1).norm(dim=1).view(-1, *([1] * (v.ndim - 1)))
    return g * v / norm


def expand_weight_norm(state: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Replace weight_norm parametrizations with fused ``*.weight`` tensors.

    Every ``<prefix>.parametrizations.weight.original0`` /
    ``...original1`` pair is rewritten into a single ``<prefix>.weight``
    entry and the two originals are dropped. All other entries are copied
    through unchanged, and ``state`` itself is not modified.

    Only this exact key layout is recognized; other parametrization key
    spellings are left untouched.

    The fused entries are appended in sorted prefix order, so the key order
    of the returned dict is the same on every run.
    """
    out = dict(state)
    prefixes = set()
    for k in state:
        m = re.match(r"(.+)\.parametrizations\.weight\.original0$", k)
        if m:
            prefixes.add(m.group(1))
    # Sets of str iterate in a hash-randomized order; sort for reproducibility.
    for p in sorted(prefixes):
        out[f"{p}.weight"] = resolve_weight_norm(state, p)
        out.pop(f"{p}.parametrizations.weight.original0", None)
        out.pop(f"{p}.parametrizations.weight.original1", None)
    return out
def export(writer: gguf.GGUFWriter, state: dict, name: str, *, dtype=torch.float32):
    """Write ``state[name]`` to ``writer`` and return the written array's shape.

    The checkpoint key is used verbatim as the GGUF tensor name, so the
    original hierarchy stays recognizable.
    """
    tensor_np = as_numpy(state[name], dtype=dtype)
    writer.add_tensor(name, tensor_np)
    return tensor_np.shape


def add_tensor_maybe_q(
    writer: gguf.GGUFWriter,
    name: str,
    arr: np.ndarray,
    quant: str,
    *,
    stats: Optional[dict[str, int]] = None,
) -> None:
    """Write one tensor, block-quantizing it when the policy allows.

    Integer arrays, and every array when ``quant`` is ``"f16"`` or
    ``"f32"``, are written as given. Otherwise the tensor is quantized only
    if ``_SHOULD_QUANTIZE`` accepts its name and shape for the requested
    type; rejected tensors are also written as given.

    Args:
        writer: Destination GGUF writer.
        name: GGUF tensor name.
        arr: Tensor data.
        quant: One of the ``--quant`` choices.
        stats: Optional counter dict; ``stats["n_quant"]`` is incremented for
            each tensor that was actually quantized.
    """
    is_integer = arr.dtype.kind in "iu" or np.issubdtype(arr.dtype, np.integer)
    if is_integer or quant in ("f16", "f32"):
        writer.add_tensor(name, arr)
        return

    target_type = _RQ_QUANT_TYPE[quant]
    if _SHOULD_QUANTIZE(name, arr.shape, target_type):
        as_f32 = np.ascontiguousarray(arr.astype(np.float32))
        packed = gguf.quants.quantize(as_f32, target_type)
        writer.add_tensor(name, packed, raw_shape=packed.shape, raw_dtype=target_type)
        if stats is not None:
            stats["n_quant"] = stats.get("n_quant", 0) + 1
    else:
        writer.add_tensor(name, arr)


def export_conformer_block(
    writer: gguf.GGUFWriter,
    state: dict,
    prefix: str,
    gguf_prefix: str,
    quant: str,
    *,
    stats: Optional[dict[str, int]] = None,
):
    """Export one Conformer encoder block.

    Each checkpoint tensor ``<prefix>.<source suffix>`` is cast to float32
    and written as ``<gguf_prefix>/<target suffix>`` through
    ``add_tensor_maybe_q``, in the order listed below.
    """
    # (checkpoint suffix, GGUF suffix) pairs.
    suffix_pairs = (
        ("norm_mha.weight", "norm_mha/w"),
        ("norm_mha.bias", "norm_mha/b"),
        ("norm_ff.weight", "norm_ff/w"),
        ("norm_ff.bias", "norm_ff/b"),
        ("self_attn.linear_q.weight", "attn/q/w"),
        ("self_attn.linear_q.bias", "attn/q/b"),
        ("self_attn.linear_k.weight", "attn/k/w"),
        ("self_attn.linear_k.bias", "attn/k/b"),
        ("self_attn.linear_v.weight", "attn/v/w"),
        ("self_attn.linear_v.bias", "attn/v/b"),
        ("self_attn.linear_out.weight", "attn/o/w"),
        ("self_attn.linear_out.bias", "attn/o/b"),
        ("self_attn.linear_pos.weight", "attn/pos/w"),
        ("self_attn.pos_bias_u", "attn/pos_bias_u"),
        ("self_attn.pos_bias_v", "attn/pos_bias_v"),
        ("feed_forward.w_1.weight", "ff/w1/w"),
        ("feed_forward.w_1.bias", "ff/w1/b"),
        ("feed_forward.w_2.weight", "ff/w2/w"),
        ("feed_forward.w_2.bias", "ff/w2/b"),
    )
    for ckpt_suffix, gguf_suffix in suffix_pairs:
        data = as_numpy(state[f"{prefix}.{ckpt_suffix}"], dtype=torch.float32)
        add_tensor_maybe_q(writer, f"{gguf_prefix}/{gguf_suffix}", data, quant, stats=stats)
writer.add_uint32("s3gen.encoder.n_blocks", 6) + writer.add_uint32("s3gen.encoder.up_n_blocks", 4) + writer.add_uint32("s3gen.encoder.attention_heads", 8) + writer.add_uint32("s3gen.encoder.head_dim", 64) + writer.add_uint32("s3gen.encoder.ff_size", 2048) + writer.add_uint32("s3gen.encoder.token_mel_ratio", 2) + writer.add_uint32("s3gen.encoder.pre_lookahead_len", 3) + writer.add_float32("s3gen.layer_norm_eps", 1e-12) + writer.add_uint32("s3gen.spk_embed_dim", 192) + + # Built-in conditionals + prompt_token = gen["prompt_token"].reshape(-1).to(torch.int32) + prompt_feat = gen["prompt_feat"].squeeze(0) # (500, 80) + embedding = gen["embedding"].squeeze(0) # (192,) + writer.add_uint32("s3gen.builtin.prompt_token_len", int(prompt_token.numel())) + writer.add_uint32("s3gen.builtin.prompt_feat_frames", int(prompt_feat.shape[0])) + add_tensor_maybe_q(writer, "s3gen/builtin/prompt_token", as_numpy(prompt_token), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "s3gen/builtin/prompt_feat", as_numpy(prompt_feat, dtype=torch.float32), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "s3gen/builtin/embedding", as_numpy(embedding, dtype=torch.float32), args.quant, stats=qstats) + + # Flow top-level weights + add_tensor_maybe_q(writer, "flow/input_embedding", as_numpy(state["flow.input_embedding.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/spk_embed_affine/w", as_numpy(state["flow.spk_embed_affine_layer.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/spk_embed_affine/b", as_numpy(state["flow.spk_embed_affine_layer.bias"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder_proj/w", as_numpy(state["flow.encoder_proj.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder_proj/b", as_numpy(state["flow.encoder_proj.bias"]), args.quant, stats=qstats) + + # Encoder embed (LinearNoSubsampling: Linear(512 -> 512) + LayerNorm) + add_tensor_maybe_q(writer, 
"flow/encoder/embed/linear/w", as_numpy(state["flow.encoder.embed.out.0.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/embed/linear/b", as_numpy(state["flow.encoder.embed.out.0.bias"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/embed/norm/w", as_numpy(state["flow.encoder.embed.out.1.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/embed/norm/b", as_numpy(state["flow.encoder.embed.out.1.bias"]), args.quant, stats=qstats) + + # PreLookaheadLayer: two convs (kernel 4 and 3). Use F32 via custom im2col+matmul. + add_tensor_maybe_q(writer, "flow/encoder/pre_lookahead/conv1/w", as_numpy(state["flow.encoder.pre_lookahead_layer.conv1.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/pre_lookahead/conv1/b", as_numpy(state["flow.encoder.pre_lookahead_layer.conv1.bias"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/pre_lookahead/conv2/w", as_numpy(state["flow.encoder.pre_lookahead_layer.conv2.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/pre_lookahead/conv2/b", as_numpy(state["flow.encoder.pre_lookahead_layer.conv2.bias"]), args.quant, stats=qstats) + + # 6 Conformer blocks. 
+ for i in range(6): + export_conformer_block(writer, state, + f"flow.encoder.encoders.{i}", + f"flow/encoder/block{i}", + args.quant, + stats=qstats) + + # Upsample1D (Conv1d with kernel 5) — F32 (we use conv1d_f32 in C++) + add_tensor_maybe_q(writer, "flow/encoder/up_layer/conv/w", as_numpy(state["flow.encoder.up_layer.conv.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/up_layer/conv/b", as_numpy(state["flow.encoder.up_layer.conv.bias"]), args.quant, stats=qstats) + + # up_embed (second subsampling) + add_tensor_maybe_q(writer, "flow/encoder/up_embed/linear/w", as_numpy(state["flow.encoder.up_embed.out.0.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/up_embed/linear/b", as_numpy(state["flow.encoder.up_embed.out.0.bias"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/up_embed/norm/w", as_numpy(state["flow.encoder.up_embed.out.1.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/up_embed/norm/b", as_numpy(state["flow.encoder.up_embed.out.1.bias"]), args.quant, stats=qstats) + + # 4 more Conformer blocks. + for i in range(4): + export_conformer_block(writer, state, + f"flow.encoder.up_encoders.{i}", + f"flow/encoder/up_block{i}", + args.quant, + stats=qstats) + + # Final after_norm + add_tensor_maybe_q(writer, "flow/encoder/after_norm/w", as_numpy(state["flow.encoder.after_norm.weight"]), args.quant, stats=qstats) + add_tensor_maybe_q(writer, "flow/encoder/after_norm/b", as_numpy(state["flow.encoder.after_norm.bias"]), args.quant, stats=qstats) + + # Decoder estimator (CFM) — the critical path on CPU/Metal/Vulkan since + # it runs 10-20 forwards per utterance on standard CFM. Linear weights + # pick up Q8_0 and Conv1d kernels pick up F16; LayerNorm gammas/betas + + # biases are rank-1 and stay F32 via the requantize policy guard. 
+ decoder_keys = sorted(k for k in state if k.startswith("flow.decoder.estimator.")) + for k in decoder_keys: + gguf_name = k.replace("flow.decoder.estimator.", "cfm/").replace(".", "/") + add_tensor_maybe_q(writer, gguf_name, as_numpy(state[k], dtype=torch.float32), args.quant, stats=qstats) + + # mel2wav (HiFTGenerator): dozens of weight_norm Conv1d layers feeding + # the 24 kHz vocoder. These are almost all rank-3 (K, IC, OC) with + # short kernels → F16 at any --quant >= f16. Real bandwidth savings on + # every backend (HiFT decode is ~8% of CPU wall time on MTL). + mel2wav_keys = sorted(k for k in state if k.startswith("mel2wav.")) + for k in mel2wav_keys: + gguf_name = k.replace("mel2wav.", "hift/").replace(".", "/") + add_tensor_maybe_q(writer, gguf_name, as_numpy(state[k], dtype=torch.float32), args.quant, stats=qstats) + + # Bake in the pre-computed 80-channel mel filterbank used by + # s3gen.utils.mel.mel_spectrogram so the C++ side can compute prompt_feat + # natively for voice cloning (see src/voice_features.cpp). + import librosa + mel_fb_24k_80 = librosa.filters.mel( + sr=24000, n_fft=1920, n_mels=80, fmin=0, fmax=8000, + ).astype(np.float32) # (80, 961) + add_tensor_maybe_q(writer, "s3gen/mel_fb/24k_80", np.ascontiguousarray(mel_fb_24k_80), args.quant, stats=qstats) + + # ------------------------------------------------------------------------- + # CAMPPlus speaker encoder (FunASR/3D-Speaker xvector port). Produces the + # 192-d `embedding` tensor that drives S3Gen's spk_embed_affine layer. + # We fuse every BatchNorm's affine + running stats into a per-channel + # (scale, shift) pair so the C++ side can skip BN as its own module. 
+ # y = gamma * (x - mean) / sqrt(var + eps) + beta + # = x * scale + shift + # scale = gamma / sqrt(var + eps) (=1/sqrt(var+eps) when affine=False) + # shift = beta - mean * scale (=-mean*scale when affine=False) + # ------------------------------------------------------------------------- + speaker_keys = [k for k in state if k.startswith("speaker_encoder.")] + if not speaker_keys: + print(f"warning: no speaker_encoder.* tensors found in {ckpt_path}") + else: + BN_EPS = 1e-5 # torch.nn.BatchNorm default + + # Group BN tensors by their prefix (everything before the final component). + # A BN module contributes: weight (optional, affine=True), bias (optional), + # running_mean, running_var, num_batches_tracked (ignored). + bn_groups: dict[str, dict[str, torch.Tensor]] = {} + for k in speaker_keys: + parts = k.rsplit(".", 1) + if len(parts) == 2 and parts[1] in ("weight", "bias", "running_mean", + "running_var", "num_batches_tracked"): + bn_groups.setdefault(parts[0], {})[parts[1]] = state[k] + + # A key is BN-owned iff its group has running_mean AND running_var. + bn_prefixes = {p for p, t in bn_groups.items() + if "running_mean" in t and "running_var" in t} + + n_bn = 0 + n_conv = 0 + for k in speaker_keys: + parts = k.rsplit(".", 1) + prefix, last = (parts[0], parts[1]) if len(parts) == 2 else (k, "") + + # Skip training-only counters. + if last == "num_batches_tracked": + continue + + gguf_base = "campplus/" + prefix.removeprefix("speaker_encoder.").replace(".", "/") + + if prefix in bn_prefixes: + if last in ("weight", "bias"): + # Skip the raw gamma/beta; we'll emit the fused scale/shift + # once per group when we hit running_mean. 
+ continue + if last == "running_var": + continue + if last == "running_mean": + grp = bn_groups[prefix] + mean = grp["running_mean"].float() + var = grp["running_var"].float() + denom = torch.sqrt(var + BN_EPS) + if "weight" in grp and "bias" in grp: + gamma = grp["weight"].float() + beta = grp["bias"].float() + scale = gamma / denom + shift = beta - mean * scale + else: + # BatchNorm1d(..., affine=False) — only running stats. + scale = 1.0 / denom + shift = -mean * scale + add_tensor_maybe_q(writer, gguf_base + "/s", + np.ascontiguousarray(scale.numpy().astype(np.float32)), + args.quant, stats=qstats) + add_tensor_maybe_q(writer, gguf_base + "/b", + np.ascontiguousarray(shift.numpy().astype(np.float32)), + args.quant, stats=qstats) + n_bn += 1 + continue + + # Non-BN tensor: export as-is (F32). + gguf_name = "campplus/" + k.removeprefix("speaker_encoder.").replace(".", "/") + add_tensor_maybe_q(writer, gguf_name, as_numpy(state[k], dtype=torch.float32), args.quant, stats=qstats) + n_conv += 1 + + # Hyperparameters. CAMPPlus() is instantiated with the defaults in + # s3gen.py, so hard-code them here to avoid re-encoding in C++. + writer.add_uint32("campplus.feat_dim", 80) + writer.add_uint32("campplus.embedding_size", 192) + writer.add_uint32("campplus.growth_rate", 32) + writer.add_uint32("campplus.bn_size", 4) + writer.add_uint32("campplus.init_channels", 128) + writer.add_uint32("campplus.block1_layers", 12) + writer.add_uint32("campplus.block2_layers", 24) + writer.add_uint32("campplus.block3_layers", 16) + writer.add_uint32("campplus.block1_dilation", 1) + writer.add_uint32("campplus.block2_dilation", 2) + writer.add_uint32("campplus.block3_dilation", 2) + writer.add_uint32("campplus.kernel_size", 3) + writer.add_uint32("campplus.seg_pool_len", 100) + writer.add_uint32("campplus.sample_rate", 16000) + + # Kaldi-style mel filterbank (80 bins, 16 kHz, n_fft=512, low=20 Hz, + # high=8000 Hz). 
Used by the C++ fbank_kaldi_80 implementation in + # src/voice_features.cpp to replace torchaudio.compliance.kaldi.fbank + # at runtime. Formula: triangular filters equally spaced in mel-space + # (Kaldi mel: 1127 * log(1 + f/700)), evaluated at each FFT bin's + # linear frequency. + SR = 16000 + NFFT = 512 + N_MELS = 80 + LOW = 20.0 + HIGH = 8000.0 + mel_low = 1127.0 * np.log(1.0 + LOW / 700.0) + mel_high = 1127.0 * np.log(1.0 + HIGH / 700.0) + mel_delta = (mel_high - mel_low) / (N_MELS + 1) + bin_freq = np.arange(NFFT // 2 + 1, dtype=np.float64) * SR / NFFT + bin_mel = 1127.0 * np.log(1.0 + bin_freq / 700.0) + kaldi_fb = np.zeros((N_MELS, NFFT // 2 + 1), dtype=np.float32) + for m in range(N_MELS): + mel_center = mel_low + (m + 1) * mel_delta + mel_lo = mel_center - mel_delta + mel_hi = mel_center + mel_delta + for k, mb in enumerate(bin_mel): + if mb < mel_lo or mb > mel_hi: + continue + if mb <= mel_center: + kaldi_fb[m, k] = (mb - mel_lo) / (mel_center - mel_lo) + else: + kaldi_fb[m, k] = (mel_hi - mb) / (mel_hi - mel_center) + add_tensor_maybe_q(writer, "campplus/mel_fb_kaldi_80", np.ascontiguousarray(kaldi_fb), args.quant, stats=qstats) + print(f"Embedded CAMPPlus: {n_conv} conv/linear tensors + {n_bn} fused BNs " + f"+ kaldi mel filterbank {kaldi_fb.shape}") + + # ------------------------------------------------------------------------- + # S3TokenizerV2 (FunASR speech-to-token encoder that produces the 25 Hz + # token stream Chatterbox uses for voice conditioning). 
103 raw tensors: + # tokenizer._mel_filters (128, 201) librosa mel fb + # tokenizer.encoder.conv{1,2}.{weight,bias} + # tokenizer.encoder.blocks.{0..5}.* (16 tensors each × 6 = 96) + # tokenizer.quantizer._codebook.project_down.{weight,bias} + # ------------------------------------------------------------------------- + tok_keys = [k for k in state if k.startswith("tokenizer.")] + if not tok_keys: + print(f"warning: no tokenizer.* tensors found in {ckpt_path}") + else: + n_tok = 0 + for k in tok_keys: + rest = k[len("tokenizer."):] + # Skip window buffer (we recompute it). + if rest in ("window",): + continue + if rest == "_mel_filters": + gguf_name = "s3tokv2/mel_fb" + else: + gguf_name = "s3tokv2/" + rest.replace(".", "/") + add_tensor_maybe_q(writer, gguf_name, as_numpy(state[k], dtype=torch.float32), args.quant, stats=qstats) + n_tok += 1 + + writer.add_uint32("s3tokv2.n_mels", 128) + writer.add_uint32("s3tokv2.n_audio_state", 1280) + writer.add_uint32("s3tokv2.n_audio_head", 20) + writer.add_uint32("s3tokv2.n_audio_layer", 6) + writer.add_uint32("s3tokv2.head_dim", 64) + writer.add_uint32("s3tokv2.mlp_ratio", 4) + writer.add_uint32("s3tokv2.fsmn_kernel", 31) + writer.add_uint32("s3tokv2.fsq_levels", 3) + writer.add_uint32("s3tokv2.fsq_dim", 8) + writer.add_uint32("s3tokv2.codebook_size", 3 ** 8) + writer.add_uint32("s3tokv2.conv_stride", 2) + writer.add_uint32("s3tokv2.n_fft", 400) + writer.add_uint32("s3tokv2.hop", 160) + writer.add_uint32("s3tokv2.sample_rate", 16000) + writer.add_float32("s3tokv2.rope_theta", 10000.0) + writer.add_uint32("s3tokv2.rope_max_pos", 2048) + print(f"Embedded S3TokenizerV2: {n_tok} tensors") + + n_flow = sum(1 for k in state if k.startswith("flow.")) - sum(1 for k in state if k.startswith("flow.decoder.estimator.")) + n_cfm = len(decoder_keys) + n_hift = len(mel2wav_keys) + print(f"Wrote: encoder(+proj)~{n_flow} tensors, cfm={n_cfm}, hift={n_hift}") + + writer.write_header_to_file() + writer.write_kv_data_to_file() + 
writer.write_tensors_to_file() + writer.close() + + out_size_mb = args.out.stat().st_size / (1024 * 1024) + print(f"\nOutput: {args.out} ({out_size_mb:.0f} MB)") + if args.quant not in ("f16", "f32") and qstats is not None: + print(f" --quant {args.quant}: {qstats['n_quant']} tensors block-quantized " + f"(policy matches scripts/requantize-gguf.py; embeddings, voice encoders, " + f"norms/biases, and filterbanks kept at full precision)") + + +if __name__ == "__main__": + main() diff --git a/packages/tts-ggml/scripts/convert-supertonic2-to-gguf.py b/packages/tts-ggml/scripts/convert-supertonic2-to-gguf.py new file mode 100644 index 0000000000..bbeabb478d --- /dev/null +++ b/packages/tts-ggml/scripts/convert-supertonic2-to-gguf.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +"""Convert official Supertonic 2 ONNX/assets into a single GGUF file. + +This is intentionally model-specific. The GGUF stores every ONNX initializer +and tensor-valued Constant under short ggml-safe names, plus metadata arrays +mapping those short names back to their source ONNX names. The C++ runtime can +therefore ask for a tensor by its original ONNX source name without relying on +long ggml tensor names. 
+""" + +from __future__ import annotations + +import argparse +import hashlib +import json +from pathlib import Path +from typing import Iterable + +import numpy as np +import onnx +from onnx import numpy_helper + +try: + import gguf +except ImportError as exc: # pragma: no cover - user environment guard + raise SystemExit("error: Python package 'gguf' is required; install with `pip install gguf`.") from exc + + +STAGES = ( + ("duration", "duration_predictor.onnx"), + ("text_encoder", "text_encoder.onnx"), + ("vector_estimator", "vector_estimator.onnx"), + ("vocoder", "vocoder.onnx"), +) +REQUIRED_ONNX = tuple(filename for _, filename in STAGES) +HF_ALLOW_PATTERNS = ( + "*.onnx", + "*.json", + "*.bin", + "*.data", + "**/*.onnx", + "**/*.json", + "**/*.bin", + "**/*.data", +) + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Convert Supertonic 2 ONNX/assets to GGUF.") + p.add_argument("--onnx-dir", type=Path, default=None, + help="Directory containing the four Supertonic ONNX files and tts.json. " + "If omitted, downloads --repo-id from Hugging Face first.") + p.add_argument("--assets-dir", type=Path, default=None, + help="Directory containing unicode_indexer.json and voice_styles/. " + "Defaults to --onnx-dir if present, otherwise ../../assets relative to --onnx-dir.") + p.add_argument("--out", type=Path, default=Path("models/supertonic2.gguf")) + p.add_argument("--arch", default="supertonic2", choices=("supertonic", "supertonic2"), + help="Model family metadata. Use 'supertonic' for the English-only HF bundle.") + p.add_argument("--repo-id", default=None, + help="Hugging Face repo to download when --onnx-dir is omitted. 
" + "Defaults to Supertone/supertonic-2 or Supertone/supertonic based on --arch.") + p.add_argument("--download-dir", type=Path, default=None, + help="Optional local directory for the Hugging Face snapshot download.") + p.add_argument("--hf-token", default=None, help="Optional Hugging Face token.") + p.add_argument("--local-files-only", action="store_true", + help="Use only the local Hugging Face cache when downloading.") + p.add_argument("--reference-repo", default=None, + help="HF repo/source metadata. Defaults from --arch.") + p.add_argument("--default-voice", default=None, + help="Default voice metadata. Defaults to F1 when present, otherwise first voice.") + p.add_argument("--default-steps", type=int, default=None, + help="Default denoising steps metadata. Defaults to 5 to match reference dumps and examples.") + p.add_argument("--default-speed", type=float, default=1.05, + help="Default speed metadata.") + p.add_argument("--ftype", choices=("f32", "f16", "q8_0"), default="f32", + help="Weight storage type. f32 is required by the current scalar reference backend; " + "f16/q8_0 are intended for the GGML graph backend.") + p.add_argument("--language-wrap-mode", choices=("none", "prefix", "open_close"), default=None, + help="Text wrapping metadata. Defaults to none for --arch supertonic and open_close for supertonic2.") + p.add_argument("--no-language-wrap", action="store_true", + help="Store metadata telling runtimes not to wrap text as ... . 
def default_repo_for_arch(arch: str) -> str:
    """Return the Hugging Face repo id used when --repo-id is not given."""
    if arch == "supertonic":
        return "Supertone/supertonic"
    return "Supertone/supertonic-2"


def download_hf_snapshot(repo_id: str,
                         token: str | None,
                         download_dir: Path | None,
                         local_files_only: bool) -> Path:
    """Fetch the files matching HF_ALLOW_PATTERNS from ``repo_id``.

    Returns the local snapshot directory reported by huggingface_hub.
    Exits with a readable message when huggingface_hub is not installed.
    """
    try:
        from huggingface_hub import snapshot_download
    except ImportError as exc:  # pragma: no cover - user environment guard
        raise SystemExit(
            "error: Python package 'huggingface_hub' is required for automatic download; "
            "install with `pip install huggingface_hub` or pass --onnx-dir."
        ) from exc

    # ``local_dir`` is only forwarded when the caller asked for a specific
    # download location; otherwise the library's own cache layout is used.
    optional = {} if download_dir is None else {"local_dir": str(download_dir)}
    snapshot_path = snapshot_download(
        repo_id=repo_id,
        token=token,
        allow_patterns=list(HF_ALLOW_PATTERNS),
        local_files_only=local_files_only,
        **optional,
    )
    return Path(snapshot_path)


def contains_required_onnx(path: Path) -> bool:
    """True when every file named in REQUIRED_ONNX exists directly in ``path``."""
    for onnx_name in REQUIRED_ONNX:
        if not (path / onnx_name).exists():
            return False
    return True


def resolve_onnx_dir(repo_root: Path) -> Path:
    """Locate the directory under ``repo_root`` that holds all required ONNX files.

    Known layouts are probed first; after that, any directory containing
    duration_predictor.onnx is considered. Raises FileNotFoundError when no
    directory has the full set.
    """
    well_known = (
        repo_root / "onnx_models" / "onnx",
        repo_root / "onnx",
        repo_root / "onnx_models",
        repo_root,
    )
    for folder in well_known:
        if contains_required_onnx(folder):
            return folder

    # Fallback: recursive search keyed on one of the required files.
    for hit in repo_root.rglob("duration_predictor.onnx"):
        folder = hit.parent
        if contains_required_onnx(folder):
            return folder

    required = ", ".join(REQUIRED_ONNX)
    raise FileNotFoundError(f"could not find Supertonic ONNX directory under {repo_root}; required: {required}")
def resolve_assets_dir(onnx_dir: Path, assets_dir: Path | None, repo_root: Path | None = None) -> Path:
    """Pick the directory expected to hold unicode_indexer.json / voice_styles.

    Precedence: an explicit ``assets_dir``; ``onnx_dir`` itself when it contains
    unicode_indexer.json; ``repo_root/assets``; ``onnx_dir/../assets``; and
    finally ``onnx_dir/../../assets`` (returned without an existence check).
    """
    if assets_dir is not None:
        return assets_dir
    if (onnx_dir / "unicode_indexer.json").exists():
        return onnx_dir

    fallbacks = []
    if repo_root is not None:
        fallbacks.append(repo_root / "assets")
    fallbacks.append(onnx_dir.parent / "assets")
    for folder in fallbacks:
        if folder.exists():
            return folder
    return onnx_dir.parent.parent / "assets"


def resolve_unicode_indexer(onnx_dir: Path, assets_dir: Path, repo_root: Path | None = None) -> Path:
    """Return the first existing unicode_indexer.json among the known locations."""
    search = [assets_dir / "unicode_indexer.json", onnx_dir / "unicode_indexer.json"]
    if repo_root is not None:
        search += [repo_root / "unicode_indexer.json", repo_root / "assets" / "unicode_indexer.json"]
    found = next((path for path in search if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f"unicode_indexer.json not found under {assets_dir} or {onnx_dir}")
    return found


def resolve_voice_styles_dir(onnx_dir: Path, assets_dir: Path, repo_root: Path | None = None) -> Path:
    """Return the first existing voice_styles directory among the known locations."""
    search = [assets_dir / "voice_styles", onnx_dir / "voice_styles", onnx_dir.parent / "voice_styles"]
    if repo_root is not None:
        search += [repo_root / "voice_styles", repo_root / "assets" / "voice_styles"]
    found = next((path for path in search if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f"voice_styles/ not found under {assets_dir}, {onnx_dir}, or {onnx_dir.parent}")
    return found
def tensor_sha256(arr: np.ndarray) -> str:
    """Return the hex SHA-256 digest of the array's raw bytes in C order."""
    # Viewing as uint8 exposes the underlying buffer without copying it.
    data = np.ascontiguousarray(arr).view(np.uint8)
    return hashlib.sha256(data).hexdigest()


# Tensors with fewer elements than this (or with rank < 2) are never stored
# below F32 precision.
_MIN_LOW_PRECISION_ELEMENTS = 256


def _keep_full_precision(arr: np.ndarray) -> bool:
    """True for bias/norm/scalar-like tensors that should stay in their source dtype."""
    return arr.ndim < 2 or arr.size < _MIN_LOW_PRECISION_ELEMENTS


def prepare_weight_tensor(arr: np.ndarray, ftype: str) -> tuple[np.ndarray, tuple[int, ...] | None, "gguf.GGMLQuantizationType | None"]:
    """Convert one tensor to the requested storage type.

    Args:
        arr: tensor as loaded from ONNX.
        ftype: one of "f32", "f16", "q8_0".

    Returns:
        ``(stored, raw_shape, raw_dtype)`` ready to pass to
        ``GGUFWriter.add_tensor``. ``raw_shape`` is always None; ``raw_dtype``
        is only set when ``stored`` holds Q8_0-quantized bytes.

    Non-floating tensors and ftype "f32" are passed through untouched. For
    both reduced-precision modes, small/vector tensors are left as-is:
    converting bias/norm/scalar tensors hurts parity and gives little
    size/speed benefit. In f16 mode a tensor is also left as-is when the cast
    would turn finite values into inf, since that would corrupt the constant.

    Raises:
        ValueError: ``ftype`` is not one of the supported names.
    """
    if ftype == "f32" or not np.issubdtype(arr.dtype, np.floating):
        return arr, None, None
    if ftype == "f16":
        # Same guard as q8_0: tiny constants (e.g. an epsilon of 1e-8) would
        # flush to zero in half precision.
        if _keep_full_precision(arr):
            return arr, None, None
        with np.errstate(over="ignore", invalid="ignore"):
            half = arr.astype(np.float16)
        # A finite source value that became non-finite means the tensor does
        # not fit the float16 range; keep the original precision instead.
        if not np.array_equal(np.isfinite(half), np.isfinite(arr)):
            return arr, None, None
        return np.ascontiguousarray(half), None, None
    if ftype == "q8_0":
        if _keep_full_precision(arr):
            return arr, None, None
        qtype = gguf.GGMLQuantizationType.Q8_0
        try:
            q = gguf.quantize(np.ascontiguousarray(arr.astype(np.float32)), qtype)
        except gguf.QuantError:
            # Shape not compatible with the quantization block size.
            return arr, None, None
        return q, None, qtype
    raise ValueError(f"unsupported ftype: {ftype}")


def tensor_from_attribute(attr: onnx.AttributeProto) -> np.ndarray | None:
    """Turn an ONNX node attribute into an array, or None for unsupported kinds.

    TENSOR attributes are decoded with ``numpy_helper``; FLOAT/FLOATS become
    float32 arrays. INT/INTS become int64 arrays so that large values (for
    example an INT64_MAX sentinel) are not wrapped or rejected. The caller's
    ``as_contiguous`` narrows int64 to int32 whenever that is lossless, so
    small values still end up stored as int32.
    """
    if attr.type == onnx.AttributeProto.TENSOR:
        return numpy_helper.to_array(attr.t)
    if attr.type == onnx.AttributeProto.FLOAT:
        return np.asarray([attr.f], dtype=np.float32)
    if attr.type == onnx.AttributeProto.FLOATS:
        return np.asarray(attr.floats, dtype=np.float32)
    if attr.type == onnx.AttributeProto.INT:
        return np.asarray([attr.i], dtype=np.int64)
    if attr.type == onnx.AttributeProto.INTS:
        return np.asarray(attr.ints, dtype=np.int64)
    return None
def add_json_metadata(writer: "gguf.GGUFWriter", prefix: str, data: dict) -> None:
    """Store ``data`` under the metadata key ``prefix`` as compact JSON text."""
    writer.add_string(prefix, json.dumps(data, ensure_ascii=False, separators=(",", ":")))


def main() -> int:
    """Convert a Supertonic ONNX bundle plus its assets into one GGUF file.

    Steps: resolve (or download) the input directories, write model metadata,
    embed the unicode indexer and every voice style, then copy each ONNX
    initializer/Constant under a short tensor name with parallel metadata
    arrays mapping short names back to source names, shapes, dtypes and
    hashes. With --validate the written file is re-opened and its tensor
    count checked.

    Returns 0 on success.

    Raises:
        FileNotFoundError: inputs are missing, including an empty voice_styles
            directory.
        RuntimeError: --validate found an unexpected tensor count.
    """
    args = parse_args()
    repo_root: Path | None = None
    repo_id = args.repo_id or default_repo_for_arch(args.arch)
    if args.onnx_dir is None:
        print(f"Downloading {repo_id} from Hugging Face (cached by huggingface_hub)")
        repo_root = download_hf_snapshot(repo_id, args.hf_token, args.download_dir, args.local_files_only)
        args.onnx_dir = resolve_onnx_dir(repo_root)
    else:
        args.onnx_dir = args.onnx_dir.resolve()

    assets_dir = resolve_assets_dir(args.onnx_dir, args.assets_dir, repo_root)
    unicode_path = resolve_unicode_indexer(args.onnx_dir, assets_dir, repo_root)
    voice_styles_dir = resolve_voice_styles_dir(args.onnx_dir, assets_dir, repo_root)
    tts_json_path = resolve_tts_json(args.onnx_dir, repo_root)

    # Fail early and clearly: the voice-name list is written as a GGUF
    # metadata array, and the gguf writer is understood to reject empty
    # arrays at write time, i.e. only after all tensors have been processed.
    voice_paths = sorted(voice_styles_dir.glob("*.json"))
    if not voice_paths:
        raise FileNotFoundError(f"no voice style *.json files found in {voice_styles_dir}")

    args.out.parent.mkdir(parents=True, exist_ok=True)

    print(f"Using ONNX directory: {args.onnx_dir}")
    print(f"Using assets directory: {assets_dir}")
    # Read JSON explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    cfg = json.loads(tts_json_path.read_text(encoding="utf-8"))
    unicode_indexer = np.asarray(json.loads(unicode_path.read_text(encoding="utf-8")), dtype=np.int32)

    reference_repo = args.reference_repo or repo_id
    writer = gguf.GGUFWriter(str(args.out), args.arch)
    writer.add_name("Supertonic" if args.arch == "supertonic" else "Supertonic 2")
    writer.add_description(f"{reference_repo} ONNX weights/assets converted for a model-specific ggml runtime.")
    writer.add_string("supertonic.arch", args.arch)
    writer.add_string("supertonic.reference_repo", reference_repo)
    writer.add_string("supertonic.ftype", args.ftype)
    writer.add_string("supertonic.tts_version", str(cfg.get("tts_version", "")))
    writer.add_string("supertonic.split", str(cfg.get("split", "")))
    writer.add_uint32("supertonic.sample_rate", int(cfg["ae"]["sample_rate"]))
    writer.add_uint32("supertonic.base_chunk_size", int(cfg["ae"]["base_chunk_size"]))
    writer.add_uint32("supertonic.ttl_chunk_compress_factor", int(cfg["ttl"]["chunk_compress_factor"]))
    writer.add_uint32("supertonic.latent_dim", int(cfg["ttl"]["latent_dim"]))
    writer.add_uint32(
        "supertonic.latent_channels",
        int(cfg["ttl"]["latent_dim"]) * int(cfg["ttl"]["chunk_compress_factor"]),
    )
    # --no-language-wrap wins over --language-wrap-mode; otherwise the
    # default depends on the model family.
    wrap_mode = "none" if args.no_language_wrap else (args.language_wrap_mode or ("none" if args.arch == "supertonic" else "open_close"))
    default_steps = args.default_steps if args.default_steps is not None else 5

    writer.add_uint32("supertonic.default_steps", default_steps)
    writer.add_float32("supertonic.default_speed", args.default_speed)
    writer.add_uint32("supertonic.language_wrap", 0 if wrap_mode == "none" else 1)
    writer.add_string("supertonic.language_wrap_mode", wrap_mode)
    writer.add_array("supertonic.languages", ["en", "ko", "es", "pt", "fr"])
    add_json_metadata(writer, "supertonic.tts_json", cfg)

    writer.add_tensor("supertonic/unicode_indexer", unicode_indexer)

    voice_names: list[str] = []
    for voice_path in voice_paths:
        voice_name = voice_path.stem
        voice = json.loads(voice_path.read_text(encoding="utf-8"))
        ttl = as_contiguous(np.asarray(voice["style_ttl"]["data"], dtype=np.float32))
        dp = as_contiguous(np.asarray(voice["style_dp"]["data"], dtype=np.float32))
        writer.add_tensor(f"supertonic/voices/{voice_name}/ttl", ttl)
        writer.add_tensor(f"supertonic/voices/{voice_name}/dp", dp)
        add_json_metadata(writer, f"supertonic.voice.{voice_name}.metadata", voice.get("metadata", {}))
        voice_names.append(voice_name)
    writer.add_array("supertonic.voice_names", voice_names)
    default_voice = args.default_voice or ("F1" if "F1" in voice_names else voice_names[0])
    writer.add_string("supertonic.default_voice", default_voice)

    # One unicode-indexer tensor plus (ttl, dp) per voice.
    asset_tensor_count = 1 + 2 * len(voice_names)

    tensor_names: list[str] = []
    source_names: list[str] = []
    tensor_shapes: list[str] = []
    tensor_dtypes: list[str] = []
    tensor_hashes: list[str] = []
    per_stage_counts: dict[str, int] = {}
    total_bytes = 0

    for stage, filename in STAGES:
        count = 0
        for source_name, arr in iter_onnx_tensors(args.onnx_dir / filename):
            # Short, ggml-safe tensor name; the source name is kept in the
            # parallel metadata array at the same index.
            short_name = f"supertonic/{stage}/t{count:04d}"
            source_key = f"{stage}:{source_name}"
            stored, raw_shape, raw_dtype = prepare_weight_tensor(arr, args.ftype)
            writer.add_tensor(short_name, stored, raw_shape=raw_shape, raw_dtype=raw_dtype)
            tensor_names.append(short_name)
            source_names.append(source_key)
            # Logical shape of the source tensor, not of the stored bytes.
            tensor_shapes.append(json.dumps(list(arr.shape), separators=(",", ":")))
            tensor_dtypes.append(str(raw_dtype.name if raw_dtype is not None else stored.dtype))
            tensor_hashes.append(tensor_sha256(stored))
            total_bytes += stored.nbytes
            count += 1
        per_stage_counts[stage] = count
        print(f"{stage:16s} {count:5d} tensors")

    writer.add_array("supertonic.tensor_names", tensor_names)
    writer.add_array("supertonic.source_names", source_names)
    writer.add_array("supertonic.tensor_shapes", tensor_shapes)
    writer.add_array("supertonic.tensor_dtypes", tensor_dtypes)
    writer.add_array("supertonic.tensor_sha256", tensor_hashes)
    for stage, count in per_stage_counts.items():
        writer.add_uint32(f"supertonic.{stage}.tensor_count", count)

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    print(f"Wrote {len(tensor_names)} ONNX tensors + {asset_tensor_count} asset tensors")
    print(f" output: {args.out}")
    print(f" source tensor bytes: {total_bytes / 1e6:.1f} MB")

    if args.validate:
        reader = gguf.GGUFReader(args.out, "r")
        expected = len(tensor_names) + asset_tensor_count
        if len(reader.tensors) != expected:
            raise RuntimeError(
                f"tensor count mismatch: got {len(reader.tensors)}, "
                f"expected {expected}"
            )
        print("Validation: tensor count OK")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Keeps the deny-list in one place so adding a new + tensor name to T3 doesn't accidentally leak into a quantised slot.""" + path = Path(__file__).resolve().parent / "requantize-gguf.py" + spec = importlib.util.spec_from_file_location("_chatterbox_requantize_policy", path) + if spec is None or spec.loader is None: + print(f"error: could not load quant policy from {path}", file=sys.stderr) + sys.exit(1) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod.should_quantize, mod._QUANT_TYPE + + +_SHOULD_QUANTIZE, _RQ_QUANT_TYPE = _load_requantize_policy() + + +REPO_ID = "ResembleAI/chatterbox" +ALLOW_PATTERNS = [ + "ve.pt", + "t3_mtl23ls_v2.safetensors", + "s3gen.pt", + "grapheme_mtl_merged_expanded_v1.json", + "conds.pt", + "Cangjie5_TC.json", +] + +# All language codes the *Python reference tokenizer* accepts. The C++ +# tokenizer in src/mtl_tokenizer.cpp only honors the tier-1 subset (18 +# of these); the other 5 (ja, he, ru, zh, hi) need external preprocessing +# (pykakasi / dicta / russian_text_stresser / Cangjie) and hard-error at +# runtime. See mtl_tokenizer::supported_languages() for the runtime list. 
+ALL_KNOWN_LANGUAGES = [ + "ar", "da", "de", "el", "en", "es", "fi", "fr", "he", "hi", + "it", "ja", "ko", "ms", "nl", "no", "pl", "pt", "ru", "sv", + "sw", "tr", "zh", +] + +N_EMBD = 1024 +N_HEAD = 16 +N_KV_HEAD = 16 +HEAD_DIM = 64 +N_LAYER = 30 +INTERMEDIATE_SIZE = 4096 +TEXT_VOCAB_SIZE = 2454 +SPEECH_VOCAB_SIZE = 8194 +START_SPEECH_TOKEN = 6561 +STOP_SPEECH_TOKEN = 6562 +START_TEXT_TOKEN = 255 +STOP_TEXT_TOKEN = 0 +MAX_TEXT_TOKENS = 2048 +MAX_SPEECH_TOKENS = 4096 +SPEECH_COND_PROMPT_LEN = 150 +SPEAKER_EMBED_SIZE = 256 +PERCEIVER_QUERY_TOKENS = 32 +PERCEIVER_QUERY_SIZE = 1024 +PERCEIVER_NUM_HEADS = 4 +RMS_NORM_EPS = 1e-5 +ROPE_THETA = 500000.0 +ROPE_SCALING_FACTOR = 8.0 +ROPE_LOW_FREQ_FACTOR = 1.0 +ROPE_HIGH_FREQ_FACTOR = 4.0 +ROPE_ORIGINAL_MAX_POS = 8192 + +N_CTX = MAX_TEXT_TOKENS + MAX_SPEECH_TOKENS + 4 + +LAYER_RE = re.compile(r"^tfmr\.layers\.(\d+)\.(.+)$") + +QUANT_CHOICES = ["f16", "q8_0", "q5_0", "q4_0"] + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Convert Chatterbox multilingual T3 weights to GGUF.") + p.add_argument("--ckpt-dir", type=Path, help="Local checkpoint dir (downloads from HF if omitted).") + p.add_argument("--out", type=Path, default=Path("models/chatterbox-t3-mtl.gguf")) + p.add_argument("--hf-token", default=None) + p.add_argument("--quant", choices=QUANT_CHOICES, default="f16", + help="Weight dtype for the big 2-D matmul weights. f16 keeps " + "the GGUF byte-identical to the legacy default. 
def as_numpy(tensor: torch.Tensor, *, dtype=None, transpose: bool = False) -> np.ndarray:
    """Return ``tensor`` as a C-contiguous NumPy array.

    ``dtype`` (a torch dtype) is applied before conversion when given;
    ``transpose`` applies NumPy's ``.T`` to the converted array.
    """
    converted = tensor if dtype is None else tensor.to(dtype)
    result = converted.detach().cpu().numpy()
    return np.ascontiguousarray(result.T if transpose else result)
def map_llama_layer(name: str):
    """Return (gguf_name, dtype, transpose) for a Llama backbone tensor, or None.

    Only keys matching LAYER_RE whose per-layer suffix is one of the known
    norm or projection weights are mapped. ``transpose`` is always False:
    Llama uses nn.Linear everywhere, PyTorch stores those as (out, in), and
    ggml's axis reversal (numpy.shape[0] <-> ne[1]) already yields (in, out),
    so the ggml_can_mul_mat(weight, x) check (ne[0] must match) lines up.
    """
    match = LAYER_RE.match(name)
    if match is None:
        return None
    layer = int(match.group(1))
    param = match.group(2)

    # Norm gains are kept in float32.
    norm_targets = {
        "input_layernorm.weight": "ln_attn/g",
        "post_attention_layernorm.weight": "ln_mlp/g",
    }
    # Attention / MLP projection matrices are stored as float16.
    matmul_targets = {
        "self_attn.q_proj.weight": "attn/q/w",
        "self_attn.k_proj.weight": "attn/k/w",
        "self_attn.v_proj.weight": "attn/v/w",
        "self_attn.o_proj.weight": "attn/o/w",
        "mlp.gate_proj.weight": "mlp/gate/w",
        "mlp.up_proj.weight": "mlp/up/w",
        "mlp.down_proj.weight": "mlp/down/w",
    }

    if param in norm_targets:
        return f"model/h{layer}/{norm_targets[param]}", torch.float32, False
    if param in matmul_targets:
        return f"model/h{layer}/{matmul_targets[param]}", torch.float16, False
    return None
def write_metadata(writer: gguf.GGUFWriter, quant: str) -> None:
    """Write the general and ``chatterbox.*`` hyperparameter keys.

    ``quant`` is recorded verbatim under ``chatterbox.quantization``.
    """
    writer.add_name("Chatterbox Multilingual T3")
    writer.add_description("Chatterbox multilingual text-to-speech token generator (23 languages) for ggml.")
    writer.add_context_length(N_CTX)
    writer.add_embedding_length(N_EMBD)
    writer.add_block_count(N_LAYER)
    writer.add_head_count(N_HEAD)
    # Vocab size is published only as `chatterbox.text_vocab_size`; the
    # GGUF-standard `general.vocab_size` is deliberately not written so that
    # there is a single canonical entry that cannot drift.

    # (writer method, key, value), emitted in exactly this order.
    entries = (
        ("add_string", "chatterbox.variant", "t3_mtl"),
        ("add_string", "chatterbox.backbone", "llama_520m"),
        ("add_uint32", "chatterbox.n_ctx", N_CTX),
        ("add_uint32", "chatterbox.n_embd", N_EMBD),
        ("add_uint32", "chatterbox.n_head", N_HEAD),
        ("add_uint32", "chatterbox.n_kv_head", N_KV_HEAD),
        ("add_uint32", "chatterbox.head_dim", HEAD_DIM),
        ("add_uint32", "chatterbox.n_layer", N_LAYER),
        ("add_uint32", "chatterbox.intermediate_size", INTERMEDIATE_SIZE),
        ("add_uint32", "chatterbox.text_vocab_size", TEXT_VOCAB_SIZE),
        ("add_uint32", "chatterbox.speech_vocab_size", SPEECH_VOCAB_SIZE),
        ("add_uint32", "chatterbox.start_speech_token", START_SPEECH_TOKEN),
        ("add_uint32", "chatterbox.stop_speech_token", STOP_SPEECH_TOKEN),
        ("add_uint32", "chatterbox.start_text_token", START_TEXT_TOKEN),
        ("add_uint32", "chatterbox.stop_text_token", STOP_TEXT_TOKEN),
        ("add_uint32", "chatterbox.max_text_tokens", MAX_TEXT_TOKENS),
        ("add_uint32", "chatterbox.max_speech_tokens", MAX_SPEECH_TOKENS),
        ("add_uint32", "chatterbox.speech_cond_prompt_len", SPEECH_COND_PROMPT_LEN),
        ("add_uint32", "chatterbox.speaker_embed_size", SPEAKER_EMBED_SIZE),
        ("add_uint32", "chatterbox.perceiver_query_tokens", PERCEIVER_QUERY_TOKENS),
        ("add_uint32", "chatterbox.perceiver_query_size", PERCEIVER_QUERY_SIZE),
        ("add_uint32", "chatterbox.perceiver_num_heads", PERCEIVER_NUM_HEADS),
        ("add_bool", "chatterbox.emotion_adv", True),
        ("add_float32", "chatterbox.rms_norm_eps", RMS_NORM_EPS),
        ("add_float32", "chatterbox.rope_theta", ROPE_THETA),
        ("add_string", "chatterbox.rope.scaling_type", "llama3"),
        ("add_float32", "chatterbox.rope.scaling_factor", ROPE_SCALING_FACTOR),
        ("add_float32", "chatterbox.rope.low_freq_factor", ROPE_LOW_FREQ_FACTOR),
        ("add_float32", "chatterbox.rope.high_freq_factor", ROPE_HIGH_FREQ_FACTOR),
        ("add_uint32", "chatterbox.rope.original_max_position", ROPE_ORIGINAL_MAX_POS),
        ("add_string", "chatterbox.reference_repo", REPO_ID),
        ("add_string", "chatterbox.quantization", quant),
    )
    for method_name, key, value in entries:
        getattr(writer, method_name)(key, value)


def write_tokenizer(writer: gguf.GGUFWriter, ckpt_dir: Path) -> None:
    """Embed the tokenizer JSON file verbatim plus the known language codes."""
    tokenizer_json = (ckpt_dir / "grapheme_mtl_merged_expanded_v1.json").read_text(encoding="utf-8")
    for key, value in (
        ("tokenizer.ggml.model", "mtl_grapheme"),
        ("tokenizer.ggml.mtl_json", tokenizer_json),
    ):
        writer.add_string(key, value)
    writer.add_array("tokenizer.ggml.mtl_languages", ALL_KNOWN_LANGUAGES)
    print(f"Embedded tokenizer JSON ({len(tokenizer_json)} bytes), {len(ALL_KNOWN_LANGUAGES)} languages")
{ve_mel_fb.shape}") + + +def main() -> None: + args = parse_args() + if args.ckpt_dir: + ckpt_dir = args.ckpt_dir + else: + ckpt_dir = Path(snapshot_download( + repo_id=REPO_ID, + token=args.hf_token or os.getenv("HF_TOKEN"), + allow_patterns=ALLOW_PATTERNS, + )) + args.out.parent.mkdir(parents=True, exist_ok=True) + + print(f"Loading checkpoint from {ckpt_dir}") + state = load_file(ckpt_dir / "t3_mtl23ls_v2.safetensors") + if "model" in state and not torch.is_tensor(state["model"]): + state = state["model"][0] + conds = torch.load(ckpt_dir / "conds.pt", map_location="cpu", weights_only=True) + + writer = gguf.GGUFWriter(str(args.out), "chatterbox") + write_metadata(writer, args.quant) + write_tokenizer(writer, ckpt_dir) + + exported = 0 + quantized = 0 + ignored = [] + for name, tensor in state.items(): + mapped = map_tensor(name) + if mapped is None: + ignored.append(name) + continue + gguf_name, dtype, transpose = mapped + arr = as_numpy(tensor, dtype=dtype, transpose=transpose) + written = add_maybe_quantized(writer, gguf_name, arr, args.quant) + exported += 1 + if written not in ("float32", "float16"): + quantized += 1 + print(f"{gguf_name:46s} {str(tuple(arr.shape)):22s} {written}") + + builtin_speaker = conds["t3"]["speaker_emb"].reshape(1, SPEAKER_EMBED_SIZE) + builtin_tokens = conds["t3"]["cond_prompt_speech_tokens"].reshape(-1).to(torch.int32) + writer.add_uint32("chatterbox.cond_prompt_length", int(builtin_tokens.numel())) + writer.add_tensor("chatterbox/builtin/speaker_emb", as_numpy(builtin_speaker, dtype=torch.float32)) + writer.add_tensor("chatterbox/builtin/cond_prompt_speech_tokens", as_numpy(builtin_tokens)) + + write_voice_encoder(writer, ckpt_dir) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file() + writer.close() + + out_size = args.out.stat().st_size + print(f"\nWrote {exported + 2} mapped tensors to {args.out} ({out_size / 1e6:.1f} MB)") + print(f" --quant {args.quant}: {quantized}/{exported} 
weight tensors quantized") + if ignored: + print("\nIgnored tensors (first 20):") + for n in ignored[:20]: + print(f" {n}") + if len(ignored) > 20: + print(f" ... and {len(ignored) - 20} more") + + +if __name__ == "__main__": + main() diff --git a/packages/tts-ggml/scripts/convert-t3-turbo-to-gguf.py b/packages/tts-ggml/scripts/convert-t3-turbo-to-gguf.py new file mode 100644 index 0000000000..578dc5cbb4 --- /dev/null +++ b/packages/tts-ggml/scripts/convert-t3-turbo-to-gguf.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python3 + +import argparse +import json +import re +from pathlib import Path + +import gguf +import numpy as np +import torch +from huggingface_hub import snapshot_download +from safetensors.torch import load_file + + +REPO_ID = "ResembleAI/chatterbox-turbo" +ALLOW_PATTERNS = ["*.safetensors", "*.json", "*.txt", "*.pt", "*.model"] + +TEXT_VOCAB_SIZE = 50276 +SPEECH_VOCAB_SIZE = 6563 +START_SPEECH_TOKEN = 6561 +STOP_SPEECH_TOKEN = 6562 +SPEAKER_EMBED_SIZE = 256 +N_CTX = 8196 +N_EMBD = 1024 +N_HEAD = 16 +N_LAYER = 24 +LAYER_NORM_EPS = 1e-5 + +LAYER_RE = re.compile(r"^tfmr\.h\.(\d+)\.(.+)$") + + +QUANT_CHOICES = ["f16", "q8_0", "q5_0", "q4_0"] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert Chatterbox Turbo T3 weights to GGUF.") + parser.add_argument("--ckpt-dir", type=Path, help="Local checkpoint dir (downloads from HF if omitted).") + parser.add_argument("--out", type=Path, default=Path("models/chatterbox-t3-turbo.gguf"), help="Output GGUF path.") + parser.add_argument("--hf-token", default=None, help="Optional Hugging Face token.") + parser.add_argument("--quant", choices=QUANT_CHOICES, default="f16", + help=("Weight dtype for attention + MLP + speech_head projections. " + "f16 (default, ~730 MB), q8_0 (~385 MB), q5_0 (~250 MB), " + "q4_0 (~205 MB). Biases, layer norms, embeddings and " + "positional embeddings always stay at their original dtype. 
" + "For K-quants (q4_k / q5_k / q6_k), run the resulting f16 " + "GGUF through llama.cpp's llama-quantize instead — the " + "Python gguf package doesn't implement them yet.")) + return parser.parse_args() + + +def as_numpy(tensor: torch.Tensor, *, dtype=None, transpose: bool = False) -> np.ndarray: + if dtype is not None: + tensor = tensor.to(dtype) + array = tensor.detach().cpu().numpy() + if transpose: + array = array.T + return np.ascontiguousarray(array) + + +# Which exported tensor names hold "big" 2-D projection weights that are +# worth quantizing. These are the ones ggml_mul_mat will consume; their +# inner (reduction) dimension is always a multiple of 256 for GPT-2 Medium +# (n_embd = 1024, inner_ffn = 4096), which is the block size requirement +# for Q4_K / Q5_K. +def _is_quantizable_weight(gguf_name: str) -> bool: + if gguf_name == "chatterbox/speech_head": + return True + # Per-layer: model/h{i}/attn/c_attn/w, c_proj/w, mlp/c_fc/w, mlp/c_proj/w + if gguf_name.startswith("model/h") and ( + gguf_name.endswith("/attn/c_attn/w") or + gguf_name.endswith("/attn/c_proj/w") or + gguf_name.endswith("/mlp/c_fc/w") or + gguf_name.endswith("/mlp/c_proj/w") + ): + return True + return False + + +# NOTE: the Python gguf 0.18 package only implements the "legacy" block +# types (Q4_0/1, Q5_0/1, Q8_0). The K-quants (Q4_K, Q5_K, Q6_K) are +# declared but NotImplementedError at runtime — use llama.cpp's +# llama-quantize tool on the F16 GGUF if you need those. +_QUANT_TYPE = { + "q8_0": gguf.GGMLQuantizationType.Q8_0, + "q5_0": gguf.GGMLQuantizationType.Q5_0, + "q4_0": gguf.GGMLQuantizationType.Q4_0, +} + + +def add_maybe_quantized(writer: "gguf.GGUFWriter", name: str, array: np.ndarray, quant: str): + """Pass F32/F16 arrays straight through; quantize the "big" projection + weights when --quant is not f16. 
def load_tokenizer_assets(ckpt_dir: Path):
    """Read vocab.json + merges.txt + added_tokens.json and return arrays
    ready to embed as GGUF metadata.

    Args:
        ckpt_dir: Directory holding ``vocab.json`` and ``merges.txt``;
            ``added_tokens.json`` is optional.

    Returns (tokens, types, merges):
        tokens: list[str], token text indexed by token id (ids missing from
                both vocab files are filled with "")
        types:  list[int], gguf TokenType (NORMAL, or USER_DEFINED for added tokens)
        merges: list[str], BPE merge rules in "left right" format (header skipped)
    """
    vocab = json.loads((ckpt_dir / "vocab.json").read_text(encoding="utf-8"))  # {token: id}

    added_path = ckpt_dir / "added_tokens.json"
    added = {}
    if added_path.exists():
        added = json.loads(added_path.read_text(encoding="utf-8"))

    # Added tokens are applied last, so they win when an id appears in both files.
    id_to_tok = {int(idx): tok for tok, idx in vocab.items()}
    for tok, idx in added.items():
        id_to_tok[int(idx)] = tok

    max_id = max(id_to_tok) if id_to_tok else -1
    tokens = []
    types = []
    for i in range(max_id + 1):
        tok = id_to_tok.get(i, "")
        tokens.append(tok)
        types.append(int(gguf.TokenType.USER_DEFINED) if tok in added else int(gguf.TokenType.NORMAL))

    merge_lines = (ckpt_dir / "merges.txt").read_text(encoding="utf-8").splitlines()
    # Only the FIRST line can be the "#version: ..." header. Real merge rules
    # may start with "#" too (e.g. "# #"), so a blanket startswith("#") filter
    # would silently drop them and change how "#" runs are tokenized.
    if merge_lines and merge_lines[0].startswith("#version"):
        merge_lines = merge_lines[1:]
    merges = [line for line in merge_lines if line]

    return tokens, types, merges
def map_tensor_name(name: str):
    """Map a checkpoint tensor name to its GGUF export description.

    Args:
        name: Key from the T3 turbo safetensors state dict.

    Returns:
        ``(gguf_name, torch_dtype, transpose)`` for tensors that are exported,
        or ``None`` for tensors that are skipped (``tfmr.wte.weight`` and any
        name not listed here).
    """
    if name == "tfmr.wte.weight":
        return None

    direct = {
        "tfmr.wpe.weight": ("model/wpe", torch.float32, False),
        "tfmr.ln_f.weight": ("model/ln_f/g", torch.float32, False),
        "tfmr.ln_f.bias": ("model/ln_f/b", torch.float32, False),
        "text_emb.weight": ("chatterbox/text_emb", torch.float16, False),
        "speech_emb.weight": ("chatterbox/speech_emb", torch.float16, False),
        "speech_head.weight": ("chatterbox/speech_head", torch.float16, False),
        "speech_head.bias": ("chatterbox/speech_head_bias", torch.float32, False),
        "cond_enc.spkr_enc.weight": ("chatterbox/cond_spkr/w", torch.float32, False),
        "cond_enc.spkr_enc.bias": ("chatterbox/cond_spkr/b", torch.float32, False),
    }
    if name in direct:
        return direct[name]

    match = LAYER_RE.match(name)
    if not match:
        return None

    layer_idx = int(match.group(1))
    suffix = match.group(2)

    # GPT-2 Conv1D weights need transposing; biases and LayerNorm do not
    table = {
        "ln_1.weight": ("model/h{}/ln_1/g", torch.float32, False),
        "ln_1.bias": ("model/h{}/ln_1/b", torch.float32, False),
        "ln_2.weight": ("model/h{}/ln_2/g", torch.float32, False),
        "ln_2.bias": ("model/h{}/ln_2/b", torch.float32, False),
        "attn.c_attn.weight": ("model/h{}/attn/c_attn/w", torch.float16, True),
        "attn.c_attn.bias": ("model/h{}/attn/c_attn/b", torch.float32, False),
        "attn.c_proj.weight": ("model/h{}/attn/c_proj/w", torch.float16, True),
        "attn.c_proj.bias": ("model/h{}/attn/c_proj/b", torch.float32, False),
        "mlp.c_fc.weight": ("model/h{}/mlp/c_fc/w", torch.float16, True),
        "mlp.c_fc.bias": ("model/h{}/mlp/c_fc/b", torch.float32, False),
        "mlp.c_proj.weight": ("model/h{}/mlp/c_proj/w", torch.float16, True),
        "mlp.c_proj.bias": ("model/h{}/mlp/c_proj/b", torch.float32, False),
    }
    if suffix not in table:
        return None
    fmt, dtype, transpose = table[suffix]
    return fmt.format(layer_idx), dtype, transpose


def _write_voice_encoder(writer: "gguf.GGUFWriter", ckpt_dir: Path) -> int:
    """Embed VoiceEncoder weights, hyper-parameters and mel filterbank.

    Returns:
        Number of tensors added to ``writer`` (weights plus the filterbank),
        or 0 when ``ve.safetensors`` is absent and only a warning is printed.
    """
    ve_path = ckpt_dir / "ve.safetensors"
    if not ve_path.exists():
        print(f"warning: no ve.safetensors at {ve_path}, skipping VoiceEncoder weights")
        return 0

    ve_state = load_file(ve_path)
    VE_HIDDEN = 256
    VE_INPUT = 40
    writer.add_uint32("voice_encoder.n_mels", VE_INPUT)
    writer.add_uint32("voice_encoder.hidden_size", VE_HIDDEN)
    writer.add_uint32("voice_encoder.num_layers", 3)
    writer.add_uint32("voice_encoder.embedding_size", VE_HIDDEN)  # proj is (256, 256)
    writer.add_uint32("voice_encoder.partial_frames", 160)
    writer.add_uint32("voice_encoder.sample_rate", 16000)
    writer.add_uint32("voice_encoder.n_fft", 400)
    writer.add_uint32("voice_encoder.hop_size", 160)
    writer.add_uint32("voice_encoder.win_size", 400)
    writer.add_float32("voice_encoder.overlap", 0.5)
    writer.add_float32("voice_encoder.rate", 1.3)
    writer.add_float32("voice_encoder.min_coverage", 0.8)

    n_weights = 0
    for k, t in ve_state.items():
        # Skip the cosine-similarity scaling parameters; they're only used
        # for training/CFG and don't affect embedding extraction.
        if k.startswith("similarity_"):
            continue
        writer.add_tensor(
            f"voice_encoder/{k.replace('.', '/')}",
            as_numpy(t, dtype=torch.float32),
        )
        n_weights += 1

    # Precomputed mel filterbank for the VE mel (40 channels @ 16 kHz,
    # n_fft=400). Matches librosa.filters.mel with fmin=0, fmax=8000.
    # librosa is imported lazily so it is only needed when VE weights exist.
    import librosa
    ve_mel_fb = librosa.filters.mel(
        sr=16000, n_fft=400, n_mels=VE_INPUT, fmin=0, fmax=8000,
    ).astype(np.float32)  # (40, 201)
    writer.add_tensor("voice_encoder/mel_fb", np.ascontiguousarray(ve_mel_fb))
    # Report the real count instead of a hard-coded number.
    print(f"Embedded VoiceEncoder: {n_weights} tensors, mel_fb {ve_mel_fb.shape}")
    return n_weights + 1


def main() -> None:
    """Convert the Chatterbox Turbo T3 checkpoint into a single GGUF file.

    Loads the checkpoint (downloading it when ``--ckpt-dir`` is omitted),
    writes model metadata, the GPT-2 BPE tokenizer, the mapped weight tensors
    (optionally quantized), the built-in voice conditioning and, when present,
    the VoiceEncoder weights.
    """
    args = parse_args()
    if args.ckpt_dir:
        ckpt_dir = args.ckpt_dir
    else:
        ckpt_dir = Path(snapshot_download(repo_id=REPO_ID, token=args.hf_token, allow_patterns=ALLOW_PATTERNS))
    args.out.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading checkpoint from {ckpt_dir}")
    state = load_file(ckpt_dir / "t3_turbo_v1.safetensors")
    conds = torch.load(ckpt_dir / "conds.pt", map_location="cpu", weights_only=True)

    writer = gguf.GGUFWriter(str(args.out), "chatterbox")
    writer.add_name("Chatterbox Turbo T3")
    writer.add_description("Chatterbox Turbo text-to-speech token generator for ggml.")
    writer.add_context_length(N_CTX)
    writer.add_embedding_length(N_EMBD)
    writer.add_block_count(N_LAYER)
    writer.add_head_count(N_HEAD)
    writer.add_vocab_size(TEXT_VOCAB_SIZE)
    writer.add_uint32("chatterbox.n_ctx", N_CTX)
    writer.add_uint32("chatterbox.n_embd", N_EMBD)
    writer.add_uint32("chatterbox.n_head", N_HEAD)
    writer.add_uint32("chatterbox.n_layer", N_LAYER)
    writer.add_uint32("chatterbox.text_vocab_size", TEXT_VOCAB_SIZE)
    writer.add_uint32("chatterbox.speech_vocab_size", SPEECH_VOCAB_SIZE)
    writer.add_uint32("chatterbox.start_speech_token", START_SPEECH_TOKEN)
    writer.add_uint32("chatterbox.stop_speech_token", STOP_SPEECH_TOKEN)
    writer.add_uint32("chatterbox.speaker_embed_size", SPEAKER_EMBED_SIZE)
    writer.add_float32("chatterbox.layer_norm_eps", LAYER_NORM_EPS)
    writer.add_string("chatterbox.variant", "t3_turbo")
    writer.add_string("chatterbox.reference_repo", REPO_ID)

    # Embed the GPT-2 BPE tokenizer so the C++ binary has no runtime dependency
    # on vocab.json / merges.txt / added_tokens.json on disk.
    tok_tokens, tok_types, tok_merges = load_tokenizer_assets(ckpt_dir)
    writer.add_tokenizer_model("gpt2")
    writer.add_token_list(tok_tokens)
    writer.add_token_types(tok_types)
    writer.add_token_merges(tok_merges)
    print(f"Embedded tokenizer: {len(tok_tokens)} tokens, "
          f"{sum(1 for t in tok_types if t == int(gguf.TokenType.USER_DEFINED))} added, "
          f"{len(tok_merges)} merges")

    writer.add_string("chatterbox.quantization", args.quant)

    exported = 0
    quantized = 0
    ignored = []
    for name, tensor in state.items():
        mapped = map_tensor_name(name)
        if mapped is None:
            ignored.append(name)
            continue
        gguf_name, dtype, transpose = mapped
        array = as_numpy(tensor, dtype=dtype, transpose=transpose)
        written_type = add_maybe_quantized(writer, gguf_name, array, args.quant)
        exported += 1
        # add_maybe_quantized returns the numpy dtype name for pass-through
        # tensors and the quant type name otherwise.
        if written_type not in ("float32", "float16"):
            quantized += 1
        print(f"{gguf_name:32s} {str(tuple(array.shape)):18s} {written_type}")

    builtin_speaker = conds["t3"]["speaker_emb"].reshape(1, SPEAKER_EMBED_SIZE)
    builtin_tokens = conds["t3"]["cond_prompt_speech_tokens"].reshape(-1).to(torch.int32)

    writer.add_uint32("chatterbox.cond_prompt_length", int(builtin_tokens.numel()))
    writer.add_tensor("chatterbox/builtin/speaker_emb", as_numpy(builtin_speaker, dtype=torch.float32))
    writer.add_tensor("chatterbox/builtin/cond_prompt_speech_tokens", as_numpy(builtin_tokens))

    # VoiceEncoder weights (3-layer unidirectional LSTM + Linear projection).
    # Used by main.cpp to compute speaker_emb natively when --reference-audio
    # is given, so no Python helper is needed at inference time. LSTM layout
    # is PyTorch's default: each weight_i{h,h}_l* is (4*hidden, ...) with the
    # [i, f, g, o] gate rows stacked.
    ve_tensors = _write_voice_encoder(writer, ckpt_dir)

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    # Mapped weights + the 2 builtin conditioning tensors + VoiceEncoder tensors.
    print(f"\nWrote {exported + 2 + ve_tensors} tensors to {args.out}")
    print(f"  --quant {args.quant}: {quantized}/{exported} weight tensors quantized "
          f"({'f16/f32' if args.quant == 'f16' else args.quant.upper()} for quantized; "
          f"embeddings + biases + layer-norms unchanged)")
    if ignored:
        print("\nIgnored tensors:")
        for n in ignored:
            print(f"  {n}")


if __name__ == "__main__":
    main()
+ +const fs = require('fs') +const path = require('path') + +const repoRoot = path.resolve(__dirname, '..') +const integrationDir = path.join(repoRoot, 'test', 'integration') +const mobileDir = path.join(repoRoot, 'test', 'mobile') +const outputFile = path.join(mobileDir, 'integration.auto.cjs') + +function getIntegrationFiles () { + if (!fs.existsSync(integrationDir)) { + throw new Error(`Integration directory not found: ${integrationDir}`) + } + + return fs.readdirSync(integrationDir) + .filter(entry => entry.endsWith('.test.js')) + .sort() +} + +function toFunctionName (fileName) { + const base = fileName.replace(/\.js$/, '') + const parts = base.split(/[^a-zA-Z0-9]+/).filter(Boolean) + const suffix = parts.map(part => part.charAt(0).toUpperCase() + part.slice(1)).join('') + return `run${suffix}` +} + +function buildFileContents (files) { + const lines = [] + const functionNames = files.map(toFunctionName) + lines.push("'use strict'") + lines.push("require('./integration-runtime.cjs')") + lines.push('') + lines.push('// AUTO-GENERATED FILE. Run `npm run test:mobile:generate` to update.') + lines.push('// Each function mirrors a single file under test/integration/.') + lines.push('') + lines.push('/* global runIntegrationModule */') + lines.push('') + + for (let i = 0; i < files.length; i++) { + const file = files[i] + const fnName = functionNames[i] + const relativePath = `../integration/${file}` + lines.push(`async function ${fnName} (options = {}) { // eslint-disable-line no-unused-vars`) + lines.push(` return runIntegrationModule('${relativePath}', options)`) + lines.push('}') + if (i < files.length - 1) { + lines.push('') + } + } + + lines.push('') + lines.push('module.exports = {') + for (let i = 0; i < functionNames.length; i++) { + const suffix = i < functionNames.length - 1 ? 
',' : '' + lines.push(` ${functionNames[i]}${suffix}`) + } + lines.push('}') + + return `${lines.join('\n')}\n` +} + +function main () { + const files = getIntegrationFiles() + if (files.length === 0) { + throw new Error(`No integration test files found inside ${integrationDir}`) + } + + const content = buildFileContents(files) + fs.writeFileSync(outputFile, content, 'utf8') + console.log(`Generated ${outputFile} with ${files.length} integration runners.`) +} + +if (require.main === module) { + main() +} diff --git a/packages/tts-ggml/scripts/requantize-gguf.py b/packages/tts-ggml/scripts/requantize-gguf.py new file mode 100644 index 0000000000..379f36a0c9 --- /dev/null +++ b/packages/tts-ggml/scripts/requantize-gguf.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 +"""Requantize a chatterbox GGUF (T3 or S3Gen) to a smaller dtype. + +`llama-quantize` refuses to touch either GGUF because neither +`chatterbox` nor `chatterbox-s3gen` is a llama.cpp-known arch. This +tool walks the GGUF tensor-by-tensor and rewrites it with the big 2-D +weight matrices stored as `Q8_0` / `Q5_0` / `Q4_0`, leaving the +numerically-sensitive tensors (embedding tables accessed via get_rows, +biases, norm scales, filterbank / STFT bases, positional embeddings, +builtin voice conditioning) at their source dtype. + +Works for both models because the deny-list covers the union of +patterns that either side uses for "keep-as-F32/F16". + +Usage: + + # T3 Q8_0 + python scripts/requantize-gguf.py \\ + models/chatterbox-t3-turbo.gguf \\ + models/t3-q8_0.gguf q8_0 + + # S3Gen Q8_0 + python scripts/requantize-gguf.py \\ + models/chatterbox-s3gen.gguf \\ + models/chatterbox-s3gen-q8_0.gguf q8_0 + + # Q4_0 is the same, last arg is just `q4_0`. + + # F16 downcast for HiFT conv kernels (multilingual S3Gen — see §3.24). + # `--name-filter hift/` constrains the rewrite to a name substring; + # everything else is passed through at its source dtype. Two-pass + # use: + # 1. 
F32→F16 for HiFT conv kernels in the F16 source GGUF + # 2. F16→Q4_0 for the CFM transformer linears (no name filter) + python scripts/requantize-gguf.py \\ + models/chatterbox-s3gen-mtl-f16.gguf \\ + /tmp/intermediate.gguf f16 --name-filter hift/ + python scripts/requantize-gguf.py \\ + /tmp/intermediate.gguf \\ + models/chatterbox-s3gen-mtl-q4_0_hift_f16.gguf q4_0 + +Quality trade-off (measured on a representative paragraph, Metal / M3 Ultra): + F32 (default) — baseline + Q8_0 — essentially bit-exact, cos-sim > 0.99 vs baseline + Q4_0 — different CFM ODE trajectory → different sample; + subjective quality equal, cos-sim falls to ~0.66 + F16 (--name-filter hift/) — HiFT conv kernels at half precision; PCM + cosine 0.9999 vs the corresponding all-F32-HiFT + baseline (audio essentially indistinguishable). + `[hift_decode]` ~3 % faster on M3 Ultra Metal + (124.9 → 121.3 ms median across 3 invocations); + GGUF ~33 MB smaller. See PROGRESS.md §3.24. +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import numpy as np +import gguf + + +# Names we NEVER touch: they're read as raw F32 by the C++ loader, or +# they're accessed via ggml_get_rows (embedding tables), or they're +# numerically sensitive (filterbanks, STFT bases, voice conditioning, +# position embeddings, norm/bias params). Works for both T3 (GPT-2- +# style names) and S3Gen (custom per-module names). 
_DENY_SUBSTRINGS = (
    # Raw-F32 access in the C++ loader
    "flow/input_embedding",   # S3Gen speech embedding table (read as F32 for CPU-side lookup)
    "/builtin/",              # voice conditioning tensors, loaded directly
    # Embedding tables (accessed via ggml_get_rows — safer as F16/F32)
    "text_emb",               # T3 text token embedding
    "speech_emb",             # T3 speech token embedding
    "wte",                    # GPT-2 word token embedding
    "wpe",                    # GPT-2 learned position embedding
    # Spectral bases / positional encodings (bit-exact numerics)
    "stft_basis",             # STFT analysis / synthesis
    "mel_filterbank",         # mel filterbank
    "mel_fb",                 # T3 VoiceEncoder and S3Gen mel filterbank tensors
    "pos_emb",                # positional embeddings — small, keep F32
    "pe/pe",                  # conformer pos enc
    # MTL T3 perceiver: learned query embedding (CLS-like). It is used as an
    # *activation* (the right-hand side of mul_mat after a reshape), not as a
    # weight, so quantising it breaks ggml_reshape_2d / ggml_norm /
    # ggml_mul_mat-as-src1 in build_perceiver. This was a latent bug: its 3-D
    # shape (1024, 32, 1) clears the K % 32 == 0 gate, so it was always
    # (wrongly) eligible; it only surfaced once the shipped q4_0 GGUF stopped
    # being produced by an earlier code path that kept it at source dtype.
    "pre_attention_query",
    # Biases / norms / scale params — always 1-D or near-1-D
    # NOTE(review): "/b" is matched as a substring, so it also hits every path
    # segment that merely starts with "b" (it already subsumes "/bias", "/bn/"
    # and "/builtin/"). If any large weight lives under such a segment it is
    # silently kept unquantized — the same failure mode as the old "/g"
    # substring described at _DENY_SUFFIXES. Confirm against the real tensor
    # names before moving it to _DENY_SUFFIXES.
    "/b",                     # legacy biases (gpt-2 /b, s3gen /b)
    "/bias",                  # pytorch-style bias
    "/bn/",                   # batchnorm params
    "/norm/",                 # layernorms
    "/ln_",                   # GPT-2 style layernorms (ln_1, ln_2, ln_f)
    # Legacy scale weights. Narrowed from the old "/s" glob so the HiFT
    # source_* conv weights are no longer excluded by accident. The
    # `kernel_mul_mv_f32_f16` / `_4` / `_short` Metal kernel variants those
    # conv1d ops need ship in patches/ggml-metal-chatterbox-ops.patch as of
    # PROGRESS §3.26, so the broader deny is no longer needed for correctness.
    # With the kernel in place the 21 source_* conv-kernel weights go through
    # the `--name-filter hift/` recipe at f16 and the GGUF shrinks by ~7.7 MB
    # with WAV parity (cos 1.000000, rms-diff 0.035 %, max abs 4/32767).
    # See §3.26.
    "/scale",
    "alpha",                  # Snake activation alphas
    "beta",
    "gamma",
    # Voice-cloning preprocessing encoders — NEVER quantize. These are
    # small specialised models whose dynamic range is too tight for Q4/Q8
    # block quantization; the resulting encoder output drifts so badly that
    # the voice-cloning tensors become unusable (we've seen speaker_emb
    # collapse to zeros, prompt_token to a single constant value, and
    # CAMPPlus embedding go antipodal to its F32 counterpart). Keeping
    # them at source dtype costs ~40 MB across both GGUFs but is the
    # difference between a working clone and garbage audio.
    # NOTE(review): this comment used to say "bi-LSTM", while the turbo
    # converter describes the VoiceEncoder as a 3-layer unidirectional LSTM —
    # confirm which is right.
    "voice_encoder/",         # T3 VoiceEncoder (3-layer LSTM + projection)
    "campplus/",              # S3Gen CAMPPlus (TDNN x-vector extractor)
    "s3tokv2/",               # S3Gen S3TokenizerV2 (conformer + FSQ quantizer)
)


# Suffix-anchored denies. Use this for one-letter param names that would
# otherwise hit too many incidental substring matches. The classic case
# is the GPT-2 / Llama RMSNorm scale tensor `.../ln_attn/g`, `.../norm/g`:
# matched as a substring, "/g" also wrongly catches `.../mlp/gate/w` (30
# tensors × ~4 MB each ≈ 120 MB on the multilingual T3 Q4_0 GGUF) and is
# the reason §3.23 observed `mlp_gate` shipping as F16 while `mlp_up`
# shipped as Q4_0 — a converter bug, not by design.
_DENY_SUFFIXES = (
    "/g",                     # GPT-2 / Llama RMSNorm / LayerNorm scale at end of path
)


# Tensor element dtypes we're willing to quantize from. F16 is T3's
# default for its big projection weights; F32 is S3Gen's default.
_QUANTIZABLE_SRC_DTYPES = {
    gguf.GGMLQuantizationType.F32,
    gguf.GGMLQuantizationType.F16,
}


_QUANT_TYPE = {
    "q8_0": gguf.GGMLQuantizationType.Q8_0,
    "q5_0": gguf.GGMLQuantizationType.Q5_0,
    "q4_0": gguf.GGMLQuantizationType.Q4_0,
    # F16 is a downcast, not a block quant — block_size = 1 in
    # GGML_QUANT_SIZES, so the shape gates in should_quantize accept any
    # 2-D / 3-D weight tensor. Useful for the 3-D HiFT conv kernels
    # (K in {3, 7, 11, 16}) that none of the 32-block quants can take.
    "f16": gguf.GGMLQuantizationType.F16,
}


def should_quantize(name: str, shape: tuple[int, ...], qtype: gguf.GGMLQuantizationType) -> bool:
    """Decide whether a tensor may be rewritten as ``qtype``.

    Args:
        name: GGUF tensor name.
        shape: Element shape in numpy order (reversed relative to ggml ``ne``).
        qtype: Target type; its block size comes from ``gguf.GGML_QUANT_SIZES``.

    Returns:
        True only for 2-D / 3-D tensors with at least 1024 elements that match
        no deny pattern and whose last numpy axis is a multiple of the block
        size.
    """
    # Keep tiny tensors at full precision.
    n_elements = 1
    for d in shape:
        n_elements *= d
    if n_elements < 1024:
        return False

    # Deny-list (case-sensitive, path-like names). Suffix entries cover
    # one-letter param names that would over-match as substrings.
    if any(s in name for s in _DENY_SUBSTRINGS):
        return False
    if any(name.endswith(s) for s in _DENY_SUFFIXES):
        return False

    block = gguf.GGML_QUANT_SIZES[qtype][0]

    # 2-D matmul weights: ggml shape (ne0, ne1) = (reduction_dim, output), so
    # in numpy order the reduction dim is shape[-1].
    #
    # 3-D conv kernels: ggml shape (K, IC, OC) -> numpy (OC, IC, K), so the
    # last numpy axis is K. HiFT conv kernels have K in {3, 7, 11, 16}; none
    # is a multiple of the 32-element block. Flattening to (OC, K*IC) would
    # satisfy the block constraint but store a 2-D tensor, and the C++
    # `conv1d_f32` calls `ggml_im2col(kernel, ...)`, which derives the kernel
    # size from `kernel->ne[0]` — a flattened ne[0] would silently break the
    # im2col window extraction. So 3-D tensors are only quantised when K
    # alone meets the constraint (a no-op for the current HiFT stack, but
    # future K-aligned conv kernels get the win for free).
    #
    # In both cases `gguf.quants.quantize` works along the last numpy axis,
    # so that axis must be a whole number of blocks.
    if len(shape) in (2, 3):
        return shape[-1] % block == 0

    return False


def _copy_metadata(src: "gguf.GGUFReader", writer: "gguf.GGUFWriter") -> None:
    """Copy every KV field from ``src`` to ``writer``, preserving value types.

    Scalars are re-emitted with the writer method matching their stored GGUF
    type, so 8/16/64-bit integers and float64 values keep their width instead
    of being narrowed to uint32 / int32 / float32.
    """
    # Keys the writer sets itself; copying them would create duplicates.
    skip_keys = {
        "GGUF.version",
        "GGUF.tensor_count",
        "GGUF.kv_count",
        "general.architecture",
    }
    vt = gguf.GGUFValueType
    scalar_writers = {
        vt.UINT8: (writer.add_uint8, int),
        vt.UINT16: (writer.add_uint16, int),
        vt.UINT32: (writer.add_uint32, int),
        vt.UINT64: (writer.add_uint64, int),
        vt.INT8: (writer.add_int8, int),
        vt.INT16: (writer.add_int16, int),
        vt.INT32: (writer.add_int32, int),
        vt.INT64: (writer.add_int64, int),
        vt.FLOAT32: (writer.add_float32, float),
        vt.FLOAT64: (writer.add_float64, float),
        vt.BOOL: (writer.add_bool, bool),
    }

    for key, field in src.fields.items():
        if key in skip_keys or not field.types:
            continue
        val_type = field.types[0]
        parts = [field.parts[i] for i in field.data]

        if val_type == vt.ARRAY:
            if not parts:
                # np.concatenate([]) would raise; report instead of crashing.
                print(f"warning: skipping empty array metadata key {key!r}")
                continue
            sub_type = field.types[1] if len(field.types) > 1 else None
            if sub_type == vt.STRING:
                writer.add_array(key, [bytes(p).decode("utf-8") for p in parts])
            else:
                writer.add_array(key, np.concatenate([np.asarray(p) for p in parts]).tolist())
        elif val_type == vt.STRING:
            writer.add_string(key, bytes(parts[0]).decode("utf-8"))
        elif val_type in scalar_writers:
            add, cast = scalar_writers[val_type]
            add(key, cast(parts[0][0]))


def main() -> int:
    """Rewrite a chatterbox GGUF with its big weight tensors in a smaller dtype.

    Returns:
        Process exit code (always 0; failures surface as exceptions).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("src", type=Path, help="Source GGUF (F32/F16)")
    ap.add_argument("dst", type=Path, help="Output GGUF")
    ap.add_argument("dtype", choices=_QUANT_TYPE.keys(), help="Target quant dtype")
    ap.add_argument(
        "--name-filter",
        default=None,
        help=("Substring filter on tensor names; only tensors whose name "
              "contains this substring are touched. All other tensors "
              "are passed through at their source dtype. Useful for "
              "applying f16 to HiFT conv kernels in a Q4_0 source GGUF "
              "without disturbing the existing Q4_0 CFM weights."),
    )
    args = ap.parse_args()

    qtype = _QUANT_TYPE[args.dtype]
    name_filter = args.name_filter

    src = gguf.GGUFReader(args.src, "r")
    arch = src.fields.get("general.architecture")
    arch_name = ""
    if arch is not None:
        arch_name = bytes(arch.parts[arch.data[0]]).decode("utf-8")

    writer = gguf.GGUFWriter(args.dst, arch_name or "chatterbox-s3gen")

    # Copy all metadata (KV fields) verbatim.
    _copy_metadata(src, writer)

    quantized_count = 0
    kept_count = 0
    src_bytes = 0
    dst_bytes = 0

    for t in src.tensors:
        # GGUFReader returns shape in numpy-style reversed order.
        shape = tuple(int(d) for d in reversed(t.shape) if d > 0)
        if not shape:
            shape = (int(t.shape[0]),)

        data = np.asarray(t.data)
        src_bytes += data.nbytes

        in_filter = name_filter is None or name_filter in t.name
        if (in_filter and t.tensor_type in _QUANTIZABLE_SRC_DTYPES
                and t.tensor_type != qtype
                and should_quantize(t.name, shape, qtype)):
            # Reshape to the element shape, which is what `quantize()` expects.
            arr = data.astype(np.float32).reshape(shape)
            qdata = gguf.quants.quantize(arr, qtype)
            writer.add_tensor(t.name, qdata, raw_shape=qdata.shape, raw_dtype=qtype)
            quantized_count += 1
            dst_bytes += qdata.nbytes
            continue

        # Pass through unchanged. Preserve original dtype.
        block_size, type_size = gguf.GGML_QUANT_SIZES[t.tensor_type]
        if block_size == 1:
            # Float-type source: one element per "block", plain reshape works.
            arr = data.reshape(shape)
            writer.add_tensor(t.name, arr, raw_shape=arr.shape, raw_dtype=t.tensor_type)
        else:
            # Q-type passthrough: the data is opaque packed bytes (Q4_0 is
            # 18 B per 32 elements), so an element-shape reshape would fail.
            # gguf-0.18+ `add_tensor_info` treats `raw_shape` as a BYTE shape
            # for uint8 tensors, hence:
            #   byte_inner = elements_inner / block * type_size.
            # Without this, re-quantising a GGUF that already holds Q-type
            # tensors raised "Quantized tensor bytes per row (N) is not a
            # multiple of Q4_0 type size (18)" — see §3.26.
            byte_inner = shape[-1] // block_size * type_size
            byte_shape = tuple(list(shape[:-1]) + [byte_inner])
            writer.add_tensor(t.name, data, raw_shape=byte_shape, raw_dtype=t.tensor_type)
        kept_count += 1
        dst_bytes += data.nbytes

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    # A GGUF with no tensors has src_bytes == 0; avoid ZeroDivisionError.
    ratio = dst_bytes / src_bytes * 100 if src_bytes else 0.0
    print(f"arch: {arch_name!r}")
    print(f"quantized: {quantized_count} tensors to {args.dtype.upper()}")
    print(f"kept: {kept_count} tensors as source dtype")
    print(f"size: {src_bytes / 1e6:.1f} MB → {dst_bytes / 1e6:.1f} MB "
          f"({ratio:.1f}%)")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Pin here if a specific upstream +# release breaks one of the converters and we need to lock it. +# +# Exception: on darwin-x64 (Intel Macs) the latest available torch +# wheel is torch==2.2.2 -- PyTorch stopped shipping macOS x86_64 +# wheels after 2.2.x. That wheel is built against the NumPy 1.x C +# ABI and crashes at `import torch` when paired with NumPy 2.x +# ("Failed to initialize NumPy: _ARRAY_API not found" -> "Numpy is +# not available"). We pin numpy<2 only on that platform; every +# other platform stays free to resolve the latest numpy. +gguf +numpy<2; sys_platform == "darwin" and platform_machine == "x86_64" +numpy; sys_platform != "darwin" or platform_machine != "x86_64" +torch +safetensors +huggingface_hub +onnx +librosa +numba<0.60; sys_platform == "darwin" and platform_machine == "x86_64" +llvmlite<0.43; sys_platform == "darwin" and platform_machine == "x86_64" diff --git a/packages/tts-ggml/scripts/setup-venv.sh b/packages/tts-ggml/scripts/setup-venv.sh new file mode 100644 index 0000000000..6dbe858b53 --- /dev/null +++ b/packages/tts-ggml/scripts/setup-venv.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# +# Create a local Python venv at ./venv with the packages the +# Chatterbox + Supertonic source -> GGUF converters need (see +# scripts/requirements.txt). Idempotent: safe to re-run. +# +# Usage: +# ./scripts/setup-venv.sh [flags] +# +# Flags: +# --python Base interpreter to seed the venv (default: $PYTHON +# or python3). Must be CPython 3.10+. +# --venv Venv location (default: ./venv) +# --force Recreate the venv even if ./venv already exists +# --help, -h Show this help +# +# The converters inside the venv are invoked via scripts/convert-models.sh, +# which auto-discovers ./venv/{bin,Scripts}/python before falling back to +# the system python3. 
+ +set -euo pipefail + +PYTHON_BIN="${PYTHON:-}" +VENV_DIR="./venv" +FORCE=0 + +print_usage() { + sed -n '/^# Usage:/,/^set -euo/p' "$0" | sed -e '/^set -euo/d' -e 's/^# *//' >&2 +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --python) PYTHON_BIN="$2"; shift 2;; + --venv) VENV_DIR="$2"; shift 2;; + --force) FORCE=1; shift;; + --help|-h) print_usage; exit 0;; + *) echo "Unknown flag: $1" >&2; print_usage; exit 2;; + esac +done + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REQS="$SCRIPT_DIR/requirements.txt" + +if [[ ! -f "$REQS" ]]; then + echo "Error: requirements.txt not found at $REQS" >&2 + exit 1 +fi + +# Pick a sane default interpreter when --python / PYTHON wasn't set. +# On Windows under Git Bash, $PATH's `python3` is often the MSYS2 / UCRT +# Python (platform tag mingw_x86_64_ucrt_gnu) which has zero wheels on +# PyPI for numpy / torch / friends, so pip falls back to a source build +# and needs Ninja + a C++ toolchain just to install numpy. The Windows +# Python launcher (`py -3`) finds the real CPython (platform tag +# win-amd64) which has wheels for everything. Resolve `py -3` to its +# absolute path so the rest of this script can treat PYTHON_BIN as a +# single argv[0]. +if [[ -z "$PYTHON_BIN" ]]; then + if command -v py >/dev/null 2>&1; then + if resolved=$(py -3 -c 'import sys; print(sys.executable)' 2>/dev/null) && [[ -n "$resolved" ]]; then + PYTHON_BIN="$resolved" + fi + fi +fi +if [[ -z "$PYTHON_BIN" ]]; then + if command -v python3 >/dev/null 2>&1; then + PYTHON_BIN="python3" + elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="python" + else + PYTHON_BIN="python3" + fi +fi + +if ! command -v "$PYTHON_BIN" >/dev/null 2>&1 && [[ ! -x "$PYTHON_BIN" ]]; then + echo "Error: python interpreter not found: $PYTHON_BIN" >&2 + echo " pass --python /path/to/python (CPython 3.10+ required)." 
>&2 + exit 1 +fi + +base_platform=$("$PYTHON_BIN" -c 'import sysconfig; print(sysconfig.get_platform())' 2>/dev/null || echo "unknown") +case "$base_platform" in + mingw*|cygwin*|msys*) + echo "Error: $PYTHON_BIN reports platform '$base_platform'." >&2 + echo " PyPI does not ship binary wheels for that platform tag, so" >&2 + echo " pip would fall back to building numpy / torch from source." >&2 + echo " Use a native Windows CPython instead, e.g.:" >&2 + echo " npm run setup-models -- --python \"\$(py -3 -c 'import sys; print(sys.executable)')\"" >&2 + echo " or install Python from https://www.python.org/downloads/windows/" >&2 + echo " and re-run with PYTHON=/c/Python313/python.exe (or similar)." >&2 + exit 1 + ;; +esac + +# Cross-platform: Unix venvs put the interpreter at venv/bin/python, +# Windows ones at venv/Scripts/python.exe. Probe both. +venv_python() { + local v="$1" + if [[ -x "$v/bin/python" ]]; then + echo "$v/bin/python" + elif [[ -x "$v/Scripts/python.exe" ]]; then + echo "$v/Scripts/python.exe" + else + echo "" + fi +} + +if [[ "$FORCE" -eq 1 ]] && [[ -d "$VENV_DIR" ]]; then + echo "Removing existing venv at $VENV_DIR (--force)" + rm -rf "$VENV_DIR" +fi + +if [[ -z "$(venv_python "$VENV_DIR")" ]]; then + echo "Creating venv at $VENV_DIR using $PYTHON_BIN" + "$PYTHON_BIN" -m venv "$VENV_DIR" +fi + +VENV_PY="$(venv_python "$VENV_DIR")" +if [[ -z "$VENV_PY" ]]; then + echo "Error: venv was created but no interpreter found under $VENV_DIR/bin or $VENV_DIR/Scripts" >&2 + exit 1 +fi + +echo "Using venv interpreter: $VENV_PY" +echo "Upgrading pip" +"$VENV_PY" -m pip install --upgrade pip >/dev/null + +echo "Installing $REQS" +"$VENV_PY" -m pip install -r "$REQS" + +echo +echo "Venv ready. 
Next: ./scripts/convert-models.sh" +echo "(or: npm run setup-models)" diff --git a/packages/tts-ggml/scripts/validate-mobile-tests.js b/packages/tts-ggml/scripts/validate-mobile-tests.js new file mode 100644 index 0000000000..7445b59a17 --- /dev/null +++ b/packages/tts-ggml/scripts/validate-mobile-tests.js @@ -0,0 +1,85 @@ +#!/usr/bin/env node +'use strict' + +// Run with `node`, not `bare`: this script is a build-time helper that uses +// node's built-in `fs` / `path` (same convention as the sibling +// generate-mobile-integration-tests.js). Everything inside the addon itself +// runs under `bare` and uses bare-fs / bare-path instead. + +const fs = require('fs') +const path = require('path') + +const repoRoot = path.resolve(__dirname, '..') +const integrationDir = path.join(repoRoot, 'test', 'integration') +const mobileAutoFile = path.join(repoRoot, 'test', 'mobile', 'integration.auto.cjs') + +function getIntegrationTestFiles () { + if (!fs.existsSync(integrationDir)) { + throw new Error(`Integration directory not found: ${integrationDir}`) + } + + return fs.readdirSync(integrationDir) + .filter(f => f.endsWith('.test.js')) + .sort() +} + +function getGeneratedIntegrationRefs (content) { + const references = new Set() + const referencePattern = /runIntegrationModule\('\.\.\/integration\/([^']+)'(?:,\s*options)?\)/g + let match = referencePattern.exec(content) + + while (match !== null) { + references.add(match[1]) + match = referencePattern.exec(content) + } + + return references +} + +function setDiff (left, right) { + return [...left].filter(item => !right.has(item)).sort() +} + +function printMismatchDetails (label, items) { + console.error(` ${label}:`) + items.forEach(item => console.error(` - ${item}`)) +} + +try { + const integrationFiles = getIntegrationTestFiles() + if (!fs.existsSync(mobileAutoFile)) { + console.error('Mobile integration tests not generated!') + console.error(' Run: npm run test:mobile:generate') + process.exit(1) + } + + const 
expectedSet = new Set(integrationFiles) + const mobileAutoContent = fs.readFileSync(mobileAutoFile, 'utf8') + const generatedSet = getGeneratedIntegrationRefs(mobileAutoContent) + + const missingFromGenerated = setDiff(expectedSet, generatedSet) + const staleInGenerated = setDiff(generatedSet, expectedSet) + + if (missingFromGenerated.length > 0 || staleInGenerated.length > 0) { + console.error('Mobile integration tests are out of sync with test/integration') + if (missingFromGenerated.length > 0) { + printMismatchDetails('Missing from integration.auto.cjs', missingFromGenerated) + } + if (staleInGenerated.length > 0) { + printMismatchDetails('Stale references in integration.auto.cjs', staleInGenerated) + } + console.error(' Run: npm run test:mobile:generate') + process.exit(1) + } + + if (integrationFiles.length === 0) { + console.log('Mobile integration tests are up to date (no integration tests found)') + process.exit(0) + } + + console.log('Mobile integration tests are up to date') + process.exit(0) +} catch (error) { + console.error('Error validating mobile tests:', error.message) + process.exit(1) +} diff --git a/packages/tts-ggml/test/data/sentences-long.js b/packages/tts-ggml/test/data/sentences-long.js new file mode 100644 index 0000000000..519ac57241 --- /dev/null +++ b/packages/tts-ggml/test/data/sentences-long.js @@ -0,0 +1,251 @@ +'use strict' + +const en = `On a quiet evening in a small coastal town, a young man named Daniel stood by the edge of the pier, watching the waves roll in under a fading orange sky. The horizon stretched endlessly, where the sea met the sky in a delicate line that seemed almost unreal. It had been years since he last stood in this exact spot, yet everything felt strangely familiar, as if time had decided to slow down just for him. The salty breeze brushed against his face, carrying with it fragments of memory—childhood laughter, distant voices, and moments he thought he had long forgotten. 
+ +Daniel had not planned this return. In fact, for years he had actively avoided it, choosing instead the noise and distraction of the city. But something had changed recently. Perhaps it was exhaustion, or perhaps it was the quiet realization that no matter how far he traveled, he could never fully escape the past. There was always a piece of him tied to this place, like an invisible thread pulling him back, gently but persistently. + +As he walked along the pier, the wooden planks creaked beneath his feet, each step echoing softly into the stillness of the evening. He noticed the old lighthouse standing tall in the distance, its light rotating slowly, faithfully guiding ships just as it had done for decades. That lighthouse had once fascinated him as a child. He used to imagine it as a guardian of the sea, watching over everything and everyone who dared to venture too far. + +Eventually, Daniel made his way into the town. The streets were nearly empty, lit only by a few warm lamps that cast long shadows along the pavement. There was a calmness here that he had forgotten existed—a silence that didn't feel empty, but rather full of unspoken stories. Every corner seemed to hold a memory, every building a reminder of a life he once knew. + +When he reached the small café at the corner of the main street, he paused. The sign above the door was slightly faded, but still recognizable. He remembered coming here with his parents, sitting by the window, watching the rain fall while sipping hot chocolate. Without overthinking it, he pushed the door open and stepped inside. + +The warmth of the café embraced him immediately. The scent of coffee and baked goods filled the air, and soft music played in the background. Behind the counter stood an elderly woman with kind eyes and a gentle smile. She looked at Daniel for a moment, tilting her head slightly, as if trying to place him in her memory. + +"You've been away for quite some time," she said softly. + +Daniel hesitated. 
"Do I know you?" he asked. + +The woman smiled again. "Not exactly. But I've seen many people come and go. You have the look of someone returning." + +Her words lingered in the air, and Daniel felt something shift inside him. He sat down at a small table near the window, just as he used to years ago. The chair felt familiar, the view unchanged, and for the first time in a long while, he allowed himself to simply sit and exist without rushing. + +They began to talk. At first, it was small things—the weather, the town, the changes that had taken place over the years. But gradually, the conversation deepened. The woman spoke about the people who had left, those who had stayed, and the stories that tied them all together. Daniel found himself listening more than speaking, absorbing every word as if it carried a piece of something he had been missing. + +Time passed without him noticing. Outside, the sky darkened completely, and the stars began to appear, one by one, like quiet witnesses to the unfolding night. Inside the café, the light remained warm and steady, creating a small refuge from the world beyond its walls. + +At some point, Daniel realized that the weight he had been carrying—the constant pressure, the uncertainty, the quiet anxiety that followed him everywhere—had begun to fade. It wasn't gone entirely, but it felt lighter, more manageable. As if being here, in this place, was slowly restoring something within him. + +When he finally stepped outside again, the air felt cooler, but also clearer. He made his way back to the pier, drawn once more to the sound of the sea. The waves moved rhythmically, their motion steady and predictable, unlike the chaos he had grown accustomed to in his daily life. + +He leaned against the railing and looked out into the darkness. The lighthouse continued its silent work, its beam sweeping across the water with unwavering consistency. 
Daniel realized then that some things never changed—not because they were unable to, but because they didn't need to. + +He thought about the years he had spent chasing success, constantly moving forward without ever stopping to ask himself why. He had built a life that looked impressive from the outside, but somewhere along the way, he had lost the sense of meaning that once guided him. + +Standing there, listening to the waves, he began to understand something simple yet profound. The answers he had been searching for were not hidden in distant places or future achievements. They were here, in the quiet moments, in the spaces between decisions, in the memories he had tried so hard to leave behind. + +A sense of calm settled over him, deeper than anything he had felt in years. It wasn't excitement, nor was it relief. It was something steadier—a quiet acceptance of where he was, and a gentle curiosity about where he might go next. + +The night grew deeper, and the town remained still. Daniel stayed there for a long time, watching the horizon, thinking, remembering. And for the first time in a very long while, he wasn't trying to escape anything. + +As the tide slowly rose and the sound of the water grew louder, Daniel took a deep breath and closed his eyes. Tomorrow would bring choices, as it always did. There would be decisions to make, paths to consider, and uncertainties to face. + +But for now, none of that mattered. + +In this moment, under the vast open sky, with the sea stretching endlessly before him, Daniel felt something he had almost forgotten was possible. + +He felt at peace. + +And in that quiet realization, he understood that sometimes, the journey back home is not about returning to a place—but about rediscovering who you are when you finally allow yourself to stop running.` + +const es = `En una tarde tranquila de verano, el pueblo costero de San Martín se preparaba para la llegada de la noche. 
Las barcas de los pescadores ya descansaban amarradas en el muelle, mecidas suavemente por una brisa tibia que arrastraba el olor a sal y a madera vieja. Las gaviotas trazaban círculos perezosos sobre el agua, y el cielo comenzaba a teñirse de tonos naranjas y violetas que se reflejaban en la superficie del mar como un cuadro que nadie se detenía a contemplar. + +Marta caminaba despacio por el paseo marítimo, sin rumbo fijo, dejándose llevar por el ritmo pausado del lugar. Hacía años que no volvía a San Martín, y sin embargo cada esquina le resultaba familiar, como si el tiempo allí se hubiera detenido esperando su regreso. Pasó junto a la vieja fuente de piedra donde jugaba de niña, y un recuerdo inesperado le arrancó una sonrisa. Aquellas tardes largas en las que no existían las prisas ni las preocupaciones parecían pertenecer a otra vida. + +Se sentó en un banco frente al mar y observó cómo las últimas luces del día se apagaban lentamente. Un anciano que paseaba a su perro se detuvo a su lado y le ofreció un saludo breve pero cálido, como si la conociera de toda la vida. En los pueblos pequeños, pensó Marta, la soledad nunca es completa; siempre hay alguien que te recuerda, alguien que te ve. La brisa se hizo más fresca y ella se abrigó cruzando los brazos, pero no se levantó; quería quedarse un poco más en aquel silencio amable que tanto había echado de menos. + +Cuando las primeras estrellas aparecieron sobre el horizonte, Marta sintió que algo dentro de ella se aflojaba, como un nudo que llevaba años apretado. No era felicidad exactamente, sino algo más sutil: la certeza tranquila de que, a pesar de todo lo vivido, aquel rincón del mundo seguía siendo suyo. 
Se puso de pie, respiró hondo y emprendió el camino de regreso con pasos lentos, sabiendo que no tenía prisa, porque por primera vez en mucho tiempo el lugar al que volvía era exactamente donde quería estar.` + +const zh = `在一个宁静的夜晚,在一个小小的海边小镇上,一个名叫丹尼尔的年轻人站在码头的尽头,望着夕阳下渐渐消失在橙色天空中的海浪。地平线无限延伸,大海与天空在远处交汇成一条几乎不真实的细线。距离他上一次站在这里已经过去了许多年,但一切却显得异常熟悉,仿佛时间为了他而放慢了脚步。咸咸的海风拂过他的脸庞,带来了记忆的碎片——童年的笑声、遥远的呼喊,以及那些他以为早已遗忘的瞬间。 + +丹尼尔并没有计划回到这里。事实上,多年来他一直刻意回避这个地方,选择沉浸在城市的喧嚣与忙碌之中。但最近,某种东西发生了变化。也许是疲惫,也许是他逐渐意识到,无论走多远,他都无法真正逃离过去。总有一部分的他与这个地方紧密相连,就像一条看不见的线,温柔却坚定地把他拉回来。 + +当他沿着码头慢慢行走时,木板在脚下轻轻作响,每一步都在寂静的夜晚中回荡。他注意到远处那座古老的灯塔依然矗立,灯光缓缓旋转,像多年前一样,默默指引着远航的船只。小时候,他总是对那座灯塔充满幻想,把它想象成大海的守护者,守望着每一个走得太远的人。 + +最终,丹尼尔走进了小镇。街道几乎空无一人,只有几盏温暖的路灯照亮着地面,拉出长长的影子。这里的宁静是他早已遗忘的存在——一种不空洞的安静,而是充满了无声故事的安静。每一个角落似乎都承载着回忆,每一栋建筑都提醒着他曾经的生活。 + +当他走到主街拐角处那家小咖啡馆时,他停了下来。门上的招牌有些褪色,但依然清晰可辨。他记得小时候和父母一起坐在窗边,看着雨滴落下,一边喝着热巧克力。没有多想,他推开门走了进去。 + +咖啡馆里的温暖立刻包围了他。空气中弥漫着咖啡和新鲜烘焙面包的香气,背景里传来柔和的音乐。柜台后站着一位年长的女人,眼神温和,带着淡淡的微笑。她看了丹尼尔一会儿,微微歪着头,仿佛在记忆中寻找他的影子。 + +"你离开很久了。"她轻声说道。 + +丹尼尔有些迟疑。"我们认识吗?"他问。 + +女人再次微笑。"不完全认识。但我见过很多人来来去去。你看起来像一个回家的人。" + +她的话在空气中停留着,让丹尼尔的内心微微震动。他坐在窗边的一张小桌旁,就像多年前那样。椅子依旧熟悉,景色没有改变,而他也终于允许自己停下来,不再匆忙地生活。 + +他们开始交谈。起初只是一些简单的话题——天气、小镇、这些年的变化。但渐渐地,谈话变得更深。女人讲述着那些离开的人、留下的人,以及把他们联系在一起的故事。丹尼尔发现自己更多地在倾听,仿佛这些话语中藏着他一直缺失的某种东西。 + +时间悄然流逝。他几乎没有察觉。窗外的天空已经完全变暗,星星一颗一颗地出现,像安静的见证者,注视着夜晚的展开。咖啡馆里的灯光依然温暖而稳定,仿佛是与外界隔绝的一片小小避风港。 + +不知何时,丹尼尔意识到自己长期背负的压力——那种持续的紧张、不确定,以及始终挥之不去的焦虑——开始慢慢减轻。它并没有完全消失,但变得更轻、更容易承受。仿佛这个地方,正在一点一点修复他内心深处的某些东西。 + +当他再次走出咖啡馆时,空气变得更加清凉,也更加清晰。他又一次回到了码头,被海浪的声音吸引。海水有节奏地起伏,稳定而可预测,与他习惯的混乱生活形成了鲜明的对比。 + +他靠在栏杆上,望向远方的黑暗。灯塔依旧默默地运作着,光束不断扫过海面,从未动摇。那一刻,丹尼尔意识到,有些东西之所以不变,并不是因为它们无法改变,而是因为它们本就不需要改变。 + +他想起这些年来自己对成功的追逐,不断向前,却从未停下来问自己为什么。他建立了一种在外人看来光鲜的生活,但在某个不知不觉的时刻,他失去了曾经指引他的意义。 + +站在那里,听着海浪的声音,他逐渐明白了一件简单却深刻的事情。他一直寻找的答案,并不在遥远的地方,也不在未来的成就之中。它就在这里,在这些安静的瞬间,在每一个决定之间的空隙,在那些他曾努力遗忘的记忆之中。 + +一种前所未有的平静在他心中蔓延。这不是兴奋,也不是解脱,而是一种更稳定的感觉——对当下的接受,以及对未来的温柔好奇。 + +夜色渐深,小镇依旧安静。丹尼尔在那里站了很久,望着地平线,思考,回忆。而这一次,他不再试图逃避任何东西。 + 
+随着潮水慢慢上涨,海浪的声音愈发清晰,丹尼尔深吸一口气,闭上了眼睛。明天仍然会带来选择,就像以往一样。仍然会有决定要做,有道路要走,有未知要面对。 + +但此刻,这一切都不重要。 + +在这片辽阔的天空之下,在无尽延伸的大海面前,丹尼尔感受到了一种他几乎遗忘的东西。 + +他感到内心的平静。 + +而在这份宁静之中,他终于明白,有时候,回家的旅程并不是回到某个地方,而是重新找回那个不再逃避的自己。` + +const ja = `静かな夕暮れ、小さな海辺の町で、ダニエルという名の青年が桟橋の端に立ち、夕焼けの空の下に押し寄せる波を見つめていた。水平線は果てしなく広がり、海と空がほとんど現実とは思えないような繊細な線で交わっていた。彼がこの場所に最後に立ったのはもう何年も前のことだったが、すべてが不思議なほどなじみ深く感じられた。まるで時間が彼のためだけに歩みを緩めたかのように。潮の香りを含んだ風が彼の顔を撫で、記憶の断片を運んできた。子供の頃の笑い声、遠くの声、そしてとうに忘れたと思っていた瞬間の数々。 + +ダニエルはこの帰還を計画していたわけではなかった。実際、何年もの間、彼は意識的にこの場所を避けていた。代わりに都会の喧騒と気晴らしの中に身を置くことを選んだ。しかし最近、何かが変わった。それは疲れだったのかもしれないし、どれほど遠くに行っても過去から完全に逃れることはできないという静かな気づきだったのかもしれない。彼の一部は常にこの場所に結びついていた。見えない糸のように、優しくも執拗に彼を引き戻す。 + +桟橋を歩くと、足元の木の板がきしみ、一歩一歩が夕暮れの静けさの中に柔らかく響いた。遠くに古い灯台が高くそびえ立っているのが見えた。その光はゆっくりと回転し、何十年もそうしてきたように忠実に船を導いていた。子供の頃、あの灯台に彼は夢中になったものだ。海の守護者として想像し、遠くに行きすぎたすべての人を見守っていると思っていた。 + +やがてダニエルは町の中に入った。通りはほとんど人がおらず、いくつかの温かな街灯だけが歩道に長い影を落としていた。ここには彼が忘れていた静けさがあった。空虚ではない沈黙、むしろ語られなかった物語に満ちた沈黙だった。すべての角が思い出を抱えているようであり、すべての建物がかつて知っていた生活を思い起こさせた。 + +メインストリートの角にある小さなカフェにたどり着いたとき、彼は立ち止まった。ドアの上の看板は少し色あせていたが、まだ読み取れた。両親と一緒にここに来て、窓際に座り、雨が降るのを眺めながらホットチョコレートを飲んだことを覚えている。深く考えずに、彼はドアを押して中に入った。 + +カフェの温もりがすぐに彼を包んだ。コーヒーと焼きたてのパンの香りが空気に満ち、穏やかな音楽がバックグラウンドで流れていた。カウンターの向こうには、優しい目と穏やかな微笑みを持つ年配の女性が立っていた。彼女はしばらくダニエルを見つめ、少し首をかしげた。まるで記憶の中から彼を見つけようとしているかのように。 + +「ずいぶん長く離れていたのね」と彼女は静かに言った。 + +ダニエルはためらった。「僕を知っているんですか」と尋ねた。 + +女性は再び微笑んだ。「正確には知らないわ。でも多くの人が来ては去るのを見てきたの。あなたは帰ってきた人の顔をしている。」 + +彼女の言葉が空気の中に漂い、ダニエルは内側で何かが動くのを感じた。彼は何年も前と同じように窓際の小さなテーブルに座った。椅子はなじみ深く、景色は変わっておらず、久しぶりに、急ぐことなくただ座って存在することを自分に許した。 + +彼らは話し始めた。最初は些細なことだった。天気、町、年月の間に起きた変化について。しかし次第に会話は深くなった。女性は去った人々、残った人々、そしてすべてを結びつける物語について語った。ダニエルは話すよりも聞くことの方が多い自分に気づいた。まるで彼女のすべての言葉が、自分が失っていた何かの欠片を運んでいるかのように。 + +時間が過ぎていくのに気づかなかった。外では空が完全に暗くなり、星が一つずつ現れ始めた。広がる夜の静かな証人のように。カフェの中では、光は温かく安定したままで、壁の向こうの世界からの小さな避難所を作り出していた。 + +いつの間にか、ダニエルは自分が背負ってきた重荷、絶え間ないプレッシャー、不確実さ、どこにでもつきまとう静かな不安が薄れ始めていることに気づいた。完全に消えたわけではないが、より軽く、より扱いやすくなっていた。まるでこの場所にいることが、彼の内面の何かをゆっくりと回復させているかのように。 + 
+再び外に出たとき、空気はより涼しく感じられたが、同時により澄んでいた。彼はもう一度海の音に引かれて桟橋に戻った。波がリズミカルに動いていた。その動きは安定していて予測可能で、日常生活で慣れ親しんだ混沌とは違っていた。 + +彼は手すりにもたれかかり、暗闇の中を見つめた。灯台は静かな仕事を続け、その光線が揺るぎない一貫性をもって水面を横切っていた。ダニエルはそのとき気づいた。変わらないものがあるのは、変われないからではなく、変わる必要がないからだと。 + +成功を追い求めて過ごした年月を思い返した。常に前に進み、なぜそうしているのか自分に問いかけることなく。外から見れば印象的な人生を築いたが、いつの間にか、かつて自分を導いていた意味の感覚を失っていた。 + +そこに立ち、波の音を聞きながら、彼はシンプルだが深い何かを理解し始めた。探し求めていた答えは、遠い場所や将来の業績の中に隠されていたのではない。それはここにあった。静かな瞬間の中に、決断と決断の間の空間に、懸命に置き去りにしようとした記憶の中に。 + +穏やかさが彼を包んだ。何年も感じたことのないほど深いものだった。それは興奮でも安堵でもなかった。もっと安定した何か、自分がいる場所への静かな受容と、次にどこに行くかへの穏やかな好奇心だった。 + +夜は深まり、町は静かなままだった。ダニエルは長い間そこに立ち、水平線を見つめ、考え、思い出していた。そして久しぶりに、彼は何からも逃げようとしていなかった。 + +潮がゆっくりと満ち、水の音が大きくなると、ダニエルは深呼吸をして目を閉じた。明日はいつものように選択をもたらすだろう。下すべき決断、考慮すべき道、直面すべき不確実さがあるだろう。 + +しかし今は、そのどれも重要ではなかった。 + +この瞬間、広大な空の下、果てしなく広がる海を前に、ダニエルはほとんど忘れかけていた何かを感じた。 + +彼は平和を感じた。 + +そしてその静かな気づきの中で、彼は理解した。時に、帰郷の旅は場所に戻ることではなく、ようやく逃げることをやめたとき、自分が誰であるかを再発見することなのだと。` + +const he = `בערב שקט בעיירה קטנה על חוף הים, צעיר בשם דניאל עמד בקצה המזח והביט בגלים המתגלגלים תחת שמיים כתומים דועכים. קו האופק השתרע ללא סוף, שם הים פגש את השמיים בקו עדין שנראה כמעט לא אמיתי. עברו שנים מאז שעמד במקום הזה בדיוק, ובכל זאת הכול הרגיש מוכר באופן מוזר, כאילו הזמן החליט להאט את קצבו רק בשבילו. הרוח המלוחה ליטפה את פניו, ונשאה עמה שברי זיכרונות, צחוק ילדות, קולות רחוקים ורגעים שחשב שכבר שכח מזמן. + +דניאל לא תכנן את החזרה הזו. למעשה, במשך שנים הוא נמנע מכך בכוונה, ובחר במקום זאת ברעש ובהסחות של העיר הגדולה. אבל משהו השתנה לאחרונה. אולי זו הייתה עייפות, ואולי זו הייתה ההבנה השקטה שלא משנה כמה רחוק הוא נוסע, הוא לעולם לא יוכל לברוח מהעבר לגמרי. תמיד היה חלק ממנו קשור למקום הזה, כמו חוט בלתי נראה שמושך אותו בחזרה, בעדינות אך בעקביות. + +כשהלך לאורך המזח, קרשי העץ חרקו מתחת לרגליו, וכל צעד הדהד בשקט אל תוך דממת הערב. הוא הבחין במגדלור הישן עומד גבוה במרחק, אורו מסתובב לאט ובנאמנות מנחה ספינות בדיוק כפי שעשה במשך עשורים. המגדלור הזה ריתק אותו כילד. הוא נהג לדמיין אותו כשומר הים, המשגיח על כל מי שהעז לצאת רחוק מדי. + +בסופו של דבר, דניאל הגיע לעיירה. 
הרחובות היו כמעט ריקים, מוארים רק בכמה פנסים חמים שהטילו צללים ארוכים על המדרכה. הייתה כאן שלווה שכבר שכח שקיימת, שקט שלא הרגיש ריק אלא דווקא מלא בסיפורים שלא נאמרו. כל פינה נראתה כאילו מחזיקה בזיכרון, כל בניין תזכורת לחיים שפעם הכיר. + +כשהגיע לבית הקפה הקטן בפינת הרחוב הראשי, הוא עצר. השלט מעל הדלת היה מעט דהוי, אך עדיין ניתן לזיהוי. הוא זכר שבא לכאן עם הוריו, ישב ליד החלון, הסתכל על הגשם יורד ולגם שוקו חם. בלי לחשוב יותר מדי, הוא דחף את הדלת ונכנס פנימה. + +החום של בית הקפה חיבק אותו מיד. ריח הקפה והמאפים מילא את האוויר, ומוזיקה שקטה נשמעה ברקע. מאחורי הדלפק עמדה אישה מבוגרת עם עיניים טובות וחיוך עדין. היא הביטה בדניאל לרגע, הטתה מעט את ראשה, כאילו ניסתה למצוא אותו בזיכרונותיה. + +"לא היית כאן הרבה זמן," היא אמרה בשקט. + +דניאל היסס. "את מכירה אותי?" הוא שאל. + +האישה חייכה שוב. "לא בדיוק. אבל ראיתי הרבה אנשים באים והולכים. יש לך את המראה של מישהו שחוזר." + +המילים שלה נשארו באוויר, ודניאל הרגיש שמשהו זז בתוכו. הוא ישב ליד שולחן קטן ליד החלון, בדיוק כמו שנהג לפני שנים. הכיסא הרגיש מוכר, הנוף לא השתנה, ולראשונה מזה זמן רב הוא הרשה לעצמו פשוט לשבת ולהיות בלי למהר. + +הם התחילו לדבר. בהתחלה על דברים קטנים, מזג האוויר, העיירה, השינויים שחלו לאורך השנים. אבל בהדרגה, השיחה העמיקה. האישה סיפרה על האנשים שעזבו, אלה שנשארו, והסיפורים שקושרים את כולם יחד. דניאל מצא את עצמו מקשיב יותר מדבר, סופג כל מילה כאילו היא נושאת חלק ממשהו שחסר לו. + +הזמן חלף בלי שהבחין. בחוץ השמיים החשיכו לגמרי, והכוכבים החלו להופיע אחד אחד, כמו עדים שקטים ללילה שנפרש. בתוך בית הקפה, האור נותר חם ויציב, ויצר מקלט קטן מהעולם שמעבר לקירותיו. + +בשלב מסוים, דניאל הבין שהמשקל שנשא, הלחץ המתמיד, חוסר הוודאות, החרדה השקטה שליוותה אותו לכל מקום, החלו להתמוסס. הם לא נעלמו לגמרי, אבל הרגישו קלים יותר, ניתנים יותר לניהול. כאילו להיות כאן, במקום הזה, משחזר לאט משהו בתוכו. + +כשיצא שוב החוצה, האוויר הרגיש קריר יותר, אך גם צלול יותר. הוא חזר שוב למזח, נמשך פעם נוספת לקול הים. הגלים נעו בקצב, תנועתם יציבה וצפויה, בניגוד לכאוס שהתרגל אליו בחיי היומיום. + +הוא נשען על המעקה והביט אל תוך החשיכה. 
המגדלור המשיך בעבודתו השקטה, קרן האור שלו סורקת את פני המים בעקביות בלתי מתפשרת. דניאל הבין אז שיש דברים שלא משתנים, לא כי הם לא מסוגלים, אלא כי הם לא צריכים. + +הוא חשב על השנים שבילה ברדיפה אחרי הצלחה, נע קדימה ללא הפסקה בלי לעצור אף פעם לשאול את עצמו למה. הוא בנה חיים שנראו מרשימים מבחוץ, אבל איפשהו בדרך איבד את תחושת המשמעות שפעם הנחתה אותו. + +עומד שם, מקשיב לגלים, הוא החל להבין משהו פשוט אך עמוק. התשובות שחיפש לא היו מוסתרות במקומות רחוקים או בהישגים עתידיים. הן היו כאן, ברגעים השקטים, במרחבים שבין ההחלטות, בזיכרונות שניסה כל כך קשה להשאיר מאחור. + +תחושת שלווה שקעה עליו, עמוקה מכל מה שהרגיש בשנים. זו לא הייתה התרגשות, ולא הקלה. זה היה משהו יציב יותר, קבלה שקטה של מקומו, וסקרנות עדינה לגבי לאן ילך הלאה. + +הלילה העמיק, והעיירה נותרה דוממת. דניאל נשאר שם זמן רב, מביט באופק, חושב, נזכר. ולראשונה מזה זמן רב, הוא לא ניסה לברוח משום דבר. + +כשהגאות עלתה לאט וקול המים התחזק, דניאל נשם עמוק ועצם את עיניו. מחר יביא בחירות, כמו תמיד. יהיו החלטות לקבל, דרכים לשקול, ואי ודאויות לעמוד בפניהן. + +אבל כרגע, שום דבר מזה לא חשוב. + +ברגע הזה, מתחת לשמיים הפתוחים, עם הים שנמתח ללא סוף לפניו, דניאל הרגיש משהו שכמעט שכח שאפשרי. + +הוא הרגיש שלווה. + +ובהבנה השקטה הזו, הוא הבין שלפעמים, המסע הביתה אינו חזרה למקום, אלא גילוי מחדש של מי שאתה כשאתה סוף סוף מפסיק לברוח.` + +const ko = `조용한 저녁, 작은 해안 마을에서 다니엘이라는 이름의 젊은 남자가 부두 끝에 서서 저물어가는 주황빛 하늘 아래로 밀려오는 파도를 바라보고 있었다. 수평선은 끝없이 펼쳐져 있었고, 바다와 하늘이 거의 비현실적으로 보이는 가느다란 선으로 만나는 곳이었다. 이곳에 마지막으로 서 있었던 것이 몇 년 전이었지만, 모든 것이 이상하리만치 익숙하게 느껴졌다. 마치 시간이 그를 위해 속도를 늦추기로 결정한 것처럼. 짠 바닷바람이 그의 얼굴을 스치며 기억의 조각들을 가져왔다. 어린 시절의 웃음소리, 먼 목소리, 그리고 오래전에 잊었다고 생각했던 순간들. + +다니엘은 이 귀환을 계획하지 않았다. 사실 수년간 그는 의도적으로 이곳을 피해왔으며, 대신 도시의 소음과 분주함 속에 자신을 묻었다. 하지만 최근에 무언가가 변했다. 아마도 피로였을 수도 있고, 아마도 아무리 멀리 떠나도 과거를 완전히 벗어날 수 없다는 조용한 깨달음이었을 수도 있다. 그의 일부는 항상 이 장소에 묶여 있었다. 보이지 않는 실처럼 부드럽지만 끈질기게 그를 다시 끌어당기는. + +부두를 따라 걸으며 나무 판자가 그의 발아래에서 삐걱거렸고, 각 걸음이 저녁의 고요함 속으로 부드럽게 울려 퍼졌다. 그는 멀리서 우뚝 솟은 오래된 등대를 보았다. 등대의 불빛이 천천히 회전하며 수십 년 동안 해왔던 것처럼 충실하게 배들을 안내하고 있었다. 그 등대는 어린 시절의 그를 매료시켰다. 
그는 그것을 바다의 수호자로 상상하곤 했다. 너무 멀리 나간 모든 이를 지켜보는. + +마침내 다니엘은 마을로 들어섰다. 거리는 거의 비어 있었고, 따뜻한 몇 개의 가로등만이 보도 위에 긴 그림자를 드리우고 있었다. 여기에는 그가 잊고 있었던 평온함이 있었다. 공허하지 않은 침묵, 오히려 말하지 않은 이야기들로 가득 찬 침묵이었다. 모든 모퉁이가 기억을 간직하고 있는 것 같았고, 모든 건물이 한때 알았던 삶을 떠올리게 했다. + +중심가 모퉁이의 작은 카페에 도착했을 때, 그는 멈춰 섰다. 문 위의 간판은 약간 바랬지만 여전히 알아볼 수 있었다. 그는 부모님과 함께 이곳에 왔던 것을 기억했다. 창가에 앉아 비가 내리는 것을 보며 핫초코를 마셨던. 별생각 없이 그는 문을 밀고 안으로 들어갔다. + +카페의 온기가 즉시 그를 감쌌다. 커피와 갓 구운 빵 향이 공기에 가득했고, 배경에서 부드러운 음악이 흘러나왔다. 카운터 뒤에는 따뜻한 눈과 부드러운 미소를 가진 나이 든 여성이 서 있었다. 그녀는 잠시 다니엘을 바라보았다. 고개를 살짝 기울이며 기억 속에서 그를 찾으려는 듯이. + +"오랫동안 떠나 있었군요." 그녀가 조용히 말했다. + +다니엘은 잠시 머뭇거렸다. "저를 아세요?" 그가 물었다. + +여자는 다시 미소 지었다. "정확히는 아닙니다. 하지만 많은 사람이 오고 가는 것을 봤어요. 당신은 돌아온 사람의 모습을 하고 있어요." + +그녀의 말이 공기 중에 맴돌았고, 다니엘은 마음속에서 무언가가 움직이는 것을 느꼈다. 그는 예전처럼 창가의 작은 테이블에 앉았다. 의자는 익숙했고, 풍경은 변하지 않았으며, 오랜만에 처음으로 서두르지 않고 그냥 앉아서 존재하는 것을 자신에게 허락했다. + +그들은 이야기를 나누기 시작했다. 처음에는 사소한 것들이었다. 날씨, 마을, 수년간의 변화. 하지만 점차 대화가 깊어졌다. 여자는 떠난 사람들, 남은 사람들, 그리고 그들 모두를 연결하는 이야기들을 들려주었다. 다니엘은 자신이 말하기보다 더 많이 듣고 있음을 깨달았다. 마치 그녀의 모든 말이 자신이 잃어버린 무언가의 조각을 담고 있는 것처럼. + +시간이 흘러가는 것을 그는 알아차리지 못했다. 밖에서 하늘은 완전히 어두워졌고, 별들이 하나둘 나타나기 시작했다. 펼쳐지는 밤의 조용한 증인들처럼. 카페 안의 불빛은 여전히 따뜻하고 안정적이었다. 벽 너머 세상으로부터의 작은 피난처를 만들어내며. + +어느 순간, 다니엘은 자신이 짊어지고 있던 무게가, 그 끊임없는 압박감, 불확실함, 어디를 가든 따라다니던 조용한 불안이 점점 사라지고 있음을 깨달았다. 완전히 사라진 것은 아니었지만, 더 가벼워지고 더 다루기 쉬워진 것이다. 마치 이곳에 있는 것이 그의 내면 어딘가를 천천히 회복시키고 있는 것처럼. + +다시 밖으로 나왔을 때, 공기는 더 시원했지만 더 맑게 느껴졌다. 그는 다시 한번 바다 소리에 이끌려 부두로 돌아갔다. 파도가 리듬감 있게 움직이고 있었고, 그 움직임은 그가 일상에서 익숙해진 혼란과 달리 꾸준하고 예측 가능했다. + +그는 난간에 기대어 어둠 속을 바라보았다. 등대는 여전히 묵묵히 일하고 있었고, 빛줄기가 흔들림 없이 수면을 가로질러 흘렀다. 다니엘은 그때 깨달았다. 어떤 것들이 변하지 않는 이유는 변할 수 없어서가 아니라 변할 필요가 없기 때문이라는 것을. + +그는 성공을 쫓으며 보낸 세월을 떠올렸다. 끊임없이 앞으로 나아가면서도 왜 그러는지 한 번도 자신에게 묻지 않았다. 밖에서 보기에 인상적인 삶을 만들었지만, 어느 사이엔가 자신을 이끌던 의미를 잃어버렸다. + +그곳에 서서 파도 소리를 들으며, 그는 단순하지만 깊은 무언가를 이해하기 시작했다. 그가 찾고 있던 답은 먼 곳이나 미래의 성취에 숨겨져 있지 않았다. 그것은 여기에 있었다. 조용한 순간들 속에, 결정들 사이의 공간에, 그가 그토록 뒤에 남기려 했던 기억들 속에. + +깊은 평온함이 그를 감쌌다. 그것은 흥분도 아니었고 안도감도 아니었다. 더 안정적인 무언가였다. 
자신이 있는 곳에 대한 조용한 수용, 그리고 앞으로 어디로 갈지에 대한 부드러운 호기심. + +밤이 깊어지고 마을은 여전히 고요했다. 다니엘은 오래도록 그곳에 서서 수평선을 바라보며 생각하고 기억했다. 그리고 아주 오랜만에 처음으로 그는 무언가로부터 도망치려 하지 않았다. + +조수가 천천히 밀려오고 물소리가 커지자, 다니엘은 깊은 숨을 들이쉬고 눈을 감았다. 내일은 언제나 그렇듯 선택을 가져올 것이다. 내려야 할 결정, 고려해야 할 길, 마주해야 할 불확실함이 있을 것이다. + +하지만 지금 이 순간, 그 어떤 것도 중요하지 않았다. + +광활한 하늘 아래, 끝없이 펼쳐진 바다 앞에서, 다니엘은 거의 잊고 있었던 무언가를 느꼈다. + +그는 평화를 느꼈다. + +그리고 그 조용한 깨달음 속에서 그는 이해했다. 때로는 집으로 돌아가는 여정이 장소로 돌아가는 것이 아니라, 더 이상 도망치지 않기로 했을 때 비로소 자신이 누구인지를 다시 발견하는 것임을.` + +module.exports = { en, es, zh, ja, he, ko } diff --git a/packages/tts-ggml/test/data/sentences-medium.js b/packages/tts-ggml/test/data/sentences-medium.js new file mode 100644 index 0000000000..73a073ada2 --- /dev/null +++ b/packages/tts-ggml/test/data/sentences-medium.js @@ -0,0 +1,10 @@ +'use strict' + +module.exports = { + en: 'This morning the sunlight came through the window and I felt refreshed. After breakfast I decided to work at a nearby coffee shop for a while. On the way I met an old friend and we chatted for a few minutes. At noon I ordered a simple bowl of noodles at a small restaurant. On the way home in the afternoon I bought some fruit. Tonight I plan to rest early because tomorrow I have an important meeting.', + es: 'Esta manana la luz del sol entro por la ventana y me senti lleno de energia. Despues de un desayuno ligero sali a correr un rato por el barrio. De vuelta me encontre con un vecino y hablamos sobre el clima. Al mediodia prepare una ensalada y pan fresco en la cocina. Por la tarde lei un articulo profesional y anote algunas ideas importantes. Por la noche planeo ver una serie y dormir temprano para empezar manana descansado.', + zh: '今天早晨阳光照进窗户,我感觉精神很好。吃完早餐后,我计划去附近的咖啡店工作一会儿。路上我遇见一位老朋友,我们聊了几分钟。中午我在小餐馆点了一份简单的面条。下午回家的路上我买了一些水果。晚上我想早点休息,明天还有重要的会议。', + ja: '今朝は窓から明るい光が差し込み、とても清々しい気分でした。朝食のあと、近くのカフェで少し仕事をすることにしました。途中で旧友に会い、立ち話を楽しみました。昼食には小さな店でラーメンを食べました。帰り道に果物を買い、家で家族と穏やかな時間を過ごしました。明日は忙しい一日になりそうですが、今夜はゆっくり休みたいと思います。', + he: 'היום התעוררתי מוקדם והרגשתי מלא אנרגיה. 
אחרי ארוחת בוקר קלה יצאתי לריצה קצרה בשכונה. בדרך חזרה פגשתי שכן ודיברנו על מזג האוויר. בצהריים הכנתי סלט ולחם טרי במטבח. אחר הצהריים קראתי מאמר מקצועי והערותי כמה רעיונות חשובים. בערב אני מתכנן לצפות בסדרה ולישון מוקדם כדי להתחיל מחר רענן.', + ko: '오늘 아침 햇살이 창문으로 들어와 기분이 상쾌했습니다. 아침을 먹은 뒤 근처 카페에서 잠시 일을 하기로 했습니다. 길에서 옛 친구를 만나 잠깐 이야기를 나눴습니다. 점심에는 작은 식당에서 국수를 먹었습니다. 오후에는 과일가게에 들러 사과와 바나나를 샀습니다. 저녁에는 가족과 함께 식사하고 일찍 쉬려고 합니다.' +} diff --git a/packages/tts-ggml/test/integration/addon.test.js b/packages/tts-ggml/test/integration/addon.test.js new file mode 100644 index 0000000000..c83bde7135 --- /dev/null +++ b/packages/tts-ggml/test/integration/addon.test.js @@ -0,0 +1,353 @@ +'use strict' + +const test = require('brittle') +const os = require('bare-os') +const path = require('bare-path') +const fs = require('bare-fs') + +const { loadChatterboxTTS, runChatterboxTTS, runChatterboxTTSWithSplit, runChatterboxStreaming } = require('../utils/runChatterboxTTS') +const { ensureChatterboxModels, ensureWhisperModel } = require('../utils/downloadModel') +const { loadWhisper, runWhisper } = require('../utils/runWhisper') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' +const isDarwin = platform === 'darwin' +const forceNoGpu = os.getEnv('NO_GPU') === 'true' + +const INPUT_SENTENCES = (isMobile ? 'short' : os.getEnv('INPUT_SENTENCES')) || 'short' +const useSplit = INPUT_SENTENCES !== 'short' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +const ENGLISH_SENTENCES_SHORT = [ + 'The quick brown fox jumps over the lazy dog.', + 'How are you doing today?' 
+] + +function getEnglishSentences () { + if (INPUT_SENTENCES === 'short') return ENGLISH_SENTENCES_SHORT + const { en } = require(`../data/sentences-${INPUT_SENTENCES}`) + return en +} + +test('Chatterbox TTS (ggml): English synthesis + optional WER verification', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + const whisperModelDir = path.join(baseDir, 'models', 'whisper') + + console.log('\n=== Ensuring Chatterbox GGUFs ===') + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + console.log('Chatterbox GGUFs not available locally; see instructions above.') + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + t.ok(download.success, 'Chatterbox GGUFs should be available') + + if (isDarwin) { + console.log('\n=== Ensuring Whisper model (for WER verification) ===') + const whisperModelPath = path.join(whisperModelDir, 'ggml-small.bin') + await ensureWhisperModel(whisperModelPath) + t.pass('Whisper model present') + } + + const expectation = { + minSamples: 5000, + maxSamples: 5000000, + minDurationMs: 200, + maxDurationMs: 300000 + } + + const werEntries = [] + const englishSentences = getEnglishSentences() + + // `ensureChatterboxModels` may resolve to a different dir on Android + // (the adb-push-friendly candidate paths under /sdcard/...), so use + // the dir it actually found the GGUFs in. + const resolvedModelDir = download.targetDir + console.log(`\n=== English synthesis (${englishSentences.length} sentences, tier: ${INPUT_SENTENCES}, modelDir: ${resolvedModelDir}) ===`) + const model = await loadChatterboxTTS({ + modelDir: resolvedModelDir, + language: 'en' + }) + t.ok(model, 'Chatterbox (ggml) model should be loaded') + + const runner = useSplit ? 
runChatterboxTTSWithSplit : runChatterboxTTS + + for (let i = 0; i < englishSentences.length; i++) { + const text = englishSentences[i] + const preview = text.substring(0, 60) + (text.length > 60 ? '...' : '') + console.log(`\n--- English ${i + 1}/${englishSentences.length}: "${preview}" ---`) + + const saveWav = !isMobile + const wavPath = saveWav ? path.join(baseDir, 'test', 'output', `chatterbox-english-${i + 1}.wav`) : undefined + + const result = await runner(model, { text, saveWav, wavOutputPath: wavPath }, expectation) + console.log(result.output) + + t.ok(result.passed, `English TTS ${i + 1} should pass expectations`) + t.ok(result.data.sampleCount > 0, `English TTS ${i + 1} should produce audio samples`) + t.is(result.data.reportedSampleRate, 24000, 'Sample rate should be native 24 kHz') + + const wavBuffer = result.data?.wavBuffer ? Buffer.from(result.data.wavBuffer) : null + werEntries.push({ text, lang: 'en', wavBuffer, sampleCount: result.data.sampleCount, durationMs: result.data.durationMs }) + } + + await model.unload() + t.pass('Chatterbox model unloaded') + + console.log('\n=== WER verification ===') + if (!isDarwin) { + t.pass('WER verification skipped (non-darwin)') + } else if (INPUT_SENTENCES !== 'short') { + t.pass('WER verification skipped (non-short input)') + } else { + const whisperModel = await loadWhisper({ + modelName: 'ggml-small.bin', + diskPath: whisperModelDir, + language: 'en' + }) + t.ok(whisperModel, 'Whisper model should be loaded') + + for (let i = 0; i < werEntries.length; i++) { + const entry = werEntries[i] + if (!entry.wavBuffer) { + console.log(`\n--- Whisper ${i + 1}/${werEntries.length}: skipped (no WAV buffer) ---`) + continue + } + + console.log(`\n--- Whisper ${i + 1}/${werEntries.length}: "${entry.text.substring(0, 50)}..." 
---`) + const whisperResult = await runWhisper(whisperModel, entry.text, entry.wavBuffer) + const werPct = (whisperResult.wer * 100).toFixed(1) + console.log(`>>> [WHISPER] [en] WER: ${werPct}%`) + + const threshold = 0.4 + t.ok(whisperResult.wer <= threshold, `WER should be ≤ ${threshold * 100}% (got ${werPct}%)`) + } + + await whisperModel.unload() + console.log('Whisper model unloaded') + } + + console.log('\n' + '='.repeat(60)) + console.log('CHATTERBOX (ggml) TEST SUMMARY') + console.log('='.repeat(60)) + for (const e of werEntries) { + console.log(` [${e.lang}] ${e.sampleCount} samples, ${e.durationMs?.toFixed(0) || 'N/A'}ms - "${e.text.substring(0, 50)}..."`) + } + console.log('='.repeat(60)) +}) + +test('Chatterbox TTS (ggml): synthesizes without referenceAudio using the built-in voice baked into the S3Gen GGUF', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + + // referenceAudio omitted on purpose: chatterbox::Engine falls back to + // the voice profile baked into the S3Gen GGUF (see qvac-tts.cpp's + // built-in voice condition). ChatterboxModel::validateConfig only + // rejects referenceAudio when it's set AND the file is missing; an + // empty/undefined value flows through to the engine cleanly. + const TTSGgml = require('@qvac/tts-ggml') + const model = new TTSGgml({ + files: { modelDir: download.targetDir }, + config: { language: 'en', ...(forceNoGpu ? { useGPU: false } : {}) }, + opts: { stats: true } + }) + + try { + await model.load() + + const response = await model.run({ + type: 'text', + input: 'Hello from the built-in voice.' 
+ }) + let samples = 0 + let reportedSampleRate = null + await response + .onUpdate(data => { + if (data && data.outputArray) samples += data.outputArray.length + if (data && data.sampleRate) reportedSampleRate = data.sampleRate + }) + .await() + + t.ok(samples > 5000, `built-in voice should produce > 5000 samples (got ${samples})`) + t.is(reportedSampleRate, 24000, 'built-in voice still emits at 24 kHz native rate') + if (response.stats) { + t.ok(response.stats.totalSamples > 0, 'built-in voice run reports stats') + t.ok(typeof response.stats.realTimeFactor === 'number', 'built-in voice run reports RTF') + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Chatterbox TTS (ggml): outputSampleRate option is accepted (pass-through for now)', { timeout: 300000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + + // Native output is always 24 kHz for Chatterbox; outputSampleRate resampling + // is reserved for the persistent-engine milestone. This test just verifies + // the option flows end-to-end without errors. + const TTSGgml = require('@qvac/tts-ggml') + const model = new TTSGgml({ + files: { modelDir: download.targetDir }, + referenceAudio: path.join(__dirname, '..', 'reference-audio', 'jfk.wav'), + config: { language: 'en', outputSampleRate: 16000, ...(forceNoGpu ? { useGPU: false } : {}) }, + opts: { stats: true } + }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Hello world.' 
}) + let samples = 0 + await response + .onUpdate(data => { + if (data && data.outputArray) samples += data.outputArray.length + }) + .await() + + t.ok(samples > 0, 'Should produce non-empty output audio') + await model.unload() + + if (!fs.existsSync(path.join(baseDir, 'test', 'output'))) { + // Just a touchpoint so CI logs show output dir; not strictly required. + try { fs.mkdirSync(path.join(baseDir, 'test', 'output'), { recursive: true }) } catch (e) { /* ignore */ } + } +}) + +test('Chatterbox TTS (ggml): native C++ chunk streaming via streamChunkTokens', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + + // streamChunkTokens > 0 activates the native Engine chunked S3Gen+HiFT + // loop. The addon publishes each chunk's PCM via the outputQueue so + // every `onUpdate` carries a distinct chunk of audio rather than one + // concatenated final result. + const TTSGgml = require('@qvac/tts-ggml') + const model = new TTSGgml({ + files: { modelDir: download.targetDir }, + referenceAudio: path.join(__dirname, '..', 'reference-audio', 'jfk.wav'), + streamChunkTokens: 25, + streamFirstChunkTokens: 10, + cfmSteps: 1, + config: { language: 'en', ...(forceNoGpu ? { useGPU: false } : {}) }, + opts: { stats: true } + }) + await model.load() + t.pass('Chatterbox (ggml) model loaded with native streaming') + + const response = await model.run({ + type: 'text', + input: 'The quick brown fox jumps over the lazy dog. This is a slightly longer sentence to produce multiple native chunks.' 
+ }) + + const chunkIndices = [] + let totalSamples = 0 + let sawIsLast = false + let lastSeenIsLast = null + await response + .onUpdate(data => { + if (data && data.outputArray) { + chunkIndices.push(data.chunkIndex) + totalSamples += data.outputArray.length + if (data.isLast === true) sawIsLast = true + lastSeenIsLast = data.isLast + } + }) + .await() + + t.ok(chunkIndices.length >= 2, `native streaming should emit multiple chunks (got ${chunkIndices.length})`) + t.ok(totalSamples > 0, 'native streaming should produce audio samples') + for (let i = 0; i < chunkIndices.length; i++) { + t.is(chunkIndices[i], i, `chunk ${i} should carry chunkIndex=${i}`) + } + t.ok(sawIsLast, 'one of the chunks should carry isLast=true') + t.is(lastSeenIsLast, true, 'the final chunk should carry isLast=true') + + await model.unload() + t.pass('Model unloaded after native streaming') +}) + +test('Chatterbox TTS (ggml): streaming input + streaming PCM output (runStreaming + onUpdate)', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + console.log('\n=== Ensuring Chatterbox GGUFs (streaming) ===') + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + t.ok(download.success, 'Chatterbox GGUFs should be available') + + const model = await loadChatterboxTTS({ + modelDir: download.targetDir, + language: 'en' + }) + t.ok(model, 'Chatterbox (ggml) model should be loaded') + + const phrases = [ + 'First phrase arrives from the upstream text stream.', + 'A short pause could sit between chunks.', + 'Each yield is one discrete synthesis job.' + ] + + const expectation = { + minSamples: 15000, + maxSamples: 5000000, + minDurationMs: 400, + maxDurationMs: 300000 + } + + const saveWav = !isMobile + const wavOutputPath = saveWav + ? 
path.join(baseDir, 'test', 'output', 'chatterbox-streaming.wav') + : undefined + + console.log(`\n=== Running Chatterbox IO stream synthesis (runStreaming, ${phrases.length} phrases) ===`) + const result = await runChatterboxStreaming( + model, + { phrases, saveWav, wavOutputPath }, + expectation + ) + console.log(result.output) + + t.ok(result.passed, 'Streaming synthesis should pass expectations') + t.ok(result.data.sampleCount > 0, 'Streaming should produce audio samples') + t.is(result.data.reportedSampleRate, 24000, 'Streaming sample rate is native 24 kHz') + t.is( + result.data.streamChunkCount, + phrases.length, + 'runStreaming should emit one chunk per yielded phrase' + ) + t.is(result.data.sentenceChunks.length, phrases.length) + for (let i = 0; i < phrases.length; i++) { + t.is( + result.data.sentenceChunks[i], + phrases[i], + `chunk ${i} sentenceChunk should match the streamed-in phrase` + ) + } + + await model.unload() + t.pass('Chatterbox model unloaded') +}) diff --git a/packages/tts-ggml/test/integration/chatterbox-mtl.test.js b/packages/tts-ggml/test/integration/chatterbox-mtl.test.js new file mode 100644 index 0000000000..b9a6c27f92 --- /dev/null +++ b/packages/tts-ggml/test/integration/chatterbox-mtl.test.js @@ -0,0 +1,128 @@ +'use strict' + +// Chatterbox multilingual integration: same engine class, but loads +// the MTL GGUFs (chatterbox-t3-mtl + chatterbox-s3gen-mtl) and +// exercises a small sweep of non-en languages. The turbo English +// integration test lives in addon.test.js; this file is a +// language-coverage smoke that surfaces any regression in the +// multilingual variant's tokenizer / language-conditioning code paths +// (e.g. mtl_tokenizer break, run_t3 variant dispatch in tts-cpp). 
+ +const fs = require('bare-fs') +const os = require('bare-os') +const path = require('bare-path') +const proc = require('bare-process') +const test = require('brittle') + +const TTSGgml = require('@qvac/tts-ggml') +const { runTTS } = require('../utils/runTTS') +const { ensureChatterboxMtlModels } = require('../utils/downloadModel') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' +const NO_GPU = proc.env && proc.env.NO_GPU === 'true' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +const SAMPLE_RATE = 24000 + +const MTL_SENTENCES = [ + { lang: 'es', text: 'El zorro marrón salta sobre el perro perezoso.' }, + { lang: 'fr', text: 'Le renard brun saute par-dessus le chien paresseux.' }, + { lang: 'de', text: 'Der braune Fuchs springt über den faulen Hund.' }, + { lang: 'pt', text: 'A raposa marrom pula sobre o cachorro preguiçoso.' } +] + +async function loadChatterboxMtlTTS (params) { + const refWavPath = params.refWavPath || path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { + throw new Error('[Chatterbox MTL] reference audio not found at ' + refWavPath) + } + + const model = new TTSGgml({ + files: { + modelDir: params.modelDir, + t3Model: params.t3ModelPath, + s3genModel: params.s3genModelPath + }, + referenceAudio: refWavPath, + config: { + language: params.language || 'en', + ...(params.useGPU !== undefined ? 
{ useGPU: params.useGPU } : {}) + }, + opts: { stats: true } + }) + await model.load() + return model +} + +test('Chatterbox MTL TTS (ggml): synthesizes across es/fr/de/pt with shared engine', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureChatterboxMtlModels({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { + t.pass('Skipped: Chatterbox MTL GGUFs not available') + return + } + + const model = await loadChatterboxMtlTTS({ + modelDir: download.targetDir, + language: MTL_SENTENCES[0].lang, + useGPU: !NO_GPU + }) + try { + for (let i = 0; i < MTL_SENTENCES.length; i++) { + const { lang, text } = MTL_SENTENCES[i] + console.log(` [${lang}] "${text.slice(0, 50)}..."`) + if (i > 0) { + await model.reload({ language: lang }) + } + const result = await runTTS( + model, + { text }, + { minSamples: 5000, maxSamples: 5000000, minDurationMs: 200, maxDurationMs: 300000 }, + { sampleRate: SAMPLE_RATE, engineTag: 'Chatterbox MTL' } + ) + console.log(' ' + result.output) + + t.ok(result.passed, `MTL ${lang} run passes expectations`) + t.ok(result.data.sampleCount > 0, `MTL ${lang} produced audio`) + t.is(result.data.reportedSampleRate || SAMPLE_RATE, SAMPLE_RATE, `MTL ${lang} reports 24 kHz`) + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Chatterbox MTL TTS (ggml): backendDevice + backendId surfaced in stats', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureChatterboxMtlModels({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { + t.pass('Skipped: Chatterbox MTL GGUFs not available') + return + } + + const model = await loadChatterboxMtlTTS({ + modelDir: download.targetDir, + language: 'es', + useGPU: !NO_GPU + }) + try { + const result = await runTTS( + model, + { text: 'Comprobando los datos de telemetría del backend.' 
}, + { minSamples: 5000 }, + { sampleRate: SAMPLE_RATE, engineTag: 'Chatterbox MTL' } + ) + t.ok(result.passed, 'MTL run for backend telemetry passes') + if (result.data.stats) { + t.ok(typeof result.data.stats.backendDevice === 'number', 'backendDevice surfaced in stats') + t.ok(typeof result.data.stats.backendId === 'number', 'backendId surfaced in stats') + } else { + t.fail('expected stats from MTL run') + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) diff --git a/packages/tts-ggml/test/integration/gpu-smoke.test.js b/packages/tts-ggml/test/integration/gpu-smoke.test.js new file mode 100644 index 0000000000..a0740d0816 --- /dev/null +++ b/packages/tts-ggml/test/integration/gpu-smoke.test.js @@ -0,0 +1,260 @@ +'use strict' + +// GPU smoke tests for both tts-ggml engines (chatterbox + supertonic). +// +// Mirrors qvac-lib-infer-parakeet/test/integration/gpu-smoke.test.js's +// strict-on-CPU policy: a useGPU=true request that resolves to the CPU +// backend on a GPU-capable platform is treated as a regression because +// it usually means a build / linkage / kernel-init drift that CI must +// catch. Set QVAC_TTS_GPU_SMOKE_RELAX=1 to downgrade the gate to a +// warning (e.g. for a Linux host without Vulkan SDK, an emulator +// without Metal, or an Adreno-tier device that ggml-opencl rejects by +// design). +// +// CI runners without a real GPU (or hosted macOS where the +// Paravirtual Metal device crashes ggml's encoder) export NO_GPU=true +// to skip every smoke entry. Real GPU runners and local dev leave +// NO_GPU unset so the strict assertions still fire there. +// +// The strict gate uses `response.stats.backendDevice` (0=CPU, 1=GPU) +// and `response.stats.backendId` (0=CPU, 1=Metal, 2=CUDA, 3=Vulkan, +// 4=OpenCL, 99=other), both surfaced by ChatterboxModel + +// SupertonicModel after Engine::backend_device() / backend_name() were +// added in tts-cpp. 
+ +const fs = require('bare-fs') +const os = require('bare-os') +const path = require('bare-path') +const proc = require('bare-process') +const test = require('brittle') + +const { loadChatterboxTTS, runChatterboxTTS } = require('../utils/runChatterboxTTS') +const { loadSupertonicTTS, runSupertonicTTS } = require('../utils/runSupertonicTTS') +const { ensureChatterboxModels, ensureSupertonicModel } = require('../utils/downloadModel') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' +const RELAX = proc.env && proc.env.QVAC_TTS_GPU_SMOKE_RELAX === '1' +const NO_GPU = proc.env && proc.env.NO_GPU === 'true' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +function backendIdToName (id) { + switch (id) { + case 0: return 'CPU' + case 1: return 'Metal' + case 2: return 'CUDA' + case 3: return 'Vulkan' + case 4: return 'OpenCL' + case 99: return 'other-GPU' + default: return `unknown(${id})` + } +} + +// Which platforms wire up a GPU backend in tts-cpp's vcpkg port +// today (default-features in qvac-registry-vcpkg/ports/tts-cpp/vcpkg.json): +// - darwin / ios: metal +// - linux / win32: vulkan +// - android: vulkan + opencl +function expectsGpu () { + return ( + platform === 'darwin' || + platform === 'ios' || + platform === 'linux' || + platform === 'win32' || + platform === 'android' + ) +} + +function assertGpuBackend (t, engineTag, stats) { + if (!stats) { + t.fail(`${engineTag}/GPU: no response.stats returned (cannot verify backend)`) + return + } + const dev = stats.backendDevice + const id = stats.backendId + const name = backendIdToName(id) + console.log(`[${engineTag}/GPU] backendDevice=${dev} backendId=${id} (${name})`) + + if (!expectsGpu()) { + t.is(dev, 0, `${engineTag}/${platform}: backendDevice must be 0 (CPU) on platforms with no GPU wired in`) + return + } + + if (dev !== 1) { + const msg = `${engineTag}/${platform}: expected GPU backend, got ${name} 
(backendDevice=${dev}, backendId=${id}). ` + + 'useGPU=true was requested but the engine fell back to CPU. ' + + 'Inspect addon native logs for the load-time backend init message.' + if (RELAX) { + t.comment(`WARNING (relaxed): ${msg}`) + t.pass(`${engineTag}/GPU smoke completed (relaxed)`) + } else { + t.fail(msg) + } + return + } + + if (platform === 'darwin' || platform === 'ios') { + t.is(id, 1, `${engineTag}/${platform}: expected Metal backendId=1, got ${name}`) + } else if (platform === 'linux' || platform === 'win32') { + t.is(id, 3, `${engineTag}/${platform}: expected Vulkan backendId=3, got ${name}`) + } else if (platform === 'android') { + t.ok(id === 3 || id === 4, `${engineTag}/${platform}: expected Vulkan(3) or OpenCL(4) backendId, got ${name}`) + } +} + +// Companion to assertGpuBackend: when the caller passes useGPU=false we +// expect the engine to actually pick the CPU backend. This is the gate +// that prevents `useGPU=false` from silently still running on GPU when +// the underlying tts-cpp library default is non-zero n_gpu_layers. 
+function assertCpuBackend (t, engineTag, stats) { + if (!stats) { + t.fail(`${engineTag}/CPU: no response.stats returned (cannot verify backend)`) + return + } + const dev = stats.backendDevice + const id = stats.backendId + const name = backendIdToName(id) + console.log(`[${engineTag}/CPU] backendDevice=${dev} backendId=${id} (${name})`) + t.is(dev, 0, `${engineTag}: useGPU:false must resolve to backendDevice=0 (CPU), got ${name}`) + t.is(id, 0, `${engineTag}: useGPU:false must resolve to backendId=0 (CPU), got ${name}`) +} + +test('Chatterbox GPU smoke - useGPU=true must engage the GPU backend on GPU-capable platforms', { timeout: 600000, skip: NO_GPU }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + + const refWavPath = path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { + t.pass('Skipped: reference audio missing') + return + } + + const model = await loadChatterboxTTS({ + modelDir: download.targetDir, + refWavPath, + language: 'en', + useGPU: true + }) + try { + const result = await runChatterboxTTS( + model, + { text: 'GPU smoke check.' 
}, + { minSamples: 5000 } + ) + console.log(result.output) + t.ok(result.passed, 'Chatterbox/GPU produced expected sample count') + t.ok(result.data.sampleCount > 0, 'Chatterbox/GPU produced audio') + assertGpuBackend(t, 'Chatterbox', result.data.stats) + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic GPU smoke - useGPU=true is rejected at constructor (engine is CPU-only today)', { timeout: 60000 }, async (t) => { + const TTSGgml = require('@qvac/tts-ggml') + let threw = false + try { + /* eslint no-new: 0 */ + new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: '/dev/null' }, + voice: 'F1', + config: { language: 'en', useGPU: true } + }) + } catch (e) { + threw = true + t.ok(/CPU only today/.test(e.message), + 'rejection message references the engine docstring') + t.ok(/Pass config:.*useGPU: false/.test(e.message), + 'rejection message tells user how to fix') + } + t.ok(threw, 'TTSGgml constructor should throw on Supertonic + useGPU:true') +}) + +// CPU smoke: useGPU:false must actually pin the engine to CPU on every +// platform (no NO_GPU skip — CPU is expected to work everywhere). This +// is the counterpart to the GPU smoke above and exists because the +// previous tts-ggml behaviour left n_gpu_layers at the tts-cpp library +// default when useGPU:false was passed without an explicit nGpuLayers, +// which could silently fall back to GPU. Now that ChatterboxModel / +// SupertonicModel translate explicit useGPU=false → n_gpu_layers=0, +// these tests lock that contract in. 
+test('Chatterbox CPU smoke - useGPU=false must run on the CPU backend', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + const download = await ensureChatterboxModels({ targetDir: modelsDir }) + if (!download.success) { + t.pass('Skipped: Chatterbox GGUFs not available locally') + return + } + + const refWavPath = path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { + t.pass('Skipped: reference audio missing') + return + } + + const model = await loadChatterboxTTS({ + modelDir: download.targetDir, + refWavPath, + language: 'en', + useGPU: false + }) + try { + const result = await runChatterboxTTS( + model, + { text: 'CPU smoke check.' }, + { minSamples: 5000 } + ) + console.log(result.output) + t.ok(result.passed, 'Chatterbox/CPU produced expected sample count') + t.ok(result.data.sampleCount > 0, 'Chatterbox/CPU produced audio') + assertCpuBackend(t, 'Chatterbox', result.data.stats) + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic CPU smoke - useGPU=false must run on the CPU backend', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const modelsDir = path.join(baseDir, 'models') + + const download = await ensureSupertonicModel({ targetDir: modelsDir }) + if (!download || !download.success) { + t.pass('Skipped: Supertonic GGUF not available locally') + return + } + + const supertonicPath = download.path || + path.join(modelsDir, 'supertonic.gguf') + + const model = await loadSupertonicTTS({ + supertonicModelPath: supertonicPath, + language: 'en', + voice: 'F1', + useGPU: false + }) + try { + const result = await runSupertonicTTS( + model, + { text: 'CPU smoke check.' 
}, + { minSamples: 5000 } + ) + console.log(result.output) + t.ok(result.passed, 'Supertonic/CPU produced expected sample count') + t.ok(result.data.sampleCount > 0, 'Supertonic/CPU produced audio') + assertCpuBackend(t, 'Supertonic', result.data.stats) + } finally { + try { await model.unload() } catch (_e) {} + } +}) diff --git a/packages/tts-ggml/test/integration/multiple-runs.test.js b/packages/tts-ggml/test/integration/multiple-runs.test.js new file mode 100644 index 0000000000..6ef0e72596 --- /dev/null +++ b/packages/tts-ggml/test/integration/multiple-runs.test.js @@ -0,0 +1,276 @@ +'use strict' + +// Sequential / fresh-instance / reload stability tests for both engines. +// Mirrors qvac-lib-infer-parakeet/test/integration/multiple-transcriptions.test.js +// and qvac-lib-infer-onnx-tts's lifecycle assertions, adapted to the +// tts-ggml engine API. These exercise: +// +// 1. N back-to-back run() calls on the SAME loaded instance +// (catches per-call state leaks: stale _job handles, accumulating +// cancel flags, output queue draining, etc.). +// 2. Fresh model instances per run (catches addon-side +// destroyInstance regressions and ensures unload/load cycles are +// idempotent at the engine layer). +// 3. reload() across runs (catches engine swap-in semantics + +// sentence-stream context cleanup on reload). +// +// Both Chatterbox (turbo English) and Supertonic engines are exercised +// in sequence so a regression in either engine surfaces in CI. 
+ +const fs = require('bare-fs') +const os = require('bare-os') +const path = require('bare-path') +const proc = require('bare-process') +const test = require('brittle') + +const { loadChatterboxTTS, runChatterboxTTS } = require('../utils/runChatterboxTTS') +const { loadSupertonicTTS, runSupertonicTTS } = require('../utils/runSupertonicTTS') +const { + ensureChatterboxModels, + ensureSupertonicModel +} = require('../utils/downloadModel') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' +const NO_GPU = proc.env && proc.env.NO_GPU === 'true' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +const PHRASES = [ + 'The quick brown fox jumps over the lazy dog.', + 'Multiple consecutive runs share one engine instance.', + 'This is the third sentence in the sequential run test.' +] + +test('Chatterbox: multiple sequential runs reuse the same engine instance', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureChatterboxModels({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Chatterbox GGUFs not available'); return } + + const refWavPath = path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { t.pass('Skipped: reference audio missing'); return } + + const model = await loadChatterboxTTS({ + modelDir: download.targetDir, + refWavPath, + language: 'en', + useGPU: !NO_GPU + }) + try { + const timings = [] + for (let i = 0; i < PHRASES.length; i++) { + const t0 = Date.now() + const result = await runChatterboxTTS( + model, + { text: PHRASES[i] }, + { minSamples: 5000 } + ) + const wallMs = Date.now() - t0 + timings.push(wallMs) + console.log(` run ${i + 1}/${PHRASES.length}: ${result.data.sampleCount} samples (${wallMs}ms)`) + + t.ok(result.passed, `Chatterbox run ${i + 1} should pass expectations`) + t.ok(result.data.sampleCount > 0, `Chatterbox run ${i + 1} should 
produce audio`) + const stats = result.data.stats + if (stats) { + t.ok(typeof stats.realTimeFactor === 'number', `Chatterbox run ${i + 1} reports RTF`) + } + } + + const avg = timings.reduce((a, b) => a + b, 0) / timings.length + console.log(` avg wall-time across ${PHRASES.length} runs: ${avg.toFixed(0)}ms`) + t.ok(timings.length === PHRASES.length, 'all sequential runs completed') + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic: multiple sequential runs reuse the same engine instance', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const model = await loadSupertonicTTS({ + supertonicModelPath: download.path, + voice: 'F1', + language: 'en', + useGPU: false + }) + try { + const timings = [] + for (let i = 0; i < PHRASES.length; i++) { + const t0 = Date.now() + const result = await runSupertonicTTS( + model, + { text: PHRASES[i] }, + { minSamples: 5000 } + ) + const wallMs = Date.now() - t0 + timings.push(wallMs) + console.log(` run ${i + 1}/${PHRASES.length}: ${result.data.sampleCount} samples (${wallMs}ms)`) + + t.ok(result.passed, `Supertonic run ${i + 1} should pass expectations`) + t.ok(result.data.sampleCount > 0, `Supertonic run ${i + 1} should produce audio`) + const stats = result.data.stats + if (stats) { + t.ok(typeof stats.realTimeFactor === 'number', `Supertonic run ${i + 1} reports RTF`) + } + } + + const avg = timings.reduce((a, b) => a + b, 0) / timings.length + console.log(` avg wall-time across ${PHRASES.length} runs: ${avg.toFixed(0)}ms`) + t.ok(timings.length === PHRASES.length, 'all sequential runs completed') + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Chatterbox: fresh instance per run (app-restart simulation)', { timeout: 1800000 }, async (t) => { + const baseDir = 
getBaseDir() + const download = await ensureChatterboxModels({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Chatterbox GGUFs not available'); return } + + const refWavPath = path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { t.pass('Skipped: reference audio missing'); return } + + const N = 2 + const results = [] + for (let i = 0; i < N; i++) { + const t0 = Date.now() + const model = await loadChatterboxTTS({ + modelDir: download.targetDir, + refWavPath, + language: 'en', + useGPU: !NO_GPU + }) + const loadMs = Date.now() - t0 + try { + const t1 = Date.now() + const r = await runChatterboxTTS(model, { text: PHRASES[i % PHRASES.length] }, { minSamples: 5000 }) + const runMs = Date.now() - t1 + console.log(` instance ${i + 1}/${N}: load=${loadMs}ms run=${runMs}ms samples=${r.data.sampleCount}`) + results.push({ loadMs, runMs, sampleCount: r.data.sampleCount, passed: r.passed }) + } finally { + try { await model.unload() } catch (_e) {} + } + } + + t.ok(results.every(r => r.passed), 'every fresh instance should pass expectations') + t.ok(results.every(r => r.sampleCount > 0), 'every fresh instance should produce audio') +}) + +test('Supertonic: fresh instance per run (app-restart simulation)', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const N = 2 + const results = [] + for (let i = 0; i < N; i++) { + const t0 = Date.now() + const model = await loadSupertonicTTS({ + supertonicModelPath: download.path, + voice: 'F1', + language: 'en', + useGPU: false + }) + const loadMs = Date.now() - t0 + try { + const t1 = Date.now() + const r = await runSupertonicTTS(model, { text: PHRASES[i % PHRASES.length] }, { minSamples: 5000 }) + const runMs = Date.now() - t1 + console.log(` instance ${i + 
1}/${N}: load=${loadMs}ms run=${runMs}ms samples=${r.data.sampleCount}`) + results.push({ loadMs, runMs, sampleCount: r.data.sampleCount, passed: r.passed }) + } finally { + try { await model.unload() } catch (_e) {} + } + } + + t.ok(results.every(r => r.passed), 'every fresh instance should pass expectations') + t.ok(results.every(r => r.sampleCount > 0), 'every fresh instance should produce audio') +}) + +test('Chatterbox: reload() between runs preserves stability', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureChatterboxModels({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Chatterbox GGUFs not available'); return } + + const refWavPath = path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { t.pass('Skipped: reference audio missing'); return } + + const model = await loadChatterboxTTS({ + modelDir: download.targetDir, + refWavPath, + language: 'en', + useGPU: !NO_GPU + }) + try { + const r1 = await runChatterboxTTS(model, { text: 'First run before reload.' }, { minSamples: 5000 }) + t.ok(r1.passed, 'first run before reload should pass') + + await model.reload({ language: 'en' }) + t.pass('reload() resolved') + + const r2 = await runChatterboxTTS(model, { text: 'Second run after reload.' 
}, { minSamples: 5000 }) + t.ok(r2.passed, 'second run after reload should pass') + t.ok(r2.data.sampleCount > 0, 'reloaded model produces audio') + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic: reload() between runs preserves stability', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const model = await loadSupertonicTTS({ + supertonicModelPath: download.path, + voice: 'F1', + language: 'en', + useGPU: false + }) + try { + const r1 = await runSupertonicTTS(model, { text: 'First supertonic run before reload.' }, { minSamples: 5000 }) + t.ok(r1.passed, 'first run before reload should pass') + + await model.reload({ language: 'en' }) + t.pass('reload() resolved') + + const r2 = await runSupertonicTTS(model, { text: 'Second supertonic run after reload.' 
}, { minSamples: 5000 }) + t.ok(r2.passed, 'second run after reload should pass') + t.ok(r2.data.sampleCount > 0, 'reloaded model produces audio') + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Engine swap: chatterbox -> supertonic -> chatterbox in separate instances', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const cb = await ensureChatterboxModels({ targetDir: path.join(baseDir, 'models') }) + const st = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!cb.success || !st.success) { t.pass('Skipped: not all engines have models locally'); return } + + const refWavPath = path.join(__dirname, '..', 'reference-audio', 'jfk.wav') + if (!fs.existsSync(refWavPath)) { t.pass('Skipped: reference audio missing'); return } + + const c1 = await loadChatterboxTTS({ modelDir: cb.targetDir, refWavPath, language: 'en', useGPU: !NO_GPU }) + try { + const r = await runChatterboxTTS(c1, { text: 'Hello from chatterbox.' }, { minSamples: 5000 }) + t.ok(r.passed, 'first chatterbox instance OK') + } finally { try { await c1.unload() } catch (_e) {} } + + const s1 = await loadSupertonicTTS({ supertonicModelPath: st.path, voice: 'F1', language: 'en', useGPU: false }) + try { + const r = await runSupertonicTTS(s1, { text: 'Hello from supertonic.' }, { minSamples: 5000 }) + t.ok(r.passed, 'supertonic instance OK') + } finally { try { await s1.unload() } catch (_e) {} } + + const c2 = await loadChatterboxTTS({ modelDir: cb.targetDir, refWavPath, language: 'en', useGPU: !NO_GPU }) + try { + const r = await runChatterboxTTS(c2, { text: 'Hello from chatterbox again.' 
}, { minSamples: 5000 }) + t.ok(r.passed, 'second chatterbox instance OK after supertonic swap') + } finally { try { await c2.unload() } catch (_e) {} } +}) diff --git a/packages/tts-ggml/test/integration/supertonic-mtl.test.js b/packages/tts-ggml/test/integration/supertonic-mtl.test.js new file mode 100644 index 0000000000..e62b410ecf --- /dev/null +++ b/packages/tts-ggml/test/integration/supertonic-mtl.test.js @@ -0,0 +1,145 @@ +'use strict' + +// Supertonic multilingual integration: same engine class as +// supertonic.test.js but exercises a small sweep of non-en languages +// against the real ggml backend. Surfaces regressions in the MTL +// language-conditioning path (supertonic_preprocess.cpp's +// language_wrap_mode + is_supported_language gate). + +const os = require('bare-os') +const path = require('bare-path') +const test = require('brittle') + +const TTSGgml = require('@qvac/tts-ggml') +const { runSupertonicTTS } = require('../utils/runSupertonicTTS') +const { ensureSupertonicMtlModel } = require('../utils/downloadModel') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +const SAMPLE_RATE = 44100 + +const MTL_SENTENCES = [ + { lang: 'es', text: 'El zorro marrón salta sobre el perro perezoso.' }, + { lang: 'fr', text: 'Le renard brun saute par-dessus le chien paresseux.' }, + { lang: 'pt', text: 'A raposa marrom pula sobre o cachorro preguiçoso.' 
} +] + +async function loadSupertonicMtlTTS (params) { + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: params.supertonicModelPath }, + voice: params.voice || 'F1', + config: { language: params.language || 'en', useGPU: false }, + opts: { stats: true } + }) + await model.load() + return model +} + +test('Supertonic MTL TTS (ggml): synthesizes across es/fr/pt with shared engine', { timeout: 1800000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicMtlModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { + t.pass('Skipped: Supertonic MTL GGUF not available') + return + } + + const model = await loadSupertonicMtlTTS({ + supertonicModelPath: download.path, + language: MTL_SENTENCES[0].lang + }) + try { + for (let i = 0; i < MTL_SENTENCES.length; i++) { + const { lang, text } = MTL_SENTENCES[i] + console.log(` [${lang}] "${text.slice(0, 50)}..."`) + if (i > 0) { + await model.reload({ language: lang }) + } + const result = await runSupertonicTTS( + model, + { text }, + { minSamples: 5000, maxSamples: 5000000, minDurationMs: 200, maxDurationMs: 300000 } + ) + console.log(' ' + result.output) + + t.ok(result.passed, `Supertonic MTL ${lang} run passes expectations`) + t.ok(result.data.sampleCount > 0, `Supertonic MTL ${lang} produced audio`) + t.is(result.data.reportedSampleRate || SAMPLE_RATE, SAMPLE_RATE, `Supertonic MTL ${lang} reports 44.1 kHz`) + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic MTL TTS (ggml): unsupported language fails fast at engine load', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicMtlModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { + t.pass('Skipped: Supertonic MTL GGUF not available') + return + } + + // 'de' is in Chatterbox MTL's tier-1 set but NOT in Supertonic's + // (which only handles en/ko/es/pt/fr 
today; see + // supertonic_preprocess.cpp::is_supported_language). The native + // engine should reject the run with a clear "invalid Supertonic + // language" error rather than silently producing garbage. + const model = await loadSupertonicMtlTTS({ + supertonicModelPath: download.path, + language: 'de' + }) + try { + let failed = false + let message = '' + try { + const response = await model.run({ + type: 'text', + input: 'Der braune Fuchs springt über den faulen Hund.' + }) + await response.await() + } catch (e) { + failed = true + message = String(e && e.message) + } + t.ok(failed, 'unsupported language should reject the synthesis call') + t.ok(/language|Supertonic/i.test(message), + `error mentions language / Supertonic (got: ${message})`) + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic MTL TTS (ggml): backendDevice + backendId surfaced in stats', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicMtlModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { + t.pass('Skipped: Supertonic MTL GGUF not available') + return + } + + const model = await loadSupertonicMtlTTS({ + supertonicModelPath: download.path, + language: 'es' + }) + try { + const result = await runSupertonicTTS( + model, + { text: 'Comprobando los datos de telemetría del backend.' 
}, + { minSamples: 5000 } + ) + t.ok(result.passed, 'MTL run for backend telemetry passes') + if (result.data.stats) { + t.ok(typeof result.data.stats.backendDevice === 'number', 'backendDevice surfaced in stats') + t.ok(typeof result.data.stats.backendId === 'number', 'backendId surfaced in stats') + } else { + t.fail('expected stats from Supertonic MTL run') + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) diff --git a/packages/tts-ggml/test/integration/supertonic.test.js b/packages/tts-ggml/test/integration/supertonic.test.js new file mode 100644 index 0000000000..e0aff6e5ad --- /dev/null +++ b/packages/tts-ggml/test/integration/supertonic.test.js @@ -0,0 +1,203 @@ +'use strict' + +// Supertonic engine integration smoke + basic config / cancel coverage. +// Mirrors the per-engine integration shape used in +// qvac-lib-infer-onnx-tts/test/unit/supertonic.inference.test.js but +// runs against the real native ggml backend instead of the JS mock. + +const os = require('bare-os') +const path = require('bare-path') +const test = require('brittle') + +const { loadSupertonicTTS, runSupertonicTTS } = require('../utils/runSupertonicTTS') +const { ensureSupertonicModel } = require('../utils/downloadModel') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +test('Supertonic TTS (ggml): basic synthesis returns ~44.1 kHz audio + stats', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const model = await loadSupertonicTTS({ + supertonicModelPath: download.path, + voice: 'F1', + language: 'en', + useGPU: false + }) + try { + const wavPath = isMobile ? 
undefined : path.join(baseDir, 'test', 'output', 'supertonic-en.wav') + const result = await runSupertonicTTS( + model, + { text: 'The supertonic engine produces high quality speech in real time.', saveWav: !isMobile, wavOutputPath: wavPath }, + { minSamples: 10000, maxSamples: 5000000, minDurationMs: 250, maxDurationMs: 300000 } + ) + console.log(result.output) + + t.ok(result.passed, 'supertonic synth passes expectations') + t.ok(result.data.sampleCount > 0, 'supertonic produced audio') + t.is(result.data.reportedSampleRate, 44100, 'supertonic reports 44.1 kHz native sample rate') + if (result.data.stats) { + t.ok(typeof result.data.stats.realTimeFactor === 'number', 'supertonic stats include RTF') + t.ok(typeof result.data.stats.audioDurationMs === 'number', 'supertonic stats include audio duration') + t.ok(typeof result.data.stats.backendDevice === 'number', 'supertonic stats include backendDevice') + t.ok(typeof result.data.stats.backendId === 'number', 'supertonic stats include backendId') + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic TTS (ggml): cancel mid-flight rejects the response', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const model = await loadSupertonicTTS({ + supertonicModelPath: download.path, + voice: 'F1', + language: 'en', + useGPU: false + }) + try { + const response = await model.run({ type: 'text', input: 'Cancel this synthesis call before it completes.' 
}) + setTimeout(() => { response.cancel().catch(() => {}) }, 50) + + let failed = false + try { + await response.await() + } catch (e) { + failed = true + console.log(' cancel rejected with: ' + e.message) + } + t.ok(failed, 'cancelled supertonic response should reject') + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic TTS (ggml): runStream emits per-sentence chunks with chunkIndex / sentenceChunk / isLast', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const TTSGgml = require('@qvac/tts-ggml') + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: download.path }, + voice: 'F1', + config: { language: 'en', useGPU: false }, + opts: { stats: true } + }) + await model.load() + try { + const text = 'First sentence. Second sentence. Third sentence here.' 
+ const response = await model.runStream(text, { maxChunkScalars: 30 }) + const chunkIndices = [] + const sentenceChunks = [] + const isLastFlags = [] + let totalSamples = 0 + let lastSampleRate = null + await response + .onUpdate(d => { + if (d && d.outputArray) { + chunkIndices.push(d.chunkIndex) + sentenceChunks.push(d.sentenceChunk) + isLastFlags.push(!!d.isLast) + totalSamples += d.outputArray.length + if (d.sampleRate) lastSampleRate = d.sampleRate + } + }) + .await() + + t.ok(chunkIndices.length >= 2, `runStream produced multiple chunks (got ${chunkIndices.length})`) + for (let i = 0; i < chunkIndices.length; i++) { + t.is(chunkIndices[i], i, `chunk ${i} carries chunkIndex=${i}`) + t.ok(typeof sentenceChunks[i] === 'string' && sentenceChunks[i].length > 0, + `chunk ${i} carries non-empty sentenceChunk`) + } + t.is(isLastFlags.filter(Boolean).length, 1, 'exactly one isLast=true emitted') + t.is(isLastFlags[isLastFlags.length - 1], true, 'final chunk carries isLast=true') + t.is(lastSampleRate, 44100, 'supertonic sentence-stream chunks report 44.1 kHz') + t.ok(totalSamples > 0, 'stream produced audio samples') + if (response.stats) { + t.ok(response.stats.totalSamples >= totalSamples * 0.95, + 'merged stats totalSamples roughly matches concatenated chunk samples') + } + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic TTS (ggml): runStreaming with async iterator emits one job per yielded sentence', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const TTSGgml = require('@qvac/tts-ggml') + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: download.path }, + voice: 'F1', + config: { language: 'en', useGPU: false }, + opts: { stats: true } + }) + await model.load() + try { + async 
function * yields () { + yield 'First yielded sentence.' + yield 'Second yielded sentence.' + yield 'Third yielded sentence.' + } + const response = await model.runStreaming(yields()) + const updates = [] + await response.onUpdate(d => { + if (d && d.outputArray) updates.push(d) + }).await() + + t.is(updates.length, 3, 'one chunk per yielded sentence') + t.is(updates[0].chunkIndex, 0, 'chunk 0 has chunkIndex=0') + t.is(updates[2].chunkIndex, 2, 'chunk 2 has chunkIndex=2') + t.ok(updates.every(u => u.isLast === undefined), + 'isLast is undefined for async-iter mode (count not known up-front)') + t.ok(updates.every(u => u.sampleRate === 44100), + 'every chunk reports 44.1 kHz native sample rate') + } finally { + try { await model.unload() } catch (_e) {} + } +}) + +test('Supertonic TTS (ggml): voice + steps + speed knobs survive ttsParams round-trip', { timeout: 600000 }, async (t) => { + const baseDir = getBaseDir() + const download = await ensureSupertonicModel({ targetDir: path.join(baseDir, 'models') }) + if (!download.success) { t.pass('Skipped: Supertonic GGUF not available'); return } + + const model = await loadSupertonicTTS({ + supertonicModelPath: download.path, + voice: 'F1', + steps: 4, + speed: 1.0, + language: 'en', + useGPU: false + }) + try { + const params = model._buildTtsParams() + t.is(params.voice, 'F1') + t.is(params.steps, 4) + t.is(params.speed, 1) + + const result = await runSupertonicTTS( + model, + { text: 'Voice and steps test.' 
}, + { minSamples: 5000 } + ) + t.ok(result.passed, 'voice+steps run passes expectations') + } finally { + try { await model.unload() } catch (_e) {} + } +}) diff --git a/packages/tts-ggml/test/mobile/integration-runtime.cjs b/packages/tts-ggml/test/mobile/integration-runtime.cjs new file mode 100644 index 0000000000..968d3201c3 --- /dev/null +++ b/packages/tts-ggml/test/mobile/integration-runtime.cjs @@ -0,0 +1,41 @@ +'use strict' + +const path = require('bare-path') +const fs = require('bare-fs') +const proc = require('bare-process') +const { pathToFileURL } = require('bare-url') + +// Force the gpu-smoke integration test (and any other test that opts +// into NO_GPU) to skip the GPU paths on Device Farm. The desktop +// integration-test workflow toggles this via matrix `no_gpu: 'true'` +// -> job env, but mobile bundles execute on real devices where workflow +// env vars do not propagate. Setting it here means every test that +// reads `process.env.NO_GPU` (gpu-smoke.test.js etc.) sees the same +// off-switch on Device Farm. Drop or gate this assignment when the +// tts-ggml mobile GPU paths are stable enough for strict CI coverage on +// Adreno / Apple Silicon devices. 
+proc.env.NO_GPU = 'true' + +if (typeof Bare !== 'undefined' && typeof Bare.on === 'function') { + Bare.on('unhandledRejection', (reason) => { + console.error('[integration-runner] Unhandled rejection:', reason) + }) + Bare.on('uncaughtException', (err) => { + console.error('[integration-runner] Uncaught exception:', err) + }) +} + +async function runIntegrationModule (relativeModulePath, options = {}) { + const modulePath = path.join(__dirname, relativeModulePath) + + if (!fs.existsSync(modulePath)) { + console.warn(`[integration-runner] Missing module: ${relativeModulePath}`) + return 'missing' + } + + const moduleUrl = pathToFileURL(modulePath).href + await import(moduleUrl) + return modulePath +} + +global.runIntegrationModule = runIntegrationModule diff --git a/packages/tts-ggml/test/mobile/integration.auto.cjs b/packages/tts-ggml/test/mobile/integration.auto.cjs new file mode 100644 index 0000000000..f1c8ef8c32 --- /dev/null +++ b/packages/tts-ggml/test/mobile/integration.auto.cjs @@ -0,0 +1,40 @@ +'use strict' +require('./integration-runtime.cjs') + +// AUTO-GENERATED FILE. Run `npm run test:mobile:generate` to update. +// Each function mirrors a single file under test/integration/. 
+ +/* global runIntegrationModule */ + +async function runAddonTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/addon.test.js', options) +} + +async function runChatterboxMtlTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/chatterbox-mtl.test.js', options) +} + +async function runGpuSmokeTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/gpu-smoke.test.js', options) +} + +async function runMultipleRunsTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/multiple-runs.test.js', options) +} + +async function runSupertonicMtlTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/supertonic-mtl.test.js', options) +} + +async function runSupertonicTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/supertonic.test.js', options) +} + +module.exports = { + runAddonTest, + runChatterboxMtlTest, + runGpuSmokeTest, + runMultipleRunsTest, + runSupertonicMtlTest, + runSupertonicTest +} diff --git a/packages/tts-ggml/test/mobile/testAssets/jfk.wav b/packages/tts-ggml/test/mobile/testAssets/jfk.wav new file mode 100644 index 0000000000..3184d372cd Binary files /dev/null and b/packages/tts-ggml/test/mobile/testAssets/jfk.wav differ diff --git a/packages/tts-ggml/test/mock/MockedBinding.js b/packages/tts-ggml/test/mock/MockedBinding.js new file mode 100644 index 0000000000..57fea9388b --- /dev/null +++ b/packages/tts-ggml/test/mock/MockedBinding.js @@ -0,0 +1,154 @@ +'use strict' + +const state = Object.freeze({ + LOADING: 'loading', + LISTENING: 'listening', + PROCESSING: 'processing', + IDLE: 'idle' +}) + +class MockedBinding { + constructor ({ jobDelayMs = 10 } = {}) { + this._handle = null + this._state = state.LOADING + this.outputCb = null + 
this._baseInferenceCallback = null + this._cancelRequested = false + this._jobDelayMs = jobDelayMs + } + + createInstance (interfaceType, configurationParams, outputCb) { + console.log('Constructing the TTS addon') + this.outputCb = outputCb + this._handle = { id: Date.now() } + return this._handle + } + + setBaseInferenceCallback (callback) { + this._baseInferenceCallback = callback + } + + _callCallbacks (event, data, error) { + // addon-cpp 1.1.5 emits event, data, error only; job ownership stays in JS. + if (this.outputCb) { + this.outputCb(this, event, data, error) + } + if (this._baseInferenceCallback) { + this._baseInferenceCallback(this, event, data, error) + } + } + + activate (handle) { + if (handle !== this._handle) throw new Error('Invalid handle') + console.log('Activated the TTS addon') + this._state = state.LISTENING + } + + cancel (handle) { + if (handle !== this._handle) throw new Error('Invalid handle') + this._cancelRequested = true + } + + runJob (handle, data) { + if (handle !== this._handle) throw new Error('Invalid handle') + + if (this._state !== state.LISTENING) { + throw new Error('TTS addon is not accepting jobs (not in listening state)') + } + + if (!data || data.type !== 'text' || typeof data.input !== 'string') { + throw new TypeError('runJob(data) expects { type: "text", input: string }') + } + + this._state = state.PROCESSING + this._cancelRequested = false + + setTimeout(() => { + if (this._cancelRequested) { + this._callCallbacks('Error', null, 'Job cancelled') + this._state = state.LISTENING + return + } + + const sampleCount = data.input.length * 100 + const mockAudioSamples = new Int16Array(sampleCount) + for (let i = 0; i < sampleCount; i++) { + mockAudioSamples[i] = Math.floor(Math.sin(i * 0.1) * 10000) + } + + this._callCallbacks('AudioResult', { outputArray: mockAudioSamples }, null) + this._callCallbacks('RuntimeStats', { + totalTime: 0.12, + tokensPerSecond: 120, + realTimeFactor: 0.08, + audioDurationMs: sampleCount / 24, 
+ totalSamples: sampleCount + }, null) + this._state = state.LISTENING + }, this._jobDelayMs) + return true + } + + loadWeights (handle, weightsData) { + if (handle !== this._handle) throw new Error('Invalid handle') + } + + destroyInstance (handle) { + if (handle !== this._handle) throw new Error('Invalid handle') + this._handle = null + this._state = state.IDLE + } + + unload (handle) { + if (handle !== this._handle) throw new Error('Invalid handle') + this.destroyInstance(handle) + } + + status () { + return this._state + } + + pause () { + throw new Error('pause() is not supported in addon-cpp 1.x') + } + + stop () { + throw new Error('stop() is not supported in addon-cpp 1.x') + } + + load () { + throw new Error('load() is not supported in addon-cpp 1.x') + } + + reload () { + throw new Error('reload() is not supported in addon-cpp 1.x') + } + + append () { + throw new Error('append() is not supported in addon-cpp 1.x') + } + + unloadWeights () { + throw new Error('unloadWeights() is not supported in addon-cpp 1.x') + } + + set transitionCb (_) { + // no-op in addon-cpp 1.x mock + } + + get transitionCb () { + return null + } + + get state () { + return this._state + } + + set state (nextState) { + if (Object.values(state).includes(nextState)) { + this._state = nextState + } + } +} + +module.exports = MockedBinding diff --git a/packages/tts-ggml/test/mock/utils.js b/packages/tts-ggml/test/mock/utils.js new file mode 100644 index 0000000000..29cba8e5b0 --- /dev/null +++ b/packages/tts-ggml/test/mock/utils.js @@ -0,0 +1,14 @@ +'use strict' + +// A helper function to wait a short time (to allow setImmediate callbacks to fire). +const wait = (ms = 20) => new Promise(resolve => setTimeout(resolve, ms)) + +// Transition callback to log state changes. 
+const transitionCb = (instance, newState) => { + console.log(`Transitioned to: ${newState}`) +} + +module.exports = { + wait, + transitionCb +} diff --git a/packages/tts-ggml/test/reference-audio/jfk.wav b/packages/tts-ggml/test/reference-audio/jfk.wav new file mode 100644 index 0000000000..3184d372cd Binary files /dev/null and b/packages/tts-ggml/test/reference-audio/jfk.wav differ diff --git a/packages/tts-ggml/test/unit/chatterbox-mtl.inference.test.js b/packages/tts-ggml/test/unit/chatterbox-mtl.inference.test.js new file mode 100644 index 0000000000..a31f3a1ffe --- /dev/null +++ b/packages/tts-ggml/test/unit/chatterbox-mtl.inference.test.js @@ -0,0 +1,168 @@ +'use strict' + +const test = require('brittle') +const path = require('bare-path') +const TTSGgml = require('../../index.js') +const { TTSInterface } = require('../../tts.js') +const MockedBinding = require('../mock/MockedBinding.js') +const sinon = require('sinon') +const process = require('process') + +global.process = process + +function createMockedMtlModel ({ + onOutput = () => {}, + binding, + language = 'es', + files, + exclusiveRun = false, + extra = {} +} = {}) { + const model = new TTSGgml({ + files: files || { + t3Model: './models/chatterbox-t3-mtl.gguf', + s3genModel: './models/chatterbox-s3gen-mtl.gguf' + }, + config: { language }, + opts: { stats: true }, + exclusiveRun, + ...extra + }) + + sinon.stub(model, '_createAddon').callsFake((configurationParams, outputCb) => { + const _binding = binding || new MockedBinding() + const addon = new TTSInterface(_binding, configurationParams, outputCb) + if (_binding.setBaseInferenceCallback) { + _binding.setBaseInferenceCallback(onOutput) + } + return addon + }) + return model +} + +test('Chatterbox MTL: explicit MTL gguf paths route to chatterbox engine', (t) => { + const model = createMockedMtlModel() + t.is(model.getEngineType(), TTSGgml.ENGINE_CHATTERBOX, 'MTL gguf is still chatterbox') + t.is(model._t3ModelPath, 
'./models/chatterbox-t3-mtl.gguf') + t.is(model._s3genModelPath, './models/chatterbox-s3gen-mtl.gguf') +}) + +test('Chatterbox MTL: language config is forwarded into ttsParams', async (t) => { + for (const lang of ['es', 'fr', 'de', 'pt', 'it', 'zh', 'ja', 'ko']) { + const model = createMockedMtlModel({ language: lang }) + const params = model._buildTtsParams() + t.is(params.language, lang, `language ${lang} should be forwarded`) + t.is(params.engineType, TTSGgml.ENGINE_CHATTERBOX, `language ${lang} keeps chatterbox engine`) + } +}) + +test('Chatterbox MTL: synthesis returns audio output and stats with non-en language', async (t) => { + const events = [] + const model = createMockedMtlModel({ + language: 'fr', + onOutput: (addon, event, data, error) => events.push({ event, data, error }) + }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Bonjour le monde.' }) + const outputs = [] + await response.onUpdate(d => outputs.push(d)).await() + + t.ok(outputs.length > 0, 'MTL run emits at least one update') + t.ok(outputs.some(d => d.outputArray), 'MTL output has outputArray') + t.ok(response.stats.totalSamples > 0, 'MTL stats include totalSamples') + t.ok(events.length > 0, 'raw addon callback fired for MTL run') + await model.unload() +}) + +test('Chatterbox MTL: cancel propagates as job failure', async (t) => { + const model = createMockedMtlModel({ language: 'es' }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Cancelar esto' }) + await response.cancel() + + let failed = false + try { + await response.await() + } catch (error) { + failed = true + t.ok(String(error.message).includes('cancel'), 'cancelled MTL response rejects') + } + t.ok(failed, 'cancelled MTL response should fail') + await model.unload() +}) + +test('Chatterbox MTL: reload({ language }) swaps language without reloading weights from disk', async (t) => { + const model = createMockedMtlModel({ language: 'es' }) + await model.load() + 
+ const r1 = await model.run({ type: 'text', input: 'Hola' }) + await r1.await() + + await model.reload({ language: 'fr' }) + t.is(model._config.language, 'fr', 'language updated to fr') + + const r2 = await model.run({ type: 'text', input: 'Bonjour' }) + await r2.await() + t.ok(r2.stats.totalSamples > 0, 'reloaded MTL model produces audio') + await model.unload() +}) + +test('Chatterbox MTL: modelDir auto-detects MTL gguf when only MTL files are present', async (t) => { + const fs = require('bare-fs') + const os = require('bare-os') + const tmpRoot = path.join(os.tmpdir(), 'tts-ggml-mtl-detect-' + Date.now()) + try { + fs.mkdirSync(tmpRoot, { recursive: true }) + fs.writeFileSync(path.join(tmpRoot, 'chatterbox-t3-mtl.gguf'), 'mtl-marker') + fs.writeFileSync(path.join(tmpRoot, 'chatterbox-s3gen-mtl.gguf'), 'mtl-marker') + + const model = new TTSGgml({ + files: { modelDir: tmpRoot }, + config: { language: 'es' } + }) + t.is( + model._t3ModelPath, + path.join(tmpRoot, 'chatterbox-t3-mtl.gguf'), + 'MTL t3 wins when only MTL is present' + ) + t.is( + model._s3genModelPath, + path.join(tmpRoot, 'chatterbox-s3gen-mtl.gguf'), + 'MTL s3gen wins when only MTL is present' + ) + } finally { + try { fs.rmSync(tmpRoot, { recursive: true, force: true }) } catch (_e) {} + } +}) + +test('Chatterbox MTL: modelDir prefers turbo over MTL when both are present', async (t) => { + const fs = require('bare-fs') + const os = require('bare-os') + const tmpRoot = path.join(os.tmpdir(), 'tts-ggml-mtl-mixed-' + Date.now()) + try { + fs.mkdirSync(tmpRoot, { recursive: true }) + fs.writeFileSync(path.join(tmpRoot, 'chatterbox-t3-turbo.gguf'), 'turbo-marker') + fs.writeFileSync(path.join(tmpRoot, 'chatterbox-t3-mtl.gguf'), 'mtl-marker') + fs.writeFileSync(path.join(tmpRoot, 'chatterbox-s3gen.gguf'), 'turbo-marker') + fs.writeFileSync(path.join(tmpRoot, 'chatterbox-s3gen-mtl.gguf'), 'mtl-marker') + + const model = new TTSGgml({ + files: { modelDir: tmpRoot }, + config: { language: 'en' } + }) + 
t.is( + model._t3ModelPath, + path.join(tmpRoot, 'chatterbox-t3-turbo.gguf'), + 'turbo t3 wins over MTL when both are on disk' + ) + t.is( + model._s3genModelPath, + path.join(tmpRoot, 'chatterbox-s3gen.gguf'), + 'turbo s3gen wins over MTL when both are on disk' + ) + } finally { + try { fs.rmSync(tmpRoot, { recursive: true, force: true }) } catch (_e) {} + } +}) diff --git a/packages/tts-ggml/test/unit/chatterbox.inference.test.js b/packages/tts-ggml/test/unit/chatterbox.inference.test.js new file mode 100644 index 0000000000..4790a796f7 --- /dev/null +++ b/packages/tts-ggml/test/unit/chatterbox.inference.test.js @@ -0,0 +1,217 @@ +'use strict' + +const test = require('brittle') +const TTSGgml = require('../../index.js') +const { TTSInterface } = require('../../tts.js') +const MockedBinding = require('../mock/MockedBinding.js') +const process = require('process') + +global.process = process +const sinon = require('sinon') + +function createMockedModel ({ + onOutput = () => { }, + binding = undefined, + exclusiveRun = false +} = {}) { + const model = new TTSGgml({ + files: { + t3Model: './models/chatterbox-t3-turbo.gguf', + s3genModel: './models/chatterbox-s3gen.gguf' + }, + config: { language: 'en' }, + opts: { stats: true }, + exclusiveRun + }) + + sinon.stub(model, '_createAddon').callsFake((configurationParams, outputCb) => { + const _binding = binding || new MockedBinding() + const addon = new TTSInterface(_binding, configurationParams, outputCb) + if (_binding.setBaseInferenceCallback) { + _binding.setBaseInferenceCallback(onOutput) + } + return addon + }) + return model +} + +async function waitWithTimeout (promise, timeoutMs, message) { + let timeoutId + const timeoutPromise = new Promise((resolve, reject) => { + timeoutId = setTimeout(() => reject(new Error(message)), timeoutMs) + }) + try { + return await Promise.race([promise, timeoutPromise]) + } finally { + clearTimeout(timeoutId) + } +} + +test('Chatterbox: run returns audio output and stats', async 
(t) => { + const events = [] + const callbackArity = [] + const model = createMockedModel({ + onOutput: function (addon, event, data, error) { + callbackArity.push(arguments.length) + events.push({ event, data, error }) + } + }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Hello world' }) + const outputs = [] + await response.onUpdate(data => outputs.push(data)).await() + + t.ok(outputs.length > 0, 'Response should emit at least one update') + t.ok(outputs.some(d => d.outputArray), 'Response should contain outputArray payload') + t.ok(response.stats.totalSamples > 0, 'Response stats should include total samples') + t.ok(events.length > 0, 'Raw addon callback should have been called') + t.ok(callbackArity.every(length => length === 4), 'Native callbacks should not include a native jobId argument') + await model.unload() +}) + +test('Chatterbox: exclusiveRun does not deadlock run()', async (t) => { + const model = createMockedModel({ exclusiveRun: true }) + await model.load() + + const response = await waitWithTimeout( + model.run({ type: 'text', input: 'Hello with exclusive run' }), + 1000, + 'run() timed out under exclusiveRun' + ) + + await waitWithTimeout( + response.await(), + 1000, + 'response.await() timed out under exclusiveRun' + ) + + t.ok(response.stats.totalSamples > 0, 'Exclusive run should still produce runtime stats') + await model.unload() +}) + +test('Chatterbox: reload reloads configuration', async (t) => { + const model = createMockedModel() + await model.load() + + const before = await model.run({ type: 'text', input: 'hello' }) + await before.await() + + await model.reload({ language: 'en' }) + const after = await model.run({ type: 'text', input: 'hello again' }) + await after.await() + + t.ok(after.stats.audioDurationMs > 0, 'Reloaded model should still produce stats') + await model.unload() +}) + +test('Chatterbox: exclusiveRun does not deadlock reload() or unload()', async (t) => { + const model = 
createMockedModel({ exclusiveRun: true }) + await model.load() + + await waitWithTimeout( + model.reload({ language: 'en' }), + 1000, + 'reload() timed out under exclusiveRun' + ) + + const response = await waitWithTimeout( + model.run({ type: 'text', input: 'after reload' }), + 1000, + 'run() after reload timed out under exclusiveRun' + ) + await waitWithTimeout( + response.await(), + 1000, + 'response.await() after reload timed out under exclusiveRun' + ) + + await waitWithTimeout( + model.unload(), + 1000, + 'unload() timed out under exclusiveRun' + ) + t.pass('exclusiveRun operations complete without deadlock') +}) + +test('Chatterbox: reload during in-flight job does not stay busy', async (t) => { + const binding = new MockedBinding({ jobDelayMs: 100 }) + const model = createMockedModel({ binding }) + await model.load() + + const inFlight = await model.run({ type: 'text', input: 'hello before reload' }) + await model.reload({ language: 'en' }) + + let rejected = false + try { + await inFlight.await() + } catch (error) { + rejected = true + t.ok(String(error.message).includes('reloaded'), 'In-flight job should fail on reload') + } + t.ok(rejected, 'Reload should reject the in-flight response') + + // Let stale callbacks from the destroyed addon drain before submitting a new job. 
+ await new Promise(resolve => setTimeout(resolve, 150)) + + const afterReload = await model.run({ type: 'text', input: 'hello after reload' }) + await afterReload.await() + t.ok(afterReload.stats.totalSamples > 0, 'Model should accept and complete jobs after reload') + + await model.unload() +}) + +test('Chatterbox: static methods return expected values', async (t) => { + const modelKey = TTSGgml.getModelKey({}) + t.is(modelKey, 'tts-ggml', 'getModelKey should return "tts-ggml"') + t.ok(TTSGgml.inferenceManagerConfig, 'inferenceManagerConfig should exist') + t.is(TTSGgml.inferenceManagerConfig.noAdditionalDownload, true, 'noAdditionalDownload should be true') +}) + +test('Chatterbox: modelDir fills in the two GGUF paths', async (t) => { + const path = require('bare-path') + const model = new TTSGgml({ + files: { modelDir: './models' } + }) + t.is( + model._t3ModelPath, + path.join('./models', 'chatterbox-t3-turbo.gguf'), + 'modelDir derives T3 GGUF path' + ) + t.is( + model._s3genModelPath, + path.join('./models', 'chatterbox-s3gen.gguf'), + 'modelDir derives S3Gen GGUF path' + ) +}) + +test('Chatterbox: explicit t3Model / s3genModel override modelDir defaults', async (t) => { + const model = new TTSGgml({ + files: { + modelDir: './models', + t3Model: '/abs/custom-t3.gguf', + s3genModel: '/abs/custom-s3gen.gguf' + } + }) + t.is(model._t3ModelPath, '/abs/custom-t3.gguf', 'explicit t3Model wins over modelDir') + t.is(model._s3genModelPath, '/abs/custom-s3gen.gguf', 'explicit s3genModel wins over modelDir') +}) + +test('Chatterbox: cancel propagates as job failure', async (t) => { + const model = createMockedModel() + await model.load() + + const response = await model.run({ type: 'text', input: 'cancel me' }) + await response.cancel() + + let failed = false + try { + await response.await() + } catch (error) { + failed = true + t.ok(String(error.message).includes('cancel'), 'Cancelled response should reject') + } + + t.ok(failed, 'Cancelled response should fail') + 
await model.unload() +}) diff --git a/packages/tts-ggml/test/unit/supertonic-mtl.inference.test.js b/packages/tts-ggml/test/unit/supertonic-mtl.inference.test.js new file mode 100644 index 0000000000..21db590d43 --- /dev/null +++ b/packages/tts-ggml/test/unit/supertonic-mtl.inference.test.js @@ -0,0 +1,124 @@ +'use strict' + +// Supertonic multilingual unit coverage: same engine class as +// supertonic.inference.test.js but exercises the language knob to make +// sure non-en codes are forwarded through ttsParams + reload, and that +// the JS layer doesn't introduce a hidden 'en'-only allow-list (the +// real allow-list lives in tts-cpp's supertonic_preprocess.cpp and is +// already covered by the integration test). + +const test = require('brittle') +const TTSGgml = require('../../index.js') +const { TTSInterface } = require('../../tts.js') +const MockedBinding = require('../mock/MockedBinding.js') +const sinon = require('sinon') +const process = require('process') + +global.process = process + +function createMockedSupertonicMtlModel ({ + onOutput = () => {}, + binding, + language = 'es', + voice = 'F1', + files, + exclusiveRun = false, + extra = {} +} = {}) { + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: files || { supertonicModel: './models/supertonic2.gguf' }, + voice, + config: { language, useGPU: false }, + opts: { stats: true }, + exclusiveRun, + ...extra + }) + + sinon.stub(model, '_createAddon').callsFake((configurationParams, outputCb) => { + const _binding = binding || new MockedBinding() + const addon = new TTSInterface(_binding, configurationParams, outputCb) + if (_binding.setBaseInferenceCallback) { + _binding.setBaseInferenceCallback(onOutput) + } + return addon + }) + return model +} + +test('Supertonic MTL: language config is forwarded into ttsParams', (t) => { + for (const lang of ['en', 'ko', 'es', 'pt', 'fr']) { + const model = createMockedSupertonicMtlModel({ language: lang }) + const params = 
model._buildTtsParams() + t.is(params.language, lang, `language ${lang} should be forwarded`) + t.is(params.engineType, TTSGgml.ENGINE_SUPERTONIC, `language ${lang} keeps supertonic engine`) + } +}) + +test('Supertonic MTL: synthesis returns audio output and stats with non-en language', async (t) => { + const events = [] + const model = createMockedSupertonicMtlModel({ + language: 'fr', + onOutput: (addon, event, data, error) => events.push({ event, data, error }) + }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Bonjour le monde.' }) + const outputs = [] + await response.onUpdate(d => outputs.push(d)).await() + + t.ok(outputs.length > 0, 'MTL run emits at least one update') + t.ok(outputs.some(d => d.outputArray), 'MTL output has outputArray') + t.ok(response.stats.totalSamples > 0, 'MTL stats include totalSamples') + t.ok(events.length > 0, 'raw addon callback fired for MTL run') + await model.unload() +}) + +test('Supertonic MTL: cancel propagates as job failure', async (t) => { + const model = createMockedSupertonicMtlModel({ language: 'es' }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Cancelar esto' }) + await response.cancel() + + let failed = false + try { + await response.await() + } catch (error) { + failed = true + t.ok(String(error.message).includes('cancel'), 'cancelled MTL response rejects') + } + t.ok(failed, 'cancelled MTL response should fail') + await model.unload() +}) + +test('Supertonic MTL: reload({ language }) swaps language without reloading weights from disk', async (t) => { + const model = createMockedSupertonicMtlModel({ language: 'es' }) + await model.load() + + const r1 = await model.run({ type: 'text', input: 'Hola' }) + await r1.await() + + await model.reload({ language: 'fr' }) + t.is(model._config.language, 'fr', 'language updated to fr') + + const r2 = await model.run({ type: 'text', input: 'Bonjour' }) + await r2.await() + t.ok(r2.stats.totalSamples > 0, 
'reloaded MTL model produces audio') + await model.unload() +}) + +test('Supertonic MTL: voice + language together survive ttsParams round-trip', (t) => { + const model = createMockedSupertonicMtlModel({ + language: 'pt', + voice: 'M2', + extra: { steps: 6, speed: 1.1, seed: 13 } + }) + const params = model._buildTtsParams() + t.is(params.language, 'pt') + t.is(params.voice, 'M2') + t.is(params.steps, 6) + t.is(params.speed, 1.1) + t.is(params.seed, 13) + t.is(params.useGPU, false, 'supertonic stays CPU-only on the JS side') +}) diff --git a/packages/tts-ggml/test/unit/supertonic.inference.test.js b/packages/tts-ggml/test/unit/supertonic.inference.test.js new file mode 100644 index 0000000000..3379388b7c --- /dev/null +++ b/packages/tts-ggml/test/unit/supertonic.inference.test.js @@ -0,0 +1,234 @@ +'use strict' + +const test = require('brittle') +const path = require('bare-path') +const TTSGgml = require('../../index.js') +const { TTSInterface } = require('../../tts.js') +const MockedBinding = require('../mock/MockedBinding.js') +const sinon = require('sinon') +const process = require('process') + +global.process = process + +function createMockedSupertonicModel ({ + onOutput = () => {}, + binding, + files, + voice = 'F1', + steps = 5, + speed = 1, + language = 'en', + exclusiveRun = false, + extra = {} +} = {}) { + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: files || { supertonicModel: './models/supertonic.gguf' }, + voice, + steps, + speed, + config: { language, useGPU: false }, + opts: { stats: true }, + exclusiveRun, + ...extra + }) + + sinon.stub(model, '_createAddon').callsFake((configurationParams, outputCb) => { + const _binding = binding || new MockedBinding() + const addon = new TTSInterface(_binding, configurationParams, outputCb) + if (_binding.setBaseInferenceCallback) { + _binding.setBaseInferenceCallback(onOutput) + } + return addon + }) + return model +} + +test('Supertonic: explicit engine option routes to supertonic', 
(t) => { + const model = createMockedSupertonicModel() + t.is(model.getEngineType(), TTSGgml.ENGINE_SUPERTONIC, 'engine: supertonic detected') + t.is(model._supertonicModelPath, './models/supertonic.gguf') + t.absent(model._t3ModelPath, 'no t3 path on supertonic') + t.absent(model._s3genModelPath, 'no s3gen path on supertonic') +}) + +test('Supertonic: supertonicModel file path alone routes to supertonic engine', (t) => { + const model = new TTSGgml({ + files: { supertonicModel: './models/super.gguf' }, + config: { language: 'en' } + }) + t.is(model.getEngineType(), TTSGgml.ENGINE_SUPERTONIC, 'supertonicModel file detected') +}) + +test('Supertonic: ttsParams shape passes voice/steps/speed/seed/threads/useGPU', (t) => { + const model = createMockedSupertonicModel({ + voice: 'M2', + steps: 8, + speed: 1.25, + extra: { seed: 7, threads: 2, nGpuLayers: 0 } + }) + const params = model._buildTtsParams() + t.is(params.engineType, TTSGgml.ENGINE_SUPERTONIC) + t.is(params.supertonicModelPath, './models/supertonic.gguf') + t.is(params.voice, 'M2') + t.is(params.steps, 8) + t.is(params.speed, 1.25) + t.is(params.seed, 7) + t.is(params.threads, 2) + t.is(params.nGpuLayers, 0, 'nGpuLayers=0 is the only allowed GPU value for supertonic today') + t.is(params.useGPU, false, 'useGPU follows config.useGPU') + t.absent(params.t3ModelPath, 'no t3 path leaked into supertonic params') + t.absent(params.s3genModelPath, 'no s3gen path leaked into supertonic params') +}) + +test('Supertonic: voice option also accepts voiceName for ONNX-tts cross-compat', (t) => { + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: './models/supertonic.gguf' }, + voiceName: 'F1', + numInferenceSteps: 3, + config: { language: 'en' } + }) + const params = model._buildTtsParams() + t.is(params.voice, 'F1', 'voiceName aliases to voice') + t.is(params.steps, 3, 'numInferenceSteps aliases to steps') +}) + +test('Supertonic: synthesis returns audio output and stats', 
async (t) => { + const events = [] + const model = createMockedSupertonicModel({ + onOutput: (addon, event, data, error) => events.push({ event, data, error }) + }) + await model.load() + + const response = await model.run({ type: 'text', input: 'Hello supertonic.' }) + const outputs = [] + await response.onUpdate(d => outputs.push(d)).await() + + t.ok(outputs.length > 0, 'supertonic emits at least one update') + t.ok(outputs.some(d => d.outputArray), 'supertonic output has outputArray') + t.ok(response.stats.totalSamples > 0, 'supertonic stats include totalSamples') + t.ok(events.length > 0, 'raw addon callback fired for supertonic run') + await model.unload() +}) + +test('Supertonic: cancel propagates as job failure', async (t) => { + const model = createMockedSupertonicModel() + await model.load() + + const response = await model.run({ type: 'text', input: 'Cancel this' }) + await response.cancel() + + let failed = false + try { + await response.await() + } catch (error) { + failed = true + t.ok(String(error.message).includes('cancel'), 'cancelled supertonic response rejects') + } + t.ok(failed, 'cancelled supertonic response should fail') + await model.unload() +}) + +test('Supertonic: invalid engine option rejects at constructor time', (t) => { + let threw = false + try { + /* eslint no-new: 0 */ + new TTSGgml({ + engine: 'parakeet', + files: { supertonicModel: './models/supertonic.gguf' } + }) + } catch (e) { + threw = true + t.ok(String(e.message).includes('chatterbox'), 'error message lists valid engines') + } + t.ok(threw, 'invalid engine should throw') +}) + +test('Supertonic: streamChunkTokens / streamFirstChunkTokens rejected at constructor', (t) => { + for (const knob of ['streamChunkTokens', 'streamFirstChunkTokens']) { + let threw = false + try { + /* eslint no-new: 0 */ + new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: './models/supertonic.gguf' }, + [knob]: 25 + }) + } catch (e) { + threw = true + t.ok( + 
/Chatterbox-only/.test(e.message), + `${knob} error mentions Chatterbox-only` + ) + t.ok( + /runStream\(\) \/ runStreaming\(\)/.test(e.message), + `${knob} error points at sentence-level streaming alternative` + ) + } + t.ok(threw, `passing ${knob} on supertonic should throw`) + } +}) + +test('Supertonic: runStream emits per-sentence chunks with chunkIndex + isLast (mocked)', async (t) => { + const model = createMockedSupertonicModel() + await model.load() + const text = 'First chunk one. Second chunk two. Third chunk three.' + const r = await model.runStream(text, { maxChunkScalars: 18 }) + const updates = [] + await r.onUpdate(d => updates.push(d)).await() + + const withChunk = updates.filter(u => u.chunkIndex !== undefined) + t.ok(withChunk.length >= 2, 'supertonic runStream emits multiple chunks') + t.is(withChunk[0].chunkIndex, 0, 'first chunkIndex is 0') + t.ok(typeof withChunk[0].sentenceChunk === 'string', 'sentenceChunk is a string') + const isLastFlags = withChunk.map(u => !!u.isLast) + t.is(isLastFlags.filter(Boolean).length, 1, 'exactly one isLast=true on the final chunk') + t.is(isLastFlags[isLastFlags.length - 1], true, 'final chunk carries isLast=true') + t.is(isLastFlags[0], false, 'first chunk is not isLast (if multiple chunks)') + await model.unload() +}) + +test('Supertonic: runStreaming with async iterator drives one job per sentence (mocked)', async (t) => { + const model = createMockedSupertonicModel() + await model.load() + async function * lines () { + yield 'First yielded sentence.' + yield 'Second yielded sentence.' + yield 'Third yielded sentence.' 
+ } + const r = await model.runStreaming(lines()) + const updates = [] + await r.onUpdate(d => updates.push(d)).await() + + const withChunk = updates.filter(u => u.chunkIndex !== undefined) + t.is(withChunk.length, 3, 'supertonic runStreaming emits 3 chunks') + t.is(withChunk[0].chunkIndex, 0) + t.is(withChunk[2].chunkIndex, 2) + t.ok(withChunk.every(u => u.isLast === undefined), 'isLast is undefined for async-iter mode (count not known up-front)') + await model.unload() +}) + +test('Supertonic: modelDir auto-detects supertonic.gguf', async (t) => { + const fs = require('bare-fs') + const os = require('bare-os') + const tmpRoot = path.join(os.tmpdir(), 'tts-ggml-supertonic-detect-' + Date.now()) + try { + fs.mkdirSync(tmpRoot, { recursive: true }) + fs.writeFileSync(path.join(tmpRoot, 'supertonic.gguf'), 'super-marker') + + const model = new TTSGgml({ + files: { modelDir: tmpRoot }, + voice: 'F1', + config: { language: 'en', useGPU: false } + }) + t.is(model.getEngineType(), TTSGgml.ENGINE_SUPERTONIC, 'modelDir with supertonic.gguf detected') + t.is( + model._supertonicModelPath, + path.join(tmpRoot, 'supertonic.gguf'), + 'supertonic path resolved from modelDir' + ) + } finally { + try { fs.rmSync(tmpRoot, { recursive: true, force: true }) } catch (_e) {} + } +}) diff --git a/packages/tts-ggml/test/unit/textChunker.test.js b/packages/tts-ggml/test/unit/textChunker.test.js new file mode 100644 index 0000000000..43a7594c24 --- /dev/null +++ b/packages/tts-ggml/test/unit/textChunker.test.js @@ -0,0 +1,41 @@ +'use strict' + +const test = require('brittle') +const { + splitTtsText, + intlSentenceSegmentationAvailable, + splitByAsciiAndCjkPunctuation +} = require('../../lib/textChunker.js') + +test('splitByAsciiAndCjkPunctuation splits on CJK full stops', (t) => { + const parts = splitByAsciiAndCjkPunctuation('第一句。第二句。') + t.is(parts.length, 2) + t.ok(parts[0].includes('一')) + t.ok(parts[1].includes('二')) +}) + +test('splitTtsText respects max length for long unbroken 
text', (t) => { + const long = 'x'.repeat(500) + const chunks = splitTtsText(long, { language: 'en', maxScalars: 300 }) + t.ok(chunks.length >= 2) + for (const c of chunks) { + t.ok([...c].length <= 300) + } +}) + +test('splitTtsText uses shorter chunks for Korean default', (t) => { + const body = '가'.repeat(200) + const chunks = splitTtsText(body, { language: 'ko' }) + t.ok(chunks.length >= 2) +}) + +test('intlSentenceSegmentationAvailable is boolean', (t) => { + t.is(typeof intlSentenceSegmentationAvailable(), 'boolean') +}) + +test('splitTtsText mergeToMaxScalars:false does not merge sentences by max length', (t) => { + const text = 'A. B. C. D. E.' + const sentenceLevel = splitTtsText(text, { language: 'en', mergeToMaxScalars: false }) + const merged = splitTtsText(text, { language: 'en', mergeToMaxScalars: true, maxScalars: 100 }) + t.ok(sentenceLevel.length >= merged.length) +}) diff --git a/packages/tts-ggml/test/unit/tts-ggml.lifecycle.test.js b/packages/tts-ggml/test/unit/tts-ggml.lifecycle.test.js new file mode 100644 index 0000000000..567e8b4118 --- /dev/null +++ b/packages/tts-ggml/test/unit/tts-ggml.lifecycle.test.js @@ -0,0 +1,72 @@ +'use strict' + +const test = require('brittle') +const sinon = require('sinon') +const TTSGgml = require('../../index.js') +const { TTSInterface } = require('../../tts.js') +const MockedBinding = require('../mock/MockedBinding.js') +const { QvacErrorAddonTTSGgml, ERR_CODES } = require('../../lib/error.js') +const process = require('process') + +global.process = process + +function createStubbedModel () { + const model = new TTSGgml({ + files: { + t3Model: './models/chatterbox-t3-turbo.gguf', + s3genModel: './models/chatterbox-s3gen.gguf' + }, + config: { language: 'en' } + }) + sinon.stub(model, '_createAddon').callsFake((configurationParams, outputCb) => { + return new TTSInterface(new MockedBinding(), configurationParams, outputCb) + }) + return model +} + +test('unload() clears load flags but not destroyed', async (t) 
=> { + const model = createStubbedModel() + await model.load() + let s = model.getState() + t.ok(s.configLoaded) + t.ok(s.weightsLoaded) + t.not(s.destroyed) + + await model.unload() + s = model.getState() + t.not(s.configLoaded) + t.not(s.weightsLoaded) + t.not(s.destroyed) +}) + +test('destroy() clears load flags and sets destroyed', async (t) => { + const model = createStubbedModel() + await model.load() + await model.destroy() + const s = model.getState() + t.not(s.configLoaded, 'config should be cleared') + t.not(s.weightsLoaded, 'weights should be cleared') + t.ok(s.destroyed, 'destroyed flag should be set') +}) + +test('destroy() without load still sets destroyed', async (t) => { + const model = createStubbedModel() + await model.destroy() + const s = model.getState() + t.not(s.configLoaded) + t.not(s.weightsLoaded) + t.ok(s.destroyed) +}) + +test('load() after destroy() rejects with FAILED_TO_LOAD', async (t) => { + const model = createStubbedModel() + await model.load() + await model.destroy() + try { + await model.load() + t.fail('load() should throw after destroy()') + } catch (err) { + t.ok(err instanceof QvacErrorAddonTTSGgml, 'should throw QvacErrorAddonTTSGgml') + t.is(err.code, ERR_CODES.FAILED_TO_LOAD, 'code should be FAILED_TO_LOAD') + } +}) diff --git a/packages/tts-ggml/test/unit/tts-ggml.sentence-stream.test.js b/packages/tts-ggml/test/unit/tts-ggml.sentence-stream.test.js new file mode 100644 index 0000000000..54a7459260 --- /dev/null +++ b/packages/tts-ggml/test/unit/tts-ggml.sentence-stream.test.js @@ -0,0 +1,194 @@ +'use strict' + +const test = require('brittle') +const sinon = require('sinon') +const { buildSentenceEndTester } = require('../../lib/textStreamAccumulator.js') +const TTSGgml = require('../../index.js') +const { TTSInterface } = require('../../tts.js') +const MockedBinding = require('../mock/MockedBinding.js') +const process = require('process') + +global.process = process + +function createStubbedModel (opts = {}) { + const 
model = new TTSGgml({ + files: { + t3Model: './models/chatterbox-t3-turbo.gguf', + s3genModel: './models/chatterbox-s3gen.gguf' + }, + config: { language: 'en' }, + opts: { stats: true }, + ...opts + }) + sinon.stub(model, '_createAddon').callsFake((configurationParams, outputCb) => { + return new TTSInterface(new MockedBinding({ jobDelayMs: 5 }), configurationParams, outputCb) + }) + return model +} + +test('runStream runs multiple native jobs and enriches output (onUpdate + await)', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + const text = + 'This is long text one. This is long text two. This is long text three.' + const response = await model.runStream(text, { maxChunkScalars: 18 }) + const updates = [] + response.onUpdate(d => { + updates.push(d) + }) + await response.await() + t.ok(runJobSpy.callCount >= 2, 'expected multiple runJob calls') + const withChunk = updates.filter(u => u.chunkIndex !== undefined) + t.ok(withChunk.length >= 2, 'expected chunk metadata on outputs') + t.is(withChunk[0].chunkIndex, 0) + t.ok(typeof withChunk[0].sentenceChunk === 'string') + t.ok(response.stats && typeof response.stats.totalTime === 'number') + runJobSpy.restore() +}) + +test('run({ streamOutput: true }) matches chunked runStream behavior', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + const text = + 'This is long text one. This is long text two. This is long text three.' 
+ const response = await model.run({ + input: text, + streamOutput: true, + maxChunkScalars: 18 + }) + const updates = [] + response.onUpdate(d => { + updates.push(d) + }) + await response.await() + t.ok(runJobSpy.callCount >= 2, 'expected multiple runJob calls') + const withChunk = updates.filter(u => u.chunkIndex !== undefined) + t.ok(withChunk.length >= 2, 'expected chunk metadata on outputs') + t.is(withChunk[0].chunkIndex, 0) + t.ok(typeof withChunk[0].sentenceChunk === 'string') + t.ok(response.stats && typeof response.stats.totalTime === 'number') + runJobSpy.restore() +}) + +test('runStreaming accumulate merges token stream into one job when sentence completes', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + async function * tokens () { + yield 'One ' + yield 'sentence ' + yield 'only.' + } + const response = await model.runStreaming(tokens()) + await response.await() + t.is(runJobSpy.callCount, 1) + runJobSpy.restore() +}) + +test('runStreaming accumulateSentences false runs one job per yield', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + async function * tokens () { + yield 'a' + yield 'b' + } + const response = await model.runStreaming(tokens(), { accumulateSentences: false }) + await response.await() + t.is(runJobSpy.callCount, 2) + runJobSpy.restore() +}) + +test('runStreaming accumulate hard-splits when buffer exceeds maxBufferScalars', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + async function * oneBig () { + yield 'a'.repeat(250) + } + const response = await model.runStreaming(oneBig(), { maxBufferScalars: 100 }) + await response.await() + t.is(runJobSpy.callCount, 3) + runJobSpy.restore() +}) + +test('runStreaming maxBufferScalars 0 falls back to default (no infinite 
loop)', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + async function * oneBig () { + yield 'a'.repeat(250) + } + const response = await model.runStreaming(oneBig(), { maxBufferScalars: 0 }) + await response.await() + t.is(runJobSpy.callCount, 1, '250 graphemes under default max ~300 → one job') + runJobSpy.restore() +}) + +test('buildSentenceEndTester resets global delimiter lastIndex before each test', (t) => { + const delimiter = /[.!?]\s*$/g + const testEnd = buildSentenceEndTester({ sentenceDelimiter: delimiter }) + t.ok(testEnd('A.')) + t.ok(testEnd('B.'), 'second buffer must match from lastIndex 0 (global /g otherwise sticks)') +}) + +test('runStreaming custom sentenceDelimiter with /g still flushes each fragment', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + const delimiter = /[.!?]\s*$/g + async function * parts () { + yield 'A.' + yield 'B.' + } + const response = await model.runStreaming(parts(), { sentenceDelimiter: delimiter }) + await response.await() + t.is(runJobSpy.callCount, 2) + runJobSpy.restore() +}) + +test('runStreaming yields multiple jobs from async text chunks', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + + async function * lines () { + yield 'First sentence for TTS.' + yield 'Second sentence follows.' + yield 'Third sentence ends here.' 
+ } + + const response = await model.runStreaming(lines()) + const updates = [] + response.onUpdate(d => { + updates.push(d) + }) + await response.await() + t.is(runJobSpy.callCount, 3, 'expected one runJob per yielded string') + const withChunk = updates.filter(u => u.chunkIndex !== undefined) + t.is(withChunk.length, 3) + t.is(withChunk[0].chunkIndex, 0) + t.is(withChunk[2].chunkIndex, 2) + t.ok(typeof withChunk[1].sentenceChunk === 'string') + runJobSpy.restore() +}) + +test('plain run() uses single job', async (t) => { + const runJobSpy = sinon.spy(MockedBinding.prototype, 'runJob') + const model = createStubbedModel() + await model.load() + const response = await model.run({ + input: 'Single block of text without extra splitting.' + }) + const updates = [] + for await (const d of response.iterate()) { + updates.push(d) + } + await response.await() + t.is(runJobSpy.callCount, 1) + const withChunk = updates.filter(u => u.chunkIndex !== undefined) + t.is(withChunk.length, 0) + runJobSpy.restore() +}) diff --git a/packages/tts-ggml/test/unit/tts.error.test.js b/packages/tts-ggml/test/unit/tts.error.test.js new file mode 100644 index 0000000000..7754ce2d7c --- /dev/null +++ b/packages/tts-ggml/test/unit/tts.error.test.js @@ -0,0 +1,180 @@ +'use strict' + +const test = require('brittle') +const { TTSInterface } = require('../../tts.js') +const { QvacErrorAddonTTSGgml, ERR_CODES } = require('../../lib/error.js') + +function createErrorBinding (errorMethods = {}) { + return { + createInstance: () => ({ id: 1 }), + activate: (handle) => { + if (errorMethods.activate) throw new Error(errorMethods.activate) + }, + runJob: (handle, data) => { + if (errorMethods.runJob) throw new Error(errorMethods.runJob) + return true + }, + cancel: (handle) => { + if (errorMethods.cancel) throw new Error(errorMethods.cancel) + }, + destroyInstance: (handle) => { + if (errorMethods.destroyInstance) throw new Error(errorMethods.destroyInstance) + }, + loadWeights: (handle, weightsData) => 
{ + if (errorMethods.loadWeights) throw new Error(errorMethods.loadWeights) + } + } +} + +test('activate() throws QvacErrorAddonTTSGgml with FAILED_TO_ACTIVATE code', async (t) => { + const errorMessage = 'Activation failed due to invalid state' + const binding = createErrorBinding({ activate: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.activate() + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error instanceof QvacErrorAddonTTSGgml, 'Error should be instance of QvacErrorAddonTTSGgml') + t.is(error.code, ERR_CODES.FAILED_TO_ACTIVATE, 'Error code should be FAILED_TO_ACTIVATE') + t.ok(error.message.includes(errorMessage), 'Error message should contain original error') + } +}) + +test('runJob() throws QvacErrorAddonTTSGgml with FAILED_TO_APPEND code', async (t) => { + const errorMessage = 'runJob failed' + const binding = createErrorBinding({ runJob: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.runJob({ type: 'text', input: 'Hello' }) + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error instanceof QvacErrorAddonTTSGgml, 'Error should be instance of QvacErrorAddonTTSGgml') + t.is(error.code, ERR_CODES.FAILED_TO_APPEND, 'Error code should be FAILED_TO_APPEND') + t.ok(error.message.includes(errorMessage), 'Error message should contain original error') + } +}) + +test('loadWeights() throws QvacErrorAddonTTSGgml with FAILED_TO_LOAD code', async (t) => { + const errorMessage = 'Load weights failed' + const binding = createErrorBinding({ loadWeights: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.loadWeights({ filename: 'foo', contents: new Uint8Array(0) }) + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error instanceof QvacErrorAddonTTSGgml, 'Error should be instance of QvacErrorAddonTTSGgml') + t.is(error.code, ERR_CODES.FAILED_TO_LOAD, 'Error code should be FAILED_TO_LOAD') + 
t.ok(error.message.includes(errorMessage), 'Error message should contain original error') + } +}) + +test('cancel() throws QvacErrorAddonTTSGgml with FAILED_TO_CANCEL code', async (t) => { + const errorMessage = 'Cancel operation failed' + const binding = createErrorBinding({ cancel: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.cancel() + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error instanceof QvacErrorAddonTTSGgml, 'Error should be instance of QvacErrorAddonTTSGgml') + t.is(error.code, ERR_CODES.FAILED_TO_CANCEL, 'Error code should be FAILED_TO_CANCEL') + t.ok(error.message.includes(errorMessage), 'Error message should contain original error') + } +}) + +test('cancel() calls native binding with the addon handle only', async (t) => { + const calls = [] + const binding = createErrorBinding() + binding.cancel = function () { + calls.push(Array.from(arguments)) + } + const tts = new TTSInterface(binding, {}) + + await tts.cancel() + + t.is(calls.length, 1, 'cancel should call the native binding once') + t.is(calls[0].length, 1, 'cancel should not forward a jobId argument') + t.alike(calls[0][0], tts._handle, 'cancel should forward the addon handle') +}) + +test('destroyInstance() throws QvacErrorAddonTTSGgml with FAILED_TO_DESTROY code', async (t) => { + const errorMessage = 'Failed to destroy instance' + const binding = createErrorBinding({ destroyInstance: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.destroyInstance() + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error instanceof QvacErrorAddonTTSGgml, 'Error should be instance of QvacErrorAddonTTSGgml') + t.is(error.code, ERR_CODES.FAILED_TO_DESTROY, 'Error code should be FAILED_TO_DESTROY') + t.ok(error.message.includes(errorMessage), 'Error message should contain original error') + } +}) + +test('destroyInstance() returns early if handle is null', async (t) => { + const binding = 
createErrorBinding({ destroyInstance: 'Should not be called' }) + const tts = new TTSInterface(binding, {}) + + // Manually set handle to null + tts._handle = null + + // Should not throw + await tts.destroyInstance() + t.pass('destroyInstance should return early without error when handle is null') +}) + +test('unload() delegates to destroyInstance and preserves errors', async (t) => { + const errorMessage = 'Destroy failed during unload' + const binding = createErrorBinding({ destroyInstance: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.unload() + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error instanceof QvacErrorAddonTTSGgml, 'Error should be instance of QvacErrorAddonTTSGgml') + t.is(error.code, ERR_CODES.FAILED_TO_DESTROY, 'Error code should be FAILED_TO_DESTROY') + t.ok(error.message.includes(errorMessage), 'Error message should contain original error') + } +}) + +test('Error cause is preserved in QvacErrorAddonTTSGgml', async (t) => { + const errorMessage = 'Original error message' + const binding = createErrorBinding({ activate: errorMessage }) + const tts = new TTSInterface(binding, {}) + + try { + await tts.activate() + t.fail('Should have thrown an error') + } catch (error) { + t.ok(error.cause, 'Error should have a cause property') + t.ok(error.cause instanceof Error, 'Cause should be an Error instance') + t.is(error.cause.message, errorMessage, 'Cause message should match original error') + } +}) + +test('All ERR_CODES are defined and unique', async (t) => { + const codes = Object.values(ERR_CODES) + const uniqueCodes = new Set(codes) + + t.is(codes.length, 11, 'Should have 11 error codes') + t.is(uniqueCodes.size, codes.length, 'All error codes should be unique') + + // Range 13001-14000 is reserved for @qvac/tts-ggml (see lib/error.js). 
+ t.is(ERR_CODES.FAILED_TO_ACTIVATE, 13001, 'FAILED_TO_ACTIVATE should be 13001') + t.is(ERR_CODES.FAILED_TO_APPEND, 13002, 'FAILED_TO_APPEND should be 13002') + t.is(ERR_CODES.FAILED_TO_GET_STATUS, 13003, 'FAILED_TO_GET_STATUS should be 13003') + t.is(ERR_CODES.FAILED_TO_PAUSE, 13004, 'FAILED_TO_PAUSE should be 13004') + t.is(ERR_CODES.FAILED_TO_CANCEL, 13005, 'FAILED_TO_CANCEL should be 13005') + t.is(ERR_CODES.FAILED_TO_DESTROY, 13006, 'FAILED_TO_DESTROY should be 13006') + t.is(ERR_CODES.FAILED_TO_UNLOAD, 13007, 'FAILED_TO_UNLOAD should be 13007') + t.is(ERR_CODES.FAILED_TO_LOAD, 13008, 'FAILED_TO_LOAD should be 13008') + t.is(ERR_CODES.FAILED_TO_RELOAD, 13009, 'FAILED_TO_RELOAD should be 13009') + t.is(ERR_CODES.FAILED_TO_STOP, 13010, 'FAILED_TO_STOP should be 13010') + t.is(ERR_CODES.JOB_ALREADY_RUNNING, 13011, 'JOB_ALREADY_RUNNING should be 13011') +}) diff --git a/packages/tts-ggml/test/utils/downloadModel.js b/packages/tts-ggml/test/utils/downloadModel.js new file mode 100644 index 0000000000..3b9c0defa2 --- /dev/null +++ b/packages/tts-ggml/test/utils/downloadModel.js @@ -0,0 +1,529 @@ +'use strict' + +const fs = require('bare-fs') +const path = require('bare-path') +const os = require('bare-os') +const process = require('bare-process') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' + +// Returns base directory for models - uses global.testDir on mobile, current dir otherwise +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +/** Returns true if file exists and is valid JSON; false if missing, wrong size, or invalid. 
*/ +function isValidJsonCache (filepath) { + try { + if (!fs.existsSync(filepath)) return false + const stats = fs.statSync(filepath) + // 1024 bytes is the binary placeholder size - treat as invalid cache for JSON + if (stats.size === 1024) return false + if (stats.size < 10) return false + const raw = fs.readFileSync(filepath, 'utf8') + const parsed = JSON.parse(raw) + return typeof parsed === 'object' && parsed !== null + } catch (e) { + return false + } +} + +/** + * Mobile-friendly HTTPS download using bare-https. + * Handles redirects and writes directly to file. + */ +async function downloadWithHttp (url, filepath, maxRedirects = 10) { + return new Promise((resolve, reject) => { + const https = require('bare-https') + const { URL } = require('bare-url') + + const parsedUrl = new URL(url) + + const options = { + hostname: parsedUrl.hostname, + port: parsedUrl.port || 443, + path: parsedUrl.pathname + parsedUrl.search, + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; bare-download/1.0)' + } + } + + console.log(` [HTTPS] Requesting: ${parsedUrl.hostname}${parsedUrl.pathname}`) + + const req = https.request(options, (res) => { + if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { + if (maxRedirects <= 0) { + reject(new Error('Too many redirects')) + return + } + const location = res.headers.location + let redirectUrl + if (location.startsWith('http://') || location.startsWith('https://')) { + redirectUrl = location + } else if (location.startsWith('/')) { + redirectUrl = `${parsedUrl.protocol}//${parsedUrl.host}${location}` + } else { + const basePath = parsedUrl.pathname.substring(0, parsedUrl.pathname.lastIndexOf('/') + 1) + redirectUrl = `${parsedUrl.protocol}//${parsedUrl.host}${basePath}${location}` + } + console.log(` [HTTPS] Redirecting to: ${redirectUrl}`) + downloadWithHttp(redirectUrl, filepath, maxRedirects - 1).then(resolve).catch(reject) + return + } + + if (res.statusCode !== 200) { + reject(new 
Error(`HTTP ${res.statusCode}: ${res.statusMessage}`)) + return + } + + const dir = path.dirname(filepath) + if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }) + + const writeStream = fs.createWriteStream(filepath) + let downloadedBytes = 0 + const contentLength = parseInt(res.headers['content-length'] || '0', 10) + + res.on('data', (chunk) => { + writeStream.write(chunk) + downloadedBytes += chunk.length + if (contentLength > 0 && downloadedBytes % (1024 * 1024) < chunk.length) { + const percent = ((downloadedBytes / contentLength) * 100).toFixed(1) + console.log(` [HTTPS] Progress: ${percent}% (${downloadedBytes} / ${contentLength} bytes)`) + } + }) + + res.on('end', () => { + writeStream.end() + writeStream.on('finish', () => resolve({ success: true, path: filepath })) + writeStream.on('error', reject) + }) + + res.on('error', reject) + }) + + req.on('error', reject) + req.end() + }) +} + +function getFileSizeFromUrl (url) { + try { + const { spawnSync } = require('bare-subprocess') + const result = spawnSync('curl', [ + '-I', '-L', url, + '--fail', '--silent', '--show-error', + '--connect-timeout', '10', + '--max-time', '30' + ], { stdio: ['inherit', 'pipe', 'pipe'] }) + + if (result.status === 0 && result.stdout) { + const output = result.stdout.toString() + const match = output.match(/content-length:\s*(\d+)/i) + if (match) return parseInt(match[1], 10) + } + } catch (e) { + console.log(` Warning: Could not get file size from URL: ${e.message}`) + } + return null +} + +async function ensureFileDownloaded (url, filepath) { + const isJson = filepath.endsWith('.json') + const dir = path.dirname(filepath) + if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }) + + const expectedSize = isMobile ? null : getFileSizeFromUrl(url) + const minSize = expectedSize ? Math.floor(expectedSize * 0.9) : (isJson ? 
100 : 1000000) + + if (fs.existsSync(filepath)) { + const stats = fs.statSync(filepath) + if (stats.size >= minSize) { + if (isJson && !isValidJsonCache(filepath)) { + console.log(` Cached JSON invalid or placeholder (${stats.size} bytes), re-downloading...`) + fs.unlinkSync(filepath) + } else { + console.log(` ✓ Using cached model: ${path.basename(filepath)} (${stats.size} bytes)`) + return { success: true, path: filepath, isReal: true } + } + } else { + console.log(` Cached file too small (${stats.size} bytes), re-downloading...`) + fs.unlinkSync(filepath) + } + } + + console.log(` Downloading: ${path.basename(filepath)}...`) + if (expectedSize) console.log(` Expected size: ${expectedSize} bytes`) + + if (isMobile) { + try { + const result = await downloadWithHttp(url, filepath) + if (result.success && fs.existsSync(filepath)) { + const stats = fs.statSync(filepath) + if (stats.size >= minSize) { + if (isJson && !isValidJsonCache(filepath)) { + console.log(' Downloaded file is not valid JSON, discarding') + fs.unlinkSync(filepath) + } else { + console.log(` ✓ Downloaded: ${path.basename(filepath)} (${stats.size} bytes)`) + return { success: true, path: filepath, isReal: true } + } + } else { + console.log(` Downloaded file too small: ${stats.size} bytes (expected >${minSize})`) + } + } + } catch (e) { + console.log(` HTTP download error: ${e.message}`) + } + } else { + try { + const { spawnSync } = require('bare-subprocess') + if (isJson) { + const result = spawnSync('curl', [ + '-L', url, + '--fail', '--silent', '--show-error', + '--connect-timeout', '30', + '--max-time', '300' + ], { stdio: ['inherit', 'pipe', 'pipe'] }) + + if (result.status === 0 && result.stdout) { + fs.writeFileSync(filepath, result.stdout) + const stats = fs.statSync(filepath) + if (stats.size >= minSize && isValidJsonCache(filepath)) { + console.log(` ✓ Downloaded: ${path.basename(filepath)} (${stats.size} bytes)`) + return { success: true, path: filepath, isReal: true } + } + 
fs.unlinkSync(filepath) + } else { + console.log(` Download failed with exit code: ${result.status}`) + } + } else { + const result = spawnSync('curl', [ + '-L', '-o', filepath, url, + '--fail', '--silent', '--show-error', + '--connect-timeout', '30', + '--max-time', '1800' + ], { stdio: ['inherit', 'inherit', 'pipe'] }) + + if (result.status === 0 && fs.existsSync(filepath)) { + const stats = fs.statSync(filepath) + if (stats.size >= minSize) { + console.log(` ✓ Downloaded: ${path.basename(filepath)} (${stats.size} bytes)`) + return { success: true, path: filepath, isReal: true } + } + console.log(` Downloaded file too small: ${stats.size} bytes (expected >${minSize})`) + } else { + console.log(` Download failed with exit code: ${result.status}`) + } + } + } catch (e) { + console.log(` Download error: ${e.message}`) + } + } + + // Only create placeholder for binary files; JSON placeholders confuse the size check. + if (!isJson) { + console.log(' Creating placeholder model for error testing') + fs.writeFileSync(filepath, Buffer.alloc(1024)) + } + return { success: false, path: filepath, isReal: false } +} + +// Whisper GGML (for the transcription-WER integration check). 
+const WHISPER_MODELS = { + 'ggml-small.bin': { url: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin', minSize: 460000000 }, + 'ggml-medium.bin': { url: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin', minSize: 1400000000 } +} + +async function ensureWhisperModel (targetPath = null) { + if (!targetPath) { + targetPath = path.join(getBaseDir(), 'models', 'whisper', 'ggml-medium.bin') + } + const modelFile = path.basename(targetPath) + const modelInfo = WHISPER_MODELS[modelFile] || WHISPER_MODELS['ggml-medium.bin'] + + if (fs.existsSync(targetPath)) { + const stats = fs.statSync(targetPath) + if (stats.size > modelInfo.minSize) { + console.log(` ✓ Whisper model already exists (${stats.size} bytes)`) + return { success: true, path: targetPath } + } + console.log(` Cached Whisper model too small (${stats.size} bytes), re-downloading...`) + fs.unlinkSync(targetPath) + } + + const dir = path.dirname(targetPath) + if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }) + + const result = await ensureFileDownloaded(modelInfo.url, targetPath) + return { success: result.success, path: targetPath } +} + +const CHATTERBOX_GGUFS = [ + { name: 'chatterbox-t3-turbo.gguf', minSize: 500_000_000 }, + { name: 'chatterbox-s3gen.gguf', minSize: 500_000_000 } +] + +const CHATTERBOX_MTL_GGUFS = [ + { name: 'chatterbox-t3-mtl.gguf', minSize: 500_000_000 }, + { name: 'chatterbox-s3gen-mtl.gguf', minSize: 500_000_000 } +] + +const SUPERTONIC_GGUFS = [ + { name: 'supertonic.gguf', minSize: 100_000_000 } +] + +const SUPERTONIC_MTL_GGUFS = [ + { name: 'supertonic2.gguf', minSize: 100_000_000 } +] + +/** Directories searched on Android (in order) when the caller-supplied + * `targetDir` doesn't already have both GGUFs. All of these are + * `adb push`-friendly locations on a standard (non-rooted) device. 
*/ +const ANDROID_CANDIDATE_DIRS = [ + '/sdcard/qvac-tts-ggml/models', + '/storage/emulated/0/qvac-tts-ggml/models', + '/data/local/tmp/qvac-tts-ggml/models' +] + +/** Optional `TTS_GGML_LOCAL_MODELS_DIR` env override + a desktop dev + * fallback that points at chatterbox.cpp's converter output dir. + * Both are appended to the candidate list AFTER the caller-supplied + * `targetDir` so production runs remain deterministic. */ +function desktopFallbackDirs () { + const out = [] + const env = (process && process.env) ? process.env.TTS_GGML_LOCAL_MODELS_DIR : null + if (env) out.push(env) + out.push('./models') + out.push('../../../chatterbox.cpp/models') + return out +} + +/** Returns true if `dir` contains every file in `ggufs` at the expected size. */ +function hasAllGgufsIn (dir, ggufs) { + for (const f of ggufs) { + const p = path.join(dir, f.name) + if (!fs.existsSync(p)) return false + try { + const stats = fs.statSync(p) + if (stats.size < f.minSize) return false + } catch (e) { + return false + } + } + return true +} + +function hasAllGgufs (dir) { + return hasAllGgufsIn(dir, CHATTERBOX_GGUFS) +} + +/** + * Ensure the Chatterbox GGUFs are present under a directory the native + * addon can read, and return the directory that won. + * + * The GGUFs aren't published to a canonical HuggingFace repo yet (the + * teammate will pick the home when qvac-tts.cpp stabilises), so this + * helper is **check-only** — it doesn't download anything. On Android it + * additionally scans a handful of `adb push`-friendly paths because the + * mobile test harness's `global.testDir` (the app's internal files dir) + * isn't writable by `adb push` on stock Android without `run-as`. 
+ * + * Dev flow on Android: + * + * adb push models/chatterbox-t3-turbo.gguf /sdcard/qvac-tts-ggml/models/ + * adb push models/chatterbox-s3gen.gguf /sdcard/qvac-tts-ggml/models/ + * + * TODO: once the GGUFs land on a known HuggingFace repo, wire up the + * download URLs here and switch the default to "fetch from HF". + */ +async function ensureChatterboxModels (options = {}) { + const requestedDir = options.targetDir || path.join(getBaseDir(), 'models') + console.log(`Ensuring Chatterbox GGUFs (requested dir: ${requestedDir})...`) + + const candidateDirs = [requestedDir] + if (isMobile && platform === 'android') { + for (const d of ANDROID_CANDIDATE_DIRS) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } else { + for (const d of desktopFallbackDirs()) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } + + let resolvedDir = null + for (const dir of candidateDirs) { + if (hasAllGgufs(dir)) { + resolvedDir = dir + break + } + } + + if (resolvedDir) { + console.log(` ✓ using Chatterbox GGUFs at ${resolvedDir}`) + const results = {} + for (const f of CHATTERBOX_GGUFS) { + results[f.name] = { success: true, path: path.join(resolvedDir, f.name), cached: true } + } + return { success: true, results, targetDir: resolvedDir } + } + + try { + if (!fs.existsSync(requestedDir)) fs.mkdirSync(requestedDir, { recursive: true }) + } catch (e) { /* ignore — informational dir only */ } + + const results = {} + for (const f of CHATTERBOX_GGUFS) { + const p = path.join(requestedDir, f.name) + const exists = fs.existsSync(p) + const size = exists ? fs.statSync(p).size : 0 + console.log(` ✗ ${f.name} ${exists ? `too small (${size} bytes, expected ≥ ${f.minSize})` : `missing at ${p}`}`) + results[f.name] = { success: false, path: p } + } + console.log('') + if (isMobile && platform === 'android') { + console.log('Chatterbox GGUFs not found. 
On Android, `adb push` them to one of:') + for (const d of ANDROID_CANDIDATE_DIRS) console.log(` ${d}`) + console.log('(or copy into the app-internal dir that testDir maps to).') + } else { + console.log('Chatterbox GGUFs are not published on HuggingFace yet. Generate them') + console.log('locally from the upstream tts-cpp conversion scripts:') + console.log('') + console.log(' git clone git@github.com:tetherto/qvac-ext-lib-whisper.cpp.git') + console.log(' cd qvac-ext-lib-whisper.cpp/tts-cpp') + console.log(' python -m venv .venv && . .venv/bin/activate') + console.log(' pip install torch numpy gguf safetensors scipy librosa resampy') + console.log(' python scripts/convert-t3-turbo-to-gguf.py --out chatterbox-t3-turbo.gguf') + console.log(' python scripts/convert-s3gen-to-gguf.py --out chatterbox-s3gen.gguf') + console.log('') + console.log(`Then copy both .gguf files into ${requestedDir}.`) + } + + return { success: false, results, targetDir: requestedDir } +} + +async function ensureChatterboxMtlModels (options = {}) { + const requestedDir = options.targetDir || path.join(getBaseDir(), 'models') + console.log(`Ensuring Chatterbox MTL GGUFs (requested dir: ${requestedDir})...`) + + const candidateDirs = [requestedDir] + if (isMobile && platform === 'android') { + for (const d of ANDROID_CANDIDATE_DIRS) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } else { + for (const d of desktopFallbackDirs()) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } + + let resolvedDir = null + for (const dir of candidateDirs) { + if (hasAllGgufsIn(dir, CHATTERBOX_MTL_GGUFS)) { + resolvedDir = dir + break + } + } + + if (resolvedDir) { + console.log(` ✓ using Chatterbox MTL GGUFs at ${resolvedDir}`) + const results = {} + for (const f of CHATTERBOX_MTL_GGUFS) { + results[f.name] = { success: true, path: path.join(resolvedDir, f.name), cached: true } + } + return { success: true, results, targetDir: resolvedDir } + } + + console.log(' Chatterbox MTL 
GGUFs not found. Convert with:') + console.log(' python scripts/convert-t3-mtl-to-gguf.py --out chatterbox-t3-mtl.gguf') + console.log(' python scripts/convert-s3gen-to-gguf.py --variant mtl --out chatterbox-s3gen-mtl.gguf') + console.log(` and place under one of: ${candidateDirs.join(', ')}`) + return { success: false, results: {}, targetDir: requestedDir } +} + +async function ensureSupertonicModel (options = {}) { + const requestedDir = options.targetDir || path.join(getBaseDir(), 'models') + console.log(`Ensuring Supertonic GGUF (requested dir: ${requestedDir})...`) + + const candidateDirs = [requestedDir] + if (isMobile && platform === 'android') { + for (const d of ANDROID_CANDIDATE_DIRS) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } else { + for (const d of desktopFallbackDirs()) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } + + let resolvedDir = null + for (const dir of candidateDirs) { + if (hasAllGgufsIn(dir, SUPERTONIC_GGUFS)) { + resolvedDir = dir + break + } + } + + if (resolvedDir) { + console.log(` ✓ using Supertonic GGUF at ${resolvedDir}`) + return { + success: true, + path: path.join(resolvedDir, 'supertonic.gguf'), + targetDir: resolvedDir + } + } + + console.log(' Supertonic GGUF not found. 
Convert with:') + console.log(' python scripts/convert-supertonic2-to-gguf.py --arch supertonic --out supertonic.gguf') + console.log(` and place under one of: ${candidateDirs.join(', ')}`) + return { success: false, path: null, targetDir: requestedDir } +} + +async function ensureSupertonicMtlModel (options = {}) { + const requestedDir = options.targetDir || path.join(getBaseDir(), 'models') + console.log(`Ensuring Supertonic MTL GGUF (requested dir: ${requestedDir})...`) + + const candidateDirs = [requestedDir] + if (isMobile && platform === 'android') { + for (const d of ANDROID_CANDIDATE_DIRS) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } else { + for (const d of desktopFallbackDirs()) { + if (!candidateDirs.includes(d)) candidateDirs.push(d) + } + } + + let resolvedDir = null + for (const dir of candidateDirs) { + if (hasAllGgufsIn(dir, SUPERTONIC_MTL_GGUFS)) { + resolvedDir = dir + break + } + } + + if (resolvedDir) { + console.log(` ✓ using Supertonic MTL GGUF at ${resolvedDir}`) + return { + success: true, + path: path.join(resolvedDir, 'supertonic2.gguf'), + targetDir: resolvedDir + } + } + + console.log(' Supertonic MTL GGUF not found. 
Convert with:') + console.log(' python scripts/convert-supertonic2-to-gguf.py --arch supertonic2 --out supertonic2.gguf') + console.log(` and place under one of: ${candidateDirs.join(', ')}`) + return { success: false, path: null, targetDir: requestedDir } +} + +module.exports = { + ensureFileDownloaded, + ensureWhisperModel, + ensureChatterboxModels, + ensureChatterboxMtlModels, + ensureSupertonicModel, + ensureSupertonicMtlModel +} diff --git a/packages/tts-ggml/test/utils/loader.fake.js b/packages/tts-ggml/test/utils/loader.fake.js new file mode 100644 index 0000000000..45bd712cf9 --- /dev/null +++ b/packages/tts-ggml/test/utils/loader.fake.js @@ -0,0 +1,45 @@ +'use strict' + +const Base = require('@qvac/dl-base') +const path = require('bare-path') +const { Readable } = require('bare-stream') + +// Fake files available via the loader. +const files = { + 'ggml-tiny.bin': Buffer.from('binary file ggml-tiny.bin') +} + +class FakeDL extends Base { + async start () { } + + async stop () { } + + async list (path) { + return Object.keys(files) + } + + async getStream (filepath) { + const name = path.basename(filepath) + return Readable.from(Buffer.from(files[name])) + } + + async download (filepath, destPath) { + const name = path.basename(filepath) + const content = files[name] + if (!content) { + throw new Error(`File ${filepath} not found`) + } + + // Simulate downloading by returning a response object with await method + return { + await: async () => ({ + success: true, + filepath, + destPath, + size: content.length + }) + } + } +} + +module.exports = FakeDL diff --git a/packages/tts-ggml/test/utils/pcmConcatenator.js b/packages/tts-ggml/test/utils/pcmConcatenator.js new file mode 100644 index 0000000000..2d44c2594e --- /dev/null +++ b/packages/tts-ggml/test/utils/pcmConcatenator.js @@ -0,0 +1,84 @@ +'use strict' + +const DEFAULT_CROSSFADE_SAMPLES = 720 +const SILENCE_GAP_SAMPLES = 2400 + +function applyCrossfade (prevChunk, nextChunk, crossfadeSamples) { + if 
(crossfadeSamples <= 0) return { prev: prevChunk, next: nextChunk } + if (prevChunk.length < crossfadeSamples || nextChunk.length < crossfadeSamples) { + return { prev: prevChunk, next: nextChunk } + } + + const prevCopy = prevChunk.slice() + const nextCopy = nextChunk.slice() + const fadeStart = prevCopy.length - crossfadeSamples + + for (let i = 0; i < crossfadeSamples; i++) { + const fadeOut = 1.0 - (i / crossfadeSamples) + const fadeIn = i / crossfadeSamples + prevCopy[fadeStart + i] = Math.round(prevCopy[fadeStart + i] * fadeOut) + nextCopy[i] = Math.round(nextCopy[i] * fadeIn) + } + + return { prev: prevCopy, next: nextCopy } +} + +function createSilenceGap (samples) { + return new Int16Array(samples) +} + +function concatenatePcmChunks (chunks, options = {}) { + if (chunks.length === 0) return new Int16Array(0) + if (chunks.length === 1) return toInt16Array(chunks[0]) + + const crossfadeSamples = options.crossfadeSamples ?? DEFAULT_CROSSFADE_SAMPLES + const silenceGapSamples = options.silenceGapSamples ?? 
SILENCE_GAP_SAMPLES + + const parts = [] + let previous = toInt16Array(chunks[0]) + + for (let i = 1; i < chunks.length; i++) { + let current = toInt16Array(chunks[i]) + + if (crossfadeSamples > 0) { + const result = applyCrossfade(previous, current, crossfadeSamples) + previous = result.prev + current = result.next + } + + parts.push(previous) + + if (silenceGapSamples > 0) { + parts.push(createSilenceGap(silenceGapSamples)) + } + + previous = current + } + + parts.push(previous) + + return mergeInt16Arrays(parts) +} + +function toInt16Array (arr) { + if (arr instanceof Int16Array) return arr + return Int16Array.from(arr) +} + +function mergeInt16Arrays (arrays) { + let totalLength = 0 + for (const arr of arrays) { + totalLength += arr.length + } + + const result = new Int16Array(totalLength) + let offset = 0 + for (const arr of arrays) { + result.set(arr, offset) + offset += arr.length + } + + return result +} + +module.exports = { concatenatePcmChunks } diff --git a/packages/tts-ggml/test/utils/runChatterboxTTS.js b/packages/tts-ggml/test/utils/runChatterboxTTS.js new file mode 100644 index 0000000000..74157967e2 --- /dev/null +++ b/packages/tts-ggml/test/utils/runChatterboxTTS.js @@ -0,0 +1,221 @@ +'use strict' + +const fs = require('bare-fs') +const path = require('bare-path') +const proc = require('bare-process') +const TTSGgml = require('@qvac/tts-ggml') +const { getBaseDir, isMobile, runTTS, runTTSWithSplit } = require('./runTTS') +const { concatenatePcmChunks } = require('./pcmConcatenator') +const { createWavBuffer } = require('./wav-helper') + +const CHATTERBOX_SAMPLE_RATE = 24000 + +/** + * Resolve the reference-audio WAV path. Precedence: + * 1. params.refWavPath + * 2. On mobile, a bundled test asset under global.assetPaths + * 3. 
Fallback to test/reference-audio/jfk.wav + * + * Unlike the ONNX backend, we pass the path as-is to the native addon + * (which forwards to qvac-tts-cli's --reference-audio), so no decode / + * resample is needed on the JS side. + */ +function resolveRefWavPath (params) { + if (params.refWavPath) return params.refWavPath + if (isMobile && global.assetPaths) { + const assetKey = '../../testAssets/jfk.wav' + if (global.assetPaths[assetKey]) { + return global.assetPaths[assetKey].replace('file://', '') + } + } + return path.join(__dirname, '..', 'reference-audio', 'jfk.wav') +} + +async function loadChatterboxTTS (params = {}) { + const baseDir = getBaseDir() + const defaultModelDir = path.resolve(path.join(baseDir, 'models')) + const modelDir = params.modelDir || defaultModelDir + + const t3ModelPath = params.t3ModelPath || path.join(modelDir, 'chatterbox-t3-turbo.gguf') + const s3genModelPath = params.s3genModelPath || path.join(modelDir, 'chatterbox-s3gen.gguf') + + const refWavPath = resolveRefWavPath(params) + if (!fs.existsSync(refWavPath)) { + throw new Error(`[Chatterbox] reference audio not found at ${refWavPath}`) + } + console.log(`[Chatterbox] using reference audio: ${refWavPath}`) + + const config = { language: params.language || 'en' } + if (params.useGPU !== undefined) { + config.useGPU = params.useGPU + } else if (proc.env && proc.env.NO_GPU === 'true') { + // Honour the workflow matrix's `no_gpu: 'true'` flag (which sets the + // NO_GPU env var on the job). Without this the addon's + // index.js::_validateConfig defaults Chatterbox to `useGPU = true`, + // which on runners without a Vulkan-capable driver (e.g. windows-2022, + // ubuntu-22.04 without a discrete GPU) crashes during the addon's + // ggml_backend_vk_init probe with `vk::createInstance: + // ErrorIncompatibleDriver`. Forcing CPU here keeps the no-GPU + // matrix entries on the CPU code path that load_model_gguf actually + // exercises with n_gpu_layers=0. 
+ config.useGPU = false + } + + const model = new TTSGgml({ + files: { + modelDir, + t3Model: t3ModelPath, + s3genModel: s3genModelPath + }, + referenceAudio: refWavPath, + voiceDir: params.voiceDir, + seed: params.seed, + threads: params.threads, + nGpuLayers: params.nGpuLayers, + config, + opts: { stats: true } + }) + await model.load() + + return model +} + +async function runChatterboxTTS (model, params, expectation = {}) { + return runTTS(model, params, expectation, { + sampleRate: CHATTERBOX_SAMPLE_RATE, + engineTag: 'Chatterbox' + }) +} + +async function runChatterboxTTSWithSplit (model, params, expectation = {}) { + return runTTSWithSplit(model, params, expectation, { + sampleRate: CHATTERBOX_SAMPLE_RATE, + engineTag: 'Chatterbox' + }) +} + +function checkExpectations (sampleCount, durationMs, expectation) { + if (expectation.minSamples !== undefined && sampleCount < expectation.minSamples) return false + if (expectation.maxSamples !== undefined && sampleCount > expectation.maxSamples) return false + if (expectation.minDurationMs !== undefined && durationMs < expectation.minDurationMs) return false + if (expectation.maxDurationMs !== undefined && durationMs > expectation.maxDurationMs) return false + return true +} + +function saveWavIfNeeded (params, wavBuffer, tag) { + if (params.saveWav !== true) return + if (isMobile && !params.wavOutputPath) { + console.log(`${tag}Skipping WAV save on mobile (no writable path provided)`) + return + } + const defaultWavPath = path.join(__dirname, '../output/chatterbox-stream.wav') + const wavPath = params.wavOutputPath || defaultWavPath + const outputDir = path.dirname(wavPath) + try { fs.mkdirSync(outputDir, { recursive: true }) } catch (err) {} + fs.writeFileSync(wavPath, wavBuffer) + console.log(`${tag}Saved WAV to: ${wavPath}`) +} + +/** + * Run `model.runStreaming()` over an async iterator of `phrases` and + * collect PCM per chunk. 
Mirrors `runSupertonicStreaming` in + * @qvac/tts-onnx so downstream test shape stays consistent. + */ +async function runChatterboxStreaming (model, params, expectation = {}) { + const sampleRate = CHATTERBOX_SAMPLE_RATE + const tag = '[Chatterbox] ' + + if (!model) { + return { output: `${tag}Error: Missing required parameter: model`, passed: false } + } + const phrases = params && Array.isArray(params.phrases) ? params.phrases : null + if (!phrases || phrases.length === 0) { + return { + output: `${tag}Error: Missing required parameter: phrases (non-empty string array)`, + passed: false + } + } + + try { + async function * textStream () { + for (let i = 0; i < phrases.length; i++) { + yield phrases[i] + } + } + + const streamingOptions = + params.streamingOptions && typeof params.streamingOptions === 'object' + ? params.streamingOptions + : undefined + const response = streamingOptions + ? await model.runStreaming(textStream(), streamingOptions) + : await model.runStreaming(textStream()) + + const pcmByChunk = new Map() + const textByChunk = new Map() + let jobStats = null + + response.onUpdate(data => { + if (data && data.outputArray != null && data.chunkIndex !== undefined) { + pcmByChunk.set(data.chunkIndex, Int16Array.from(data.outputArray)) + if (typeof data.sentenceChunk === 'string') { + textByChunk.set(data.chunkIndex, data.sentenceChunk) + } + } + if (data && data.event === 'JobEnded') { + jobStats = data + } + }) + + await response.await() + + const indices = [...pcmByChunk.keys()].sort((a, b) => a - b) + const pcmChunks = indices.map(i => pcmByChunk.get(i)) + const sentenceChunks = indices.map(i => textByChunk.get(i) || '') + const combined = concatenatePcmChunks(pcmChunks, { + crossfadeSamples: 0, + silenceGapSamples: 0 + }) + const sampleCount = combined.length + const durationMs = + response.stats?.audioDurationMs || + jobStats?.audioDurationMs || + (sampleCount / (sampleRate / 1000)) + + const passed = checkExpectations(sampleCount, durationMs, 
expectation) + const wavBuffer = createWavBuffer(Array.from(combined), sampleRate) + saveWavIfNeeded(params, wavBuffer, tag) + + const stats = response.stats || jobStats + const output = `${tag}Streamed ${indices.length} chunk(s), ${sampleCount} samples (duration: ${durationMs.toFixed(0)}ms, RTF: ${stats?.realTimeFactor?.toFixed(4) || 'N/A'})` + + return { + output, + passed, + data: { + samples: Array.from(combined), + sampleCount, + durationMs, + sampleRate, + reportedSampleRate: sampleRate, + wavBuffer, + streamChunkCount: indices.length, + sentenceChunks, + stats + } + } + } catch (error) { + return { + output: `${tag}Error: ${error.message}`, + passed: false, + data: { error: error.message } + } + } +} + +module.exports = { + loadChatterboxTTS, + runChatterboxTTS, + runChatterboxTTSWithSplit, + runChatterboxStreaming +} diff --git a/packages/tts-ggml/test/utils/runSupertonicTTS.js b/packages/tts-ggml/test/utils/runSupertonicTTS.js new file mode 100644 index 0000000000..d6c24d10b0 --- /dev/null +++ b/packages/tts-ggml/test/utils/runSupertonicTTS.js @@ -0,0 +1,111 @@ +'use strict' + +const fs = require('bare-fs') +const path = require('bare-path') +const proc = require('bare-process') +const TTSGgml = require('@qvac/tts-ggml') +const { getBaseDir, isMobile } = require('./runTTS') +const { createWavBuffer } = require('./wav-helper') + +const SUPERTONIC_SAMPLE_RATE = 44100 + +async function loadSupertonicTTS (params = {}) { + const baseDir = getBaseDir() + const defaultModelDir = path.resolve(path.join(baseDir, 'models')) + + const supertonicPath = + params.supertonicModelPath || path.join(defaultModelDir, 'supertonic.gguf') + + const config = { language: params.language || 'en' } + if (params.useGPU !== undefined) { + config.useGPU = params.useGPU + } else if (proc.env && proc.env.NO_GPU === 'true') { + config.useGPU = false + } + + const model = new TTSGgml({ + engine: TTSGgml.ENGINE_SUPERTONIC, + files: { supertonicModel: supertonicPath }, + voice: params.voice 
|| 'F1', + steps: params.steps, + speed: params.speed, + seed: params.seed, + threads: params.threads, + nGpuLayers: params.nGpuLayers, + config, + opts: { stats: true } + }) + await model.load() + return model +} + +async function runSupertonicTTS (model, params = {}, expectation = {}) { + const tag = '[Supertonic] ' + const sampleRate = SUPERTONIC_SAMPLE_RATE + + if (!model) { + return { output: `${tag}Error: Missing required parameter: model`, passed: false } + } + if (!params || typeof params.text !== 'string') { + return { output: `${tag}Error: Missing required parameter: text`, passed: false } + } + + try { + let outputArray = [] + let reportedSampleRate = null + const response = await model.run({ input: params.text, type: 'text' }) + + await response + .onUpdate(data => { + if (data && data.outputArray) { + outputArray = outputArray.concat(Array.from(data.outputArray)) + } + if (data && data.sampleRate) reportedSampleRate = data.sampleRate + }) + .await() + + const sampleCount = outputArray.length + const stats = response.stats || null + const durationMs = stats?.audioDurationMs || (sampleCount / (sampleRate / 1000)) + + let passed = true + if (expectation.minSamples !== undefined && sampleCount < expectation.minSamples) passed = false + if (expectation.maxSamples !== undefined && sampleCount > expectation.maxSamples) passed = false + if (expectation.minDurationMs !== undefined && durationMs < expectation.minDurationMs) passed = false + if (expectation.maxDurationMs !== undefined && durationMs > expectation.maxDurationMs) passed = false + + const wavBuffer = createWavBuffer(outputArray, sampleRate) + + if (params.saveWav === true) { + const wavPath = params.wavOutputPath || path.join(__dirname, '../output/supertonic.wav') + try { fs.mkdirSync(path.dirname(wavPath), { recursive: true }) } catch (_e) {} + if (!isMobile || params.wavOutputPath) { + fs.writeFileSync(wavPath, wavBuffer) + } + } + + const output = `${tag}Synthesized ${sampleCount} samples 
(duration: ${durationMs.toFixed(0)}ms, RTF: ${stats?.realTimeFactor?.toFixed(4) || 'N/A'})` + + return { + output, + passed, + data: { + samples: outputArray, + sampleCount, + durationMs, + sampleRate, + reportedSampleRate, + wavBuffer, + stats + } + } + } catch (error) { + return { output: `${tag}Error: ${error.message}`, passed: false, data: { error: error.message } } + } +} + +module.exports = { + loadSupertonicTTS, + runSupertonicTTS, + SUPERTONIC_SAMPLE_RATE +} diff --git a/packages/tts-ggml/test/utils/runTTS.js b/packages/tts-ggml/test/utils/runTTS.js new file mode 100644 index 0000000000..b865447eb0 --- /dev/null +++ b/packages/tts-ggml/test/utils/runTTS.js @@ -0,0 +1,269 @@ +'use strict' + +const path = require('bare-path') +const fs = require('bare-fs') +const os = require('bare-os') +const { createWavBuffer } = require('./wav-helper') +const { splitTtsText } = require('@qvac/tts-ggml/text-chunker') +const { concatenatePcmChunks } = require('./pcmConcatenator') + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' +} + +async function synthesizeChunk (model, text, tag) { + let outputArray = [] + let jobStats = null + let reportedSampleRate = null + + const response = await model.run({ + input: text, + type: 'text' + }) + + await response + .onUpdate(data => { + if (data && data.outputArray) { + const temp = Array.from(data.outputArray) + outputArray = outputArray.concat(temp) + } + if (data && data.sampleRate) { + reportedSampleRate = data.sampleRate + } + if (data.event === 'JobEnded') { + jobStats = data + } + }) + .await() + + return { outputArray, reportedSampleRate, jobStats, stats: response.stats || jobStats } +} + +async function runTTSWithSplit (model, params, expectation = {}, options = {}) { + const sampleRate = options.sampleRate || 24000 + const engineTag = options.engineTag || '' + const tag = engineTag ? 
`[${engineTag}] ` : '' + + if (!model) { + return { output: `${tag}Error: Missing required parameter: model`, passed: false } + } + if (!params || !params.text) { + return { output: `${tag}Error: Missing required parameter: text`, passed: false } + } + + try { + const splitLanguage = + typeof params.splitLanguage === 'string' && params.splitLanguage.length > 0 + ? params.splitLanguage + : model?._config?.language || 'en' + const splitOpts = { + language: splitLanguage, + mergeToMaxScalars: false, + ...(typeof params.splitLocale === 'string' && params.splitLocale.length > 0 + ? { locale: params.splitLocale } + : {}) + } + const chunks = splitTtsText(params.text, splitOpts) + console.log(`${tag}Split text into ${chunks.length} chunk(s)`) + + const pcmChunks = [] + let totalTime = 0 + let totalSamples = 0 + let lastReportedSampleRate = null + + for (let i = 0; i < chunks.length; i++) { + const chunkText = chunks[i] + console.log(`${tag} Chunk ${i + 1}/${chunks.length}: "${chunkText.substring(0, 60)}${chunkText.length > 60 ? '...' : ''}"`) + + const result = await synthesizeChunk(model, chunkText, tag) + pcmChunks.push(Int16Array.from(result.outputArray)) + + if (result.stats?.totalTime) totalTime += result.stats.totalTime + totalSamples += result.outputArray.length + if (result.reportedSampleRate) lastReportedSampleRate = result.reportedSampleRate + + console.log(`${tag} -> ${result.outputArray.length} samples`) + } + + const combined = concatenatePcmChunks(pcmChunks) + const sampleCount = combined.length + const durationMs = (sampleCount / sampleRate) * 1000 + + const passed = checkExpectations(sampleCount, durationMs, expectation) + const wavBuffer = createWavBuffer(Array.from(combined), sampleRate) + + saveWavIfNeeded(params, wavBuffer, tag) + + const output = `${tag}Synthesized ${sampleCount} samples (${chunks.length} chunks, duration: ${durationMs.toFixed(0)}ms) from text: "${params.text.substring(0, 50)}${params.text.length > 50 ? '...' 
: ''}"` + + return { + output, + passed, + data: { + samples: Array.from(combined), + sampleCount, + durationMs, + sampleRate, + reportedSampleRate: lastReportedSampleRate, + wavBuffer, + stats: { totalTime, totalSamples, audioDurationMs: durationMs } + } + } + } catch (error) { + return { output: `${tag}Error: ${error.message}`, passed: false, data: { error: error.message } } + } +} + +function checkExpectations (sampleCount, durationMs, expectation) { + if (expectation.minSamples !== undefined && sampleCount < expectation.minSamples) return false + if (expectation.maxSamples !== undefined && sampleCount > expectation.maxSamples) return false + if (expectation.minDurationMs !== undefined && durationMs < expectation.minDurationMs) return false + if (expectation.maxDurationMs !== undefined && durationMs > expectation.maxDurationMs) return false + return true +} + +function saveWavIfNeeded (params, wavBuffer, tag) { + if (params.saveWav !== true) return + if (isMobile && !params.wavOutputPath) { + console.log(`${tag}Skipping WAV save on mobile (no writable path provided)`) + return + } + const defaultWavPath = path.join(__dirname, '../output/test.wav') + const wavPath = params.wavOutputPath || defaultWavPath + const outputDir = path.dirname(wavPath) + try { fs.mkdirSync(outputDir, { recursive: true }) } catch (err) {} + fs.writeFileSync(wavPath, wavBuffer) + console.log(`${tag}Saved WAV to: ${wavPath}`) +} + +async function runTTS (model, params, expectation = {}, options = {}) { + const sampleRate = options.sampleRate || 24000 + const engineTag = options.engineTag || '' + const tag = engineTag ? 
`[${engineTag}] ` : '' + + if (!model) { + return { + output: `${tag}Error: Missing required parameter: model`, + passed: false + } + } + + if (!params || !params.text) { + return { + output: `${tag}Error: Missing required parameter: text`, + passed: false + } + } + + try { + let outputArray = [] + let jobStats = null + let reportedSampleRate = null + const response = await model.run({ + input: params.text, + type: 'text' + }) + + await response + .onUpdate(data => { + if (data && data.outputArray) { + const temp = Array.from(data.outputArray) + outputArray = outputArray.concat(temp) + } + if (data && data.sampleRate) { + reportedSampleRate = data.sampleRate + } + if (data.event === 'JobEnded') { + jobStats = data + } + }) + .await() + + let passed = true + const sampleCount = outputArray.length + const durationMs = response.stats?.audioDurationMs || jobStats?.audioDurationMs || (sampleCount / (sampleRate / 1000)) + + if (expectation.minSamples !== undefined && sampleCount < expectation.minSamples) { + passed = false + } + if (expectation.maxSamples !== undefined && sampleCount > expectation.maxSamples) { + passed = false + } + if (expectation.minDurationMs !== undefined && durationMs < expectation.minDurationMs) { + passed = false + } + if (expectation.maxDurationMs !== undefined && durationMs > expectation.maxDurationMs) { + passed = false + } + + const wavBuffer = createWavBuffer(outputArray, sampleRate) + + if (params.saveWav === true) { + if (isMobile && !params.wavOutputPath) { + console.log(`${tag}Skipping WAV save on mobile (no writable path provided)`) + } else { + const defaultWavPath = path.join(__dirname, '../output/test.wav') + const wavPath = params.wavOutputPath || defaultWavPath + + const outputDir = path.dirname(wavPath) + try { + fs.mkdirSync(outputDir, { recursive: true }) + } catch (err) {} + + fs.writeFileSync(wavPath, wavBuffer) + console.log(`${tag}Saved WAV to: ${wavPath}`) + } + } + + const stats = response.stats || jobStats + + const 
roundedStats = stats + ? { + totalTime: stats.totalTime ? Number(stats.totalTime.toFixed(4)) : stats.totalTime, + tokensPerSecond: stats.tokensPerSecond ? Number(stats.tokensPerSecond.toFixed(2)) : stats.tokensPerSecond, + realTimeFactor: stats.realTimeFactor ? Number(stats.realTimeFactor.toFixed(5)) : stats.realTimeFactor, + audioDurationMs: stats.audioDurationMs, + totalSamples: stats.totalSamples, + backendDevice: stats.backendDevice, + backendId: stats.backendId + } + : null + + const statsInfo = stats + ? `duration: ${durationMs.toFixed(0)}ms, RTF: ${stats.realTimeFactor?.toFixed(4) || 'N/A'}` + : `duration: ${durationMs.toFixed(0)}ms (calculated)` + const output = `${tag}Synthesized ${sampleCount} samples (${statsInfo}) from text: "${params.text.substring(0, 50)}${params.text.length > 50 ? '...' : ''}"` + + return { + output, + passed, + data: { + samples: outputArray, + sampleCount, + durationMs, + sampleRate, + reportedSampleRate, + wavBuffer, + stats: roundedStats + } + } + } catch (error) { + return { + output: `${tag}Error: ${error.message}`, + passed: false, + data: { error: error.message } + } + } +} + +module.exports = { + getBaseDir, + isMobile, + runTTS, + runTTSWithSplit, + checkExpectations, + saveWavIfNeeded +} diff --git a/packages/tts-ggml/test/utils/runWhisper.js b/packages/tts-ggml/test/utils/runWhisper.js new file mode 100644 index 0000000000..c73b1ba260 --- /dev/null +++ b/packages/tts-ggml/test/utils/runWhisper.js @@ -0,0 +1,181 @@ +const TranscriptionWhispercpp = require('@qvac/transcription-whispercpp') +const { Readable } = require('bare-stream') +const path = require('bare-path') +const os = require('bare-os') +const FakeDL = require('./loader.fake') + +const WHISPER_SAMPLE_RATE = 16000 + +const platform = os.platform() +const isMobile = platform === 'ios' || platform === 'android' + +function getBaseDir () { + return isMobile && global.testDir ? global.testDir : '.' 
+} + +async function loadWhisper (params = {}) { + const defaultPath = path.join(getBaseDir(), 'models', 'whisper') + const modelName = params.modelName || 'ggml-tiny.bin' + const diskPath = params.diskPath || defaultPath + console.log('>>> [WHISPER] Loading model from:', diskPath) + + const hdDL = new FakeDL({}) + + const constructorArgs = { + loader: hdDL, + modelName, + diskPath + } + const config = { + opts: { stats: true }, + whisperConfig: { + audio_format: 's16le', + language: params.language || 'en', + translate: false, + temperature: 0.0 + } + } + + const whisperModel = new TranscriptionWhispercpp(constructorArgs, config) + await whisperModel._load() + console.log('>>> [WHISPER] Model loaded') + + return whisperModel +} + +function extractWavPcm (wavBuf) { + if (wavBuf.length < 44) return { raw: wavBuf, sampleRate: WHISPER_SAMPLE_RATE } + const isRiff = wavBuf[0] === 0x52 && wavBuf[1] === 0x49 && + wavBuf[2] === 0x46 && wavBuf[3] === 0x46 + if (!isRiff) return { raw: wavBuf, sampleRate: WHISPER_SAMPLE_RATE } + + const sampleRate = wavBuf[24] | (wavBuf[25] << 8) | + (wavBuf[26] << 16) | (wavBuf[27] << 24) + + let dataOffset = 12 + while (dataOffset + 8 <= wavBuf.length) { + const id = String.fromCharCode( + wavBuf[dataOffset], wavBuf[dataOffset + 1], + wavBuf[dataOffset + 2], wavBuf[dataOffset + 3] + ) + const chunkSize = wavBuf[dataOffset + 4] | (wavBuf[dataOffset + 5] << 8) | + (wavBuf[dataOffset + 6] << 16) | (wavBuf[dataOffset + 7] << 24) + if (id === 'data') { + const start = dataOffset + 8 + const end = Math.min(start + chunkSize, wavBuf.length) + return { raw: wavBuf.slice(start, end), sampleRate } + } + dataOffset += 8 + chunkSize + if (chunkSize % 2 === 1 && dataOffset < wavBuf.length) dataOffset += 1 + } + + return { raw: wavBuf.slice(44), sampleRate } +} + +function resampleS16le (pcmBuf, fromRate, toRate) { + if (fromRate === toRate) return pcmBuf + + const numSamples = Math.floor(pcmBuf.length / 2) + const ratio = fromRate / toRate + const 
outLen = Math.round(numSamples / ratio) + const out = Buffer.alloc(outLen * 2) + + for (let i = 0; i < outLen; i++) { + const srcIdx = i * ratio + const lo = Math.floor(srcIdx) + const hi = Math.min(lo + 1, numSamples - 1) + const frac = srcIdx - lo + const sLo = pcmBuf.readInt16LE(lo * 2) + const sHi = pcmBuf.readInt16LE(hi * 2) + const val = Math.round(sLo * (1 - frac) + sHi * frac) + out.writeInt16LE(Math.max(-32768, Math.min(32767, val)), i * 2) + } + + return out +} + +async function runWhisper (model, text, wavBuffer) { + const buf = Buffer.from(wavBuffer) + const { raw, sampleRate } = extractWavPcm(buf) + const pcm16k = resampleS16le(Buffer.from(raw), sampleRate, WHISPER_SAMPLE_RATE) + + console.log(`>>> [WHISPER] Audio: ${sampleRate}Hz -> ${WHISPER_SAMPLE_RATE}Hz, ${pcm16k.length / 2} samples`) + + const audioStream = Readable.from([pcm16k]) + const response = await model.run(audioStream) + let fullText = '' + let retryCount = 0 + + while (retryCount < 3) { + try { + fullText = await _processResponse(response) + if (fullText.length > 0) { + break + } + } catch (error) { + console.error('>>> [WHISPER] Error:', error) + retryCount++ + } + } + console.log(`>>> [WHISPER] Full text: ${fullText}`) + const wer = wordErrorRate(text, fullText) + return { wer } +} + +async function _processResponse (response) { + let fullText = '' + await response.onUpdate((output) => { + if (Array.isArray(output)) { + for (const item of output) { + if (item.text) { + fullText += item.text + } + } + } + }).await() + return fullText +} + +function wordErrorRate (expected, actual) { + // Normalize text for comparison + const normalize = (text) => { + return text + .trim() + .toLowerCase() + // Remove punctuation (periods, commas, exclamation, question marks, etc.) 
+ .replace(/[.,!?;:"""''„«»()[\]{}]/g, '') + // Normalize apostrophes (handle French contractions like l'aube -> l aube) + .replace(/[''ʼ]/g, ' ') + // Normalize hyphens (au-dessus -> au dessus) + .replace(/[-–—]/g, ' ') + // Collapse multiple spaces into one + .replace(/\s+/g, ' ') + .trim() + .split(/\s+/) + } + + const r = normalize(expected) + const h = normalize(actual) + const d = Array(r.length + 1) + .fill(null) + .map(() => Array(h.length + 1).fill(0)) + + for (let i = 0; i <= r.length; i++) d[i][0] = i + for (let j = 0; j <= h.length; j++) d[0][j] = j + + for (let i = 1; i <= r.length; i++) { + for (let j = 1; j <= h.length; j++) { + const cost = r[i - 1] === h[j - 1] ? 0 : 1 + d[i][j] = Math.min( + d[i - 1][j] + 1, // deletion + d[i][j - 1] + 1, // insertion + d[i - 1][j - 1] + cost // substitution + ) + } + } + + const wer = Math.round((d[r.length][h.length] / r.length) * 10) / 10 + return wer +} + +module.exports = { loadWhisper, runWhisper } diff --git a/packages/tts-ggml/test/utils/wav-helper.js b/packages/tts-ggml/test/utils/wav-helper.js new file mode 100644 index 0000000000..ecd36faddc --- /dev/null +++ b/packages/tts-ggml/test/utils/wav-helper.js @@ -0,0 +1,179 @@ +const fs = require('bare-fs') + +function writeIntLE (buffer, value, offset, byteLength) { + for (let i = 0; i < byteLength; i++) { + buffer[offset + i] = value & 0xff + value >>= 8 + } +} + +function readWavAsFloat32 (wavPath) { + const buf = fs.readFileSync(wavPath) + if (buf.length < 44) throw new Error('WAV file too small') + + let arrayBuffer, byteOffset + if (buf.buffer && buf.byteOffset !== undefined) { + arrayBuffer = buf.buffer + byteOffset = buf.byteOffset + } else { + arrayBuffer = new ArrayBuffer(buf.length) + new Uint8Array(arrayBuffer).set(buf) + byteOffset = 0 + } + const view = new DataView(arrayBuffer, byteOffset, buf.length) + + const riff = String.fromCharCode(buf[0], buf[1], buf[2], buf[3]) + const wave = String.fromCharCode(buf[8], buf[9], buf[10], buf[11]) + if 
(riff !== 'RIFF') throw new Error('Not a RIFF file') + if (wave !== 'WAVE') throw new Error('Not WAVE format') + + let fmtChunk = null + let dataChunk = null + let offset = 12 + + while (offset + 8 <= buf.length) { + const chunkId = String.fromCharCode(buf[offset], buf[offset + 1], buf[offset + 2], buf[offset + 3]) + const chunkSize = view.getUint32(offset + 4, true) + + if (chunkId === 'fmt ') { + fmtChunk = { offset: offset + 8, size: chunkSize } + } else if (chunkId === 'data') { + dataChunk = { offset: offset + 8, size: chunkSize } + } + + offset += 8 + chunkSize + if (chunkSize % 2 === 1 && offset < buf.length) { + offset += 1 + } + } + + if (!fmtChunk) throw new Error('WAV missing fmt chunk') + if (!dataChunk) throw new Error('WAV missing data chunk') + + const fmtOff = fmtChunk.offset + if (fmtOff + 16 > buf.length) throw new Error('fmt chunk truncated') + + const audioFormat = view.getUint16(fmtOff, true) + const numChannels = view.getUint16(fmtOff + 2, true) + const sampleRate = view.getUint32(fmtOff + 4, true) + const bitsPerSample = view.getUint16(fmtOff + 14, true) + + if (audioFormat !== 1 && audioFormat !== 3) { + throw new Error('Unsupported WAV audio format: ' + audioFormat + ' (only PCM=1 and IEEE_FLOAT=3 supported)') + } + + const dataOff = dataChunk.offset + const dataLen = Math.min(dataChunk.size, buf.length - dataOff) + + let samples + if (audioFormat === 1 && bitsPerSample === 16) { + const bytesPerSample = 2 + const numSamples = Math.floor(dataLen / bytesPerSample) + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? 
i * 2 : i * numChannels * 2) + if (idx + 2 > buf.length) break + const s = view.getInt16(idx, true) + samples[i] = s / 32768 + } + } else if (audioFormat === 1 && bitsPerSample === 24) { + const bytesPerSample = 3 + const numSamples = Math.floor(dataLen / bytesPerSample) + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? i * 3 : i * numChannels * 3) + if (idx + 3 > buf.length) break + const lo = buf[idx] + const mid = buf[idx + 1] + const hi = buf[idx + 2] + let s = (hi << 16) | (mid << 8) | lo + if (s >= 0x800000) s -= 0x1000000 + samples[i] = s / 8388608 + } + } else if (audioFormat === 3 && bitsPerSample === 32) { + const bytesPerSample = 4 + const numSamples = Math.floor(dataLen / bytesPerSample) + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? i * 4 : i * numChannels * 4) + if (idx + 4 > buf.length) break + samples[i] = view.getFloat32(idx, true) + } + } else if (audioFormat === 1 && bitsPerSample === 8) { + const numSamples = dataLen + const numFrames = numChannels === 1 ? numSamples : Math.floor(numSamples / numChannels) + samples = new Float32Array(numFrames) + for (let i = 0; i < numFrames; i++) { + const idx = dataOff + (numChannels === 1 ? 
i : i * numChannels) + if (idx >= buf.length) break + samples[i] = (buf[idx] - 128) / 128 + } + } else { + throw new Error('Unsupported WAV format: audioFormat=' + audioFormat + ', bitsPerSample=' + bitsPerSample) + } + + return { samples, sampleRate, numChannels } +} + +function createWavBuffer (samples, sampleRate = 16000) { + const numChannels = 1 + const bytesPerSample = 2 + const blockAlign = numChannels * bytesPerSample + const byteRate = sampleRate * blockAlign + const dataSize = samples.length * bytesPerSample + const buffer = new Uint8Array(44 + dataSize) + + buffer.set([0x52, 0x49, 0x46, 0x46], 0) + writeIntLE(buffer, 36 + dataSize, 4, 4) + buffer.set([0x57, 0x41, 0x56, 0x45], 8) + + buffer.set([0x66, 0x6d, 0x74, 0x20], 12) + writeIntLE(buffer, 16, 16, 4) + writeIntLE(buffer, 1, 20, 2) + writeIntLE(buffer, numChannels, 22, 2) + writeIntLE(buffer, sampleRate, 24, 4) + writeIntLE(buffer, byteRate, 28, 4) + writeIntLE(buffer, blockAlign, 32, 2) + writeIntLE(buffer, bytesPerSample * 8, 34, 2) + + buffer.set([0x64, 0x61, 0x74, 0x61], 36) + writeIntLE(buffer, dataSize, 40, 4) + + for (let i = 0; i < samples.length; i++) { + const sample = Math.max(-32768, Math.min(32767, samples[i])) + const unsignedSample = sample < 0 ? 
sample + 65536 : sample + writeIntLE(buffer, unsignedSample, 44 + i * 2, 2) + } + + return buffer +} + +function createWav (samples, sampleRate = 16000, outputPath = 'test.wav') { + const buffer = createWavBuffer(samples, sampleRate) + fs.writeFileSync(outputPath, buffer) +} + +function resampleLinear (samples, fromRate, toRate) { + if (fromRate === toRate) return samples + const ratio = fromRate / toRate + const outputLen = Math.round(samples.length / ratio) + const output = new Float32Array(outputLen) + for (let i = 0; i < outputLen; i++) { + const srcIdx = i * ratio + const lo = Math.floor(srcIdx) + const hi = Math.min(lo + 1, samples.length - 1) + const frac = srcIdx - lo + output[i] = samples[lo] * (1 - frac) + samples[hi] * frac + } + return output +} + +module.exports = { + readWavAsFloat32, + createWav, + createWavBuffer, + resampleLinear +} diff --git a/packages/tts-ggml/tts.js b/packages/tts-ggml/tts.js new file mode 100644 index 0000000000..04b28d08ab --- /dev/null +++ b/packages/tts-ggml/tts.js @@ -0,0 +1,112 @@ +'use strict' + +const { QvacErrorAddonTTSGgml, ERR_CODES } = require('./lib/error') + +/** + * An interface between Bare addon in C++ and JS runtime. + */ +class TTSInterface { + /** + * @param {Object} binding - the native binding object + * @param {Object} configuration Optional initial configuration (engine-specific model paths, language, etc.) + * @param {Function} outputCb - To be called on inference output events + */ + constructor (binding, configuration = {}, outputCb = null) { + this._binding = binding + this._handle = this._binding.createInstance(this, configuration, outputCb) + } + + /** + * Moves addon to the LISTENING state after all the initialization is done. + * + * The C++ binding (addon_js::activate, registered in binding.cpp) + * wraps `AddonCpp::activate()` in a JsAsyncTask::run worker thread so + * the deferred GGUF parse (driven by + * `IModelAsyncLoad::waitForLoadInitialization`) does not stall the JS + * event loop. 
The native call therefore returns a JS promise; awaiting + * it here is what blocks `model.load()` until the worker finishes. + */ + async activate () { + try { + await this._binding.activate(this._handle) + } catch (err) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_ACTIVATE, + adds: err.message, + cause: err + }) + } + } + + /** + * Enqueues a new TTS job + * @param {Object} data + * @param {String} data.type + * @param {String} data.input + */ + async runJob (data) { + try { + this._binding.runJob(this._handle, data) + } catch (err) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_APPEND, + adds: err.message, + cause: err + }) + } + } + + async loadWeights (weightsData) { + try { + this._binding.loadWeights(this._handle, weightsData) + } catch (err) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_LOAD, + adds: err.message, + cause: err + }) + } + } + + async cancel () { + try { + await this._binding.cancel(this._handle) + } catch (err) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_CANCEL, + adds: err.message, + cause: err + }) + } + } + + /** + * Stops addon process and clears resources (including memory). 
+ */ + async destroyInstance () { + // Already destroyed, nothing to do + if (this._handle === null) { + return + } + + try { + const h = this._handle + this._handle = null + return this._binding.destroyInstance(h) + } catch (err) { + throw new QvacErrorAddonTTSGgml({ + code: ERR_CODES.FAILED_TO_DESTROY, + adds: err.message, + cause: err + }) + } + } + + async unload () { + return this.destroyInstance() + } +} + +module.exports = { + TTSInterface +} diff --git a/packages/tts-ggml/vcpkg-configuration.json b/packages/tts-ggml/vcpkg-configuration.json new file mode 100644 index 0000000000..2ab711b651 --- /dev/null +++ b/packages/tts-ggml/vcpkg-configuration.json @@ -0,0 +1,20 @@ +{ + "default-registry": { + "kind": "git", + "baseline": "74d2dfd03d1c2c0767bac6d892ec43a2a0e29c10", + "repository": "https://github.com/tetherto/qvac-registry-vcpkg.git" + }, + "registries": [ + { + "kind": "git", + "baseline": "16c71a39e5a0fc0bdb3fad03beef8f38ee00ee3b", + "repository": "https://github.com/microsoft/vcpkg", + "packages": [ + "gtest", + "vulkan", + "vulkan-headers", + "vulkan-loader" + ] + } + ] +} diff --git a/packages/tts-ggml/vcpkg.json b/packages/tts-ggml/vcpkg.json new file mode 100644 index 0000000000..7bc6a419de --- /dev/null +++ b/packages/tts-ggml/vcpkg.json @@ -0,0 +1,33 @@ +{ + "dependencies": [ + { + "name": "qvac-lib-inference-addon-cpp", + "version>=": "1.1.7#1" + }, + { + "name": "qvac-lint-cpp", + "version>=": "1.4.4#3" + }, + { + "name": "tts-cpp", + "version>=": "2026-05-07" + } + ], + "features": { + "tests": { + "description": "Build C++ unit tests", + "dependencies": [ + "gtest" + ] + }, + "vulkan": { + "description": "Enable Vulkan GPU acceleration", + "dependencies": [ + { + "name": "tts-cpp", + "features": ["vulkan"] + } + ] + } + } +} diff --git a/packages/tts-ggml/vcpkg/toolchains/linux-clang.cmake b/packages/tts-ggml/vcpkg/toolchains/linux-clang.cmake new file mode 100644 index 0000000000..f06d219a74 --- /dev/null +++ 
b/packages/tts-ggml/vcpkg/toolchains/linux-clang.cmake @@ -0,0 +1,4 @@ +set(CMAKE_C_COMPILER "clang") +set(CMAKE_CXX_COMPILER "clang++") + +include("$ENV{VCPKG_ROOT}/scripts/toolchains/linux.cmake") diff --git a/packages/tts-ggml/vcpkg/triplets/arm64-linux.cmake b/packages/tts-ggml/vcpkg/triplets/arm64-linux.cmake new file mode 100644 index 0000000000..77c0e6b318 --- /dev/null +++ b/packages/tts-ggml/vcpkg/triplets/arm64-linux.cmake @@ -0,0 +1,9 @@ +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) +set(VCPKG_CMAKE_SYSTEM_NAME Linux) + +set(VCPKG_CHAINLOAD_TOOLCHAIN_FILE "${CMAKE_CURRENT_LIST_DIR}/../toolchains/linux-clang.cmake") +set(VCPKG_C_FLAGS "-fPIC") +set(VCPKG_CXX_FLAGS "-fPIC -stdlib=libc++") +set(VCPKG_LINKER_FLAGS "-stdlib=libc++") diff --git a/packages/tts-ggml/vcpkg/triplets/x64-linux.cmake b/packages/tts-ggml/vcpkg/triplets/x64-linux.cmake new file mode 100644 index 0000000000..7660720b49 --- /dev/null +++ b/packages/tts-ggml/vcpkg/triplets/x64-linux.cmake @@ -0,0 +1,9 @@ +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) +set(VCPKG_CMAKE_SYSTEM_NAME Linux) + +set(VCPKG_CHAINLOAD_TOOLCHAIN_FILE "${CMAKE_CURRENT_LIST_DIR}/../toolchains/linux-clang.cmake") +set(VCPKG_C_FLAGS "-fPIC") +set(VCPKG_CXX_FLAGS "-fPIC -stdlib=libc++") +set(VCPKG_LINKER_FLAGS "-stdlib=libc++")