diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml index 3cb312e98b..4c0f24ac78 100644 --- a/.github/workflows/benchmark-perf-llm-llamacpp.yml +++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml @@ -6,13 +6,14 @@ name: Benchmark Performance — LLM Parameter Sweep # # To change what runs, edit: # desktop: models.manifest.json (models) + llm-parameter-sweep.config.js (sweep dims) -# mobile: mobile.config.json +# mobile: test/integration/_benchmark-perf.js (shared runner) + the +# benchmark-perf-*.test.js shards (one per model x KV-cache type) on: workflow_dispatch: inputs: ref: - description: "Git ref (branch/tag/SHA) of the code to benchmark (must be on this repo)" + description: "Branch/tag/SHA of the benchmark code+addon to build and run. Defaults to the branch this workflow is launched from." required: false type: string run_desktop: @@ -25,6 +26,19 @@ on: required: false default: true type: boolean + summarize_only: + description: "Summarize a PREVIOUS run only: re-render its report in ~1 min, skip the ~6h benchmarks. Requires 'Previous run ID to summarize' below." + required: false + default: false + type: boolean + artifact_run_id: + description: "Previous run ID to summarize (the number in that run's URL). Only used when 'summarize_only' is checked." + required: false + type: string + compare_run_id: + description: "Optional baseline run ID to compare against (the number in that run's URL). Adds Δ TTFT / TPS / ppTPS columns vs that run." + required: false + type: string permissions: contents: read @@ -70,11 +84,77 @@ jobs: ref="${INPUT_REF:-$REF_NAME}" echo "ref=$ref" >> "$GITHUB_OUTPUT" + # Fast fail-first gate (~30s, no prebuild needed): the benchmark shards are + # generated from test/integration/_benchmark-matrix.js and not committed, so + # the committed integration.auto.cjs and this workflow's test_groups must stay + # in lockstep with the matrix. 
If they have drifted, fail the whole run now + # rather than after the mobile prebuild. + verify-shards: + needs: + - context + - label-gate + if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile && !inputs.summarize_only + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + steps: + - name: Checkout matrix + generated artifacts + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + with: + ref: ${{ needs.context.outputs.ref }} + sparse-checkout: | + packages/llm-llamacpp/scripts/generate-benchmark-shards.js + packages/llm-llamacpp/test/integration/_benchmark-matrix.js + packages/llm-llamacpp/test/mobile/integration.auto.cjs + .github/workflows/benchmark-perf-llm-llamacpp.yml + sparse-checkout-cone-mode: false + - name: Verify benchmark matrix is in sync + working-directory: packages/llm-llamacpp + run: node scripts/generate-benchmark-shards.js --check + + # Records, into a run-meta.json artifact, the exact addon version this run + # benchmarks (from the checked-out package.json) AND the mobile shard list it + # targets (from the checked-out matrix). The summarize/comparison step reads + # both from this artifact, so a stamped run can never be mislabelled and its + # coverage is always scored against its OWN matrix — even when re-rendered + # after the matrix has grown. 
+ stamp-version: + needs: + - context + - label-gate + if: needs.label-gate.outputs.authorised == 'true' && !inputs.summarize_only + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + steps: + - name: Checkout package.json + benchmark matrix + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + with: + ref: ${{ needs.context.outputs.ref }} + sparse-checkout: | + packages/llm-llamacpp/package.json + packages/llm-llamacpp/test/integration/_benchmark-matrix.js + sparse-checkout-cone-mode: false + - name: Write run-meta.json + shell: bash + run: | + mkdir -p run-meta + node -e "const pkg=require('./packages/llm-llamacpp/package.json');const m=require('./packages/llm-llamacpp/test/integration/_benchmark-matrix.js');require('fs').writeFileSync('run-meta/run-meta.json',JSON.stringify({addonVersion:'@qvac/llm-llamacpp@'+pkg.version,expectedShards:m.matrix().map(m.mobileShardKey)})+'\n')" + cat run-meta/run-meta.json + - name: Upload run-meta + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 + with: + name: benchmark-run-meta-${{ github.run_number }} + path: run-meta/ + retention-days: 90 + prebuild: needs: - context - label-gate - if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile + if: needs.label-gate.outputs.authorised == 'true' && (inputs.run_mobile || inputs.run_desktop) && !inputs.summarize_only permissions: contents: write packages: write @@ -90,14 +170,21 @@ jobs: needs: - context - label-gate - if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop + - prebuild + if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop && !inputs.summarize_only name: Desktop Parameter Sweep - runs-on: ai-run-linux-gpu + runs-on: qvac-ubuntu2204-x64-gpu timeout-minutes: 360 - env: - VCPKG_BINARY_SOURCES: "clear;files,${{ github.workspace }}/packages/llm-llamacpp/vcpkg/cache,readwrite" steps: + # Self-hosted runners are persistent, so the workspace 
must be wiped + # between runs (skipped on the rare github-hosted fallback). + - name: Manual Workspace Cleanup + run: rm -rf "$GITHUB_WORKSPACE" && mkdir -p "$GITHUB_WORKSPACE" + shell: bash + working-directory: . + if: runner.environment != 'github-hosted' + - name: Checkout repository uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 with: @@ -109,82 +196,34 @@ jobs: node-version: lts/* - name: Install bare runtime - working-directory: packages/llm-llamacpp run: npm install -g --force bare - - name: Get vcpkg cache - id: cache-vcpkg - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # 5.0.4 - with: - key: linux-x64-${{ hashFiles('packages/llm-llamacpp/vcpkg.json', 'packages/llm-llamacpp/vcpkg-configuration.json', 'packages/llm-llamacpp/vcpkg/ports/**') }} - path: packages/llm-llamacpp/vcpkg/cache - restore-keys: linux-x64- - - - name: Setup LLVM - uses: tetherto/qvac/.github/actions/setup-llvm@98a6a6b6e8f3866dfdd75052a4071269ce85dc41 - - - name: Build addon from source + - name: Install addon dependencies working-directory: packages/llm-llamacpp - run: | - echo "=== Building addon from source ===" + run: npm install - echo "Cleaning up disk space..." - sudo docker image prune --all --force - sudo rm -rf /opt/hostedtoolcache/CodeQL /opt/ghc /usr/share/dotnet /usr/local/lib/android - df -h + # Run the exact linux-x64 binary the prebuild job built (the same artifacts + # the integration tests consume) instead of compiling on the runner, so the + # benchmark needs no build toolchain and matches what ships. 
+ - name: Download prebuilds (from artifacts) + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + path: ${{ runner.temp }}/prebuilds-staging + merge-multiple: true - npm install -g --force bare-make + - name: Move prebuilds into the addon + shell: bash + run: | + mkdir -p packages/llm-llamacpp/prebuilds + cp -r ${{ runner.temp }}/prebuilds-staging/* packages/llm-llamacpp/prebuilds/ + ls -la packages/llm-llamacpp/prebuilds/ - echo "VCPKG_INSTALLATION_ROOT=$VCPKG_INSTALLATION_ROOT" - if [ -n "${VCPKG_INSTALLATION_ROOT:-}" ] && [ -d "$VCPKG_INSTALLATION_ROOT" ]; then - echo "Using pre-installed vcpkg at $VCPKG_INSTALLATION_ROOT" - export VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT - else - echo "vcpkg not found, bootstrapping..." - sudo apt-get update - sudo apt-get install -y curl zip unzip tar pkg-config - git clone https://github.com/Microsoft/vcpkg.git /tmp/vcpkg - /tmp/vcpkg/bootstrap-vcpkg.sh -disableMetrics - export VCPKG_ROOT=/tmp/vcpkg - fi - echo "VCPKG_ROOT=$VCPKG_ROOT" - echo "VCPKG_ROOT=$VCPKG_ROOT" >> $GITHUB_ENV - rm -rf $VCPKG_ROOT/buildtrees/* - rm -rf $VCPKG_ROOT/packages/* - mkdir -p vcpkg/cache - - echo "Installing system dependencies..." - sudo apt-get update - sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils - - echo "Installing Vulkan SDK (latest)..." 
- wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz - mkdir -p "$HOME/vulkan" && cd "$HOME/vulkan" - tar xf /tmp/vulkansdk.tar.xz --strip-components=1 - export VULKAN_SDK="$HOME/vulkan/x86_64" - export PATH="$VULKAN_SDK/bin:$PATH" - export LD_LIBRARY_PATH="$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" - export VK_ADD_LAYER_PATH="$VULKAN_SDK/share/vulkan/explicit_layer.d" - export PKG_CONFIG_PATH="$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}" - echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV - echo "PATH=$PATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "VK_ADD_LAYER_PATH=$VK_ADD_LAYER_PATH" >> $GITHUB_ENV - echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH" >> $GITHUB_ENV - - cd "$GITHUB_WORKSPACE/packages/llm-llamacpp" - npm install - - echo "Running bare-make generate..." - bare-make generate --platform linux --arch x64 - - echo "Running bare-make build..." - bare-make build - - echo "Running bare-make install..." - bare-make install - - echo "=== Build complete ===" + - name: Detect GPU + id: gpu + shell: bash + run: | + gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '\r' || true) # '|| true': a failing nvidia-smi must not append a second line to a partially printed name + echo "name=${gpu_name:-GPU}" >> "$GITHUB_OUTPUT" # missing nvidia-smi or empty output falls back to the generic "GPU" label - name: Install benchmark dependencies working-directory: packages/llm-llamacpp/benchmarks/performance @@ -199,24 +238,22 @@ jobs: working-directory: packages/llm-llamacpp/benchmarks/performance run: bare ./llm-parameter-sweep.js --addon-source local - - name: Add job summary + # Stamp the desktop device name (incl. detected GPU) into the results so + # the renderer shows it even on a later summarize-only re-render, when this + # job didn't run. 
+ - name: Stamp desktop device if: always() - working-directory: packages/llm-llamacpp/benchmarks/performance shell: bash + env: + GPU_NAME: ${{ steps.gpu.outputs.name }} run: | - LATEST_MD=$(find results/parameter-sweep -name "*.md" -type f 2>/dev/null | sort | tail -1) - { - echo "## LLM Parameter Sweep — Desktop" - echo "" - echo "ref: \`${{ needs.context.outputs.ref }}\`" - echo "" - if [ -n "${LATEST_MD:-}" ]; then - cat "$LATEST_MD" - else - echo "No results file found." - fi - } >> "$GITHUB_STEP_SUMMARY" + mkdir -p packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep + printf '{"desktopDevice":"Desktop (%s)"}\n' "$GPU_NAME" \ + > packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep/desktop-meta.json + cat packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep/desktop-meta.json + # The run summary is rendered by the summarize job (unified desktop + + # mobile view); this job just uploads the raw sweep results. - name: Upload results if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 @@ -226,23 +263,231 @@ jobs: retention-days: 90 if-no-files-found: ignore + # Mobile is sharded one group per (model x KV-cache type) = 70 groups so each + # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling. + # All of them in one reused-workflow call do NOT fit: Android serializes the + # runs against its device pool and the macOS runner fills its disk collecting + # logs. So we split the groups into one batch per KV-cache type (10 each — the + # proven in-budget load) and run them sequentially (max-parallel: 1) to avoid + # Device Farm pool contention. Each batch raises the job timeout to 180 for + # headroom (proven 10-shard wall ~119 min) and gets a distinct artifact_suffix + # so its perf-report doesn't collide; summarize aggregates all of them. 
The + # TurboQuant/PolarQuant batches (tbq*/pq*) are Vulkan + CPU only, so they show + # Crashed on iOS (Metal) and Samsung GPU. These wrappers are deliberately absent from + # the addon's test-groups.json, so this override is the only path that runs + # them — normal mobile integration runs never trigger the benchmark. mobile-benchmark: needs: - context - prebuild - label-gate - if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile + - verify-shards + if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile && !inputs.summarize_only permissions: contents: read packages: read pull-requests: write id-token: write + strategy: + fail-fast: false + max-parallel: 1 + matrix: + include: + - cache: f16 + groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]' + - cache: q8_0 + groups: 
'[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]' + - cache: q4_0 + groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]' + - cache: tbq3_0-pq3_0 + groups: 
'[{"name":"BenchmarkPerf08bQ40Tbq30Pq30","grep":"runBenchmarkPerf08bQ40Tbq30Pq30Test"},{"name":"BenchmarkPerf08bQ41Tbq30Pq30","grep":"runBenchmarkPerf08bQ41Tbq30Pq30Test"},{"name":"BenchmarkPerf08bQ4KMTbq30Pq30","grep":"runBenchmarkPerf08bQ4KMTbq30Pq30Test"},{"name":"BenchmarkPerf08bQ6KTbq30Pq30","grep":"runBenchmarkPerf08bQ6KTbq30Pq30Test"},{"name":"BenchmarkPerf08bQ80Tbq30Pq30","grep":"runBenchmarkPerf08bQ80Tbq30Pq30Test"},{"name":"BenchmarkPerf2bQ40Tbq30Pq30","grep":"runBenchmarkPerf2bQ40Tbq30Pq30Test"},{"name":"BenchmarkPerf2bQ41Tbq30Pq30","grep":"runBenchmarkPerf2bQ41Tbq30Pq30Test"},{"name":"BenchmarkPerf2bQ4KMTbq30Pq30","grep":"runBenchmarkPerf2bQ4KMTbq30Pq30Test"},{"name":"BenchmarkPerf2bQ6KTbq30Pq30","grep":"runBenchmarkPerf2bQ6KTbq30Pq30Test"},{"name":"BenchmarkPerf2bQ80Tbq30Pq30","grep":"runBenchmarkPerf2bQ80Tbq30Pq30Test"}]' + - cache: tbq4_0-pq4_0 + groups: '[{"name":"BenchmarkPerf08bQ40Tbq40Pq40","grep":"runBenchmarkPerf08bQ40Tbq40Pq40Test"},{"name":"BenchmarkPerf08bQ41Tbq40Pq40","grep":"runBenchmarkPerf08bQ41Tbq40Pq40Test"},{"name":"BenchmarkPerf08bQ4KMTbq40Pq40","grep":"runBenchmarkPerf08bQ4KMTbq40Pq40Test"},{"name":"BenchmarkPerf08bQ6KTbq40Pq40","grep":"runBenchmarkPerf08bQ6KTbq40Pq40Test"},{"name":"BenchmarkPerf08bQ80Tbq40Pq40","grep":"runBenchmarkPerf08bQ80Tbq40Pq40Test"},{"name":"BenchmarkPerf2bQ40Tbq40Pq40","grep":"runBenchmarkPerf2bQ40Tbq40Pq40Test"},{"name":"BenchmarkPerf2bQ41Tbq40Pq40","grep":"runBenchmarkPerf2bQ41Tbq40Pq40Test"},{"name":"BenchmarkPerf2bQ4KMTbq40Pq40","grep":"runBenchmarkPerf2bQ4KMTbq40Pq40Test"},{"name":"BenchmarkPerf2bQ6KTbq40Pq40","grep":"runBenchmarkPerf2bQ6KTbq40Pq40Test"},{"name":"BenchmarkPerf2bQ80Tbq40Pq40","grep":"runBenchmarkPerf2bQ80Tbq40Pq40Test"}]' + - cache: pq3_0 + groups: 
'[{"name":"BenchmarkPerf08bQ40Pq30","grep":"runBenchmarkPerf08bQ40Pq30Test"},{"name":"BenchmarkPerf08bQ41Pq30","grep":"runBenchmarkPerf08bQ41Pq30Test"},{"name":"BenchmarkPerf08bQ4KMPq30","grep":"runBenchmarkPerf08bQ4KMPq30Test"},{"name":"BenchmarkPerf08bQ6KPq30","grep":"runBenchmarkPerf08bQ6KPq30Test"},{"name":"BenchmarkPerf08bQ80Pq30","grep":"runBenchmarkPerf08bQ80Pq30Test"},{"name":"BenchmarkPerf2bQ40Pq30","grep":"runBenchmarkPerf2bQ40Pq30Test"},{"name":"BenchmarkPerf2bQ41Pq30","grep":"runBenchmarkPerf2bQ41Pq30Test"},{"name":"BenchmarkPerf2bQ4KMPq30","grep":"runBenchmarkPerf2bQ4KMPq30Test"},{"name":"BenchmarkPerf2bQ6KPq30","grep":"runBenchmarkPerf2bQ6KPq30Test"},{"name":"BenchmarkPerf2bQ80Pq30","grep":"runBenchmarkPerf2bQ80Pq30Test"}]' + - cache: pq4_0 + groups: '[{"name":"BenchmarkPerf08bQ40Pq40","grep":"runBenchmarkPerf08bQ40Pq40Test"},{"name":"BenchmarkPerf08bQ41Pq40","grep":"runBenchmarkPerf08bQ41Pq40Test"},{"name":"BenchmarkPerf08bQ4KMPq40","grep":"runBenchmarkPerf08bQ4KMPq40Test"},{"name":"BenchmarkPerf08bQ6KPq40","grep":"runBenchmarkPerf08bQ6KPq40Test"},{"name":"BenchmarkPerf08bQ80Pq40","grep":"runBenchmarkPerf08bQ80Pq40Test"},{"name":"BenchmarkPerf2bQ40Pq40","grep":"runBenchmarkPerf2bQ40Pq40Test"},{"name":"BenchmarkPerf2bQ41Pq40","grep":"runBenchmarkPerf2bQ41Pq40Test"},{"name":"BenchmarkPerf2bQ4KMPq40","grep":"runBenchmarkPerf2bQ4KMPq40Test"},{"name":"BenchmarkPerf2bQ6KPq40","grep":"runBenchmarkPerf2bQ6KPq40Test"},{"name":"BenchmarkPerf2bQ80Pq40","grep":"runBenchmarkPerf2bQ80Pq40Test"}]' uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml secrets: inherit with: repository: ${{ github.repository }} ref: ${{ needs.context.outputs.ref }} - # Schedule only the benchmark group. runBenchmarkPerfTest is deliberately - # absent from the addon's test-groups.json, so this override is the only - # path that runs it — normal mobile integration runs never trigger it. 
- test_groups: '[{"name":"benchmarkPerf","grep":"runBenchmarkPerfTest"}]' + job_timeout_minutes: 180 + test_groups: ${{ matrix.groups }} + artifact_suffix: ${{ matrix.cache }}- + + # Aggregates desktop + mobile artifacts into one unified markdown report. + # Runs after benchmarks finish, or standalone when summarize_only=true + # (set artifact_run_id to re-render a previous run). + # Set compare_run_id to add Δ TPS / TTFT / ppTPS columns vs a baseline run. + summarize: + needs: + - context + - label-gate + - stamp-version + - desktop-benchmark + - mobile-benchmark + if: needs.label-gate.outputs.authorised == 'true' && always() && needs.context.result == 'success' + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + steps: + - name: Checkout aggregator + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + with: + ref: ${{ needs.context.outputs.ref }} + sparse-checkout: | + packages/llm-llamacpp/benchmarks/performance/render-report.js + packages/llm-llamacpp/test/integration/_benchmark-matrix.js + packages/llm-llamacpp/package.json + + - name: Setup Node.js + uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 + with: + node-version: lts/* + + - name: Get addon version + id: addon_ver + shell: bash + run: | + ver=$(node -e "process.stdout.write(require('./packages/llm-llamacpp/package.json').version)" 2>/dev/null || true) + echo "version=${ver:+@qvac/llm-llamacpp@$ver}" >> "$GITHUB_OUTPUT" + + # Artifacts are matched by name prefix and scoped to a single run via + # run-id, so the run number is never needed — run-id alone identifies the + # run. Defaults to the current run; artifact_run_id targets a previous one. 
+ - name: Download desktop sweep artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: llm-param-sweep-desktop-* + run-id: ${{ inputs.artifact_run_id || github.run_id }} + github-token: ${{ github.token }} + path: combined-reports + continue-on-error: true + + - name: Download mobile perf-report artifacts + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: perf-report-llamacpp-llm-* + run-id: ${{ inputs.artifact_run_id || github.run_id }} + github-token: ${{ github.token }} + path: combined-reports + continue-on-error: true + + # The version this run benchmarked (run-meta.json), authoritative over any + # manual --addon-version. Lands alongside the data so the renderer reads it. + - name: Download run-meta (current) + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: benchmark-run-meta-* + run-id: ${{ inputs.artifact_run_id || github.run_id }} + github-token: ${{ github.token }} + path: combined-reports + continue-on-error: true + + - name: Download baseline desktop artifact for comparison + if: inputs.compare_run_id != '' + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: llm-param-sweep-desktop-* + run-id: ${{ inputs.compare_run_id }} + github-token: ${{ github.token }} + path: baseline-reports + continue-on-error: true + + - name: Download baseline mobile artifacts for comparison + if: inputs.compare_run_id != '' + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: perf-report-llamacpp-llm-* + run-id: ${{ inputs.compare_run_id }} + github-token: ${{ github.token }} + path: baseline-reports + continue-on-error: true + + - name: Download baseline run-meta for comparison + if: inputs.compare_run_id != '' + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: 
benchmark-run-meta-* + run-id: ${{ inputs.compare_run_id }} + github-token: ${{ github.token }} + path: baseline-reports + continue-on-error: true + + - name: Verify baseline artifacts downloaded + if: inputs.compare_run_id != '' + shell: bash + env: + COMPARE_RUN_ID: ${{ inputs.compare_run_id }} + run: | + if [ -z "$(find baseline-reports -type f -name "*.json" ! -name "run-meta.json" ! -name "desktop-meta.json" -print -quit 2>/dev/null)" ]; then # metadata stamps alone are not benchmark data; no pipe, so pipefail cannot misfire + echo "::error::No baseline artifacts found for run $COMPARE_RUN_ID (wrong run ID, or artifacts expired past the 90-day retention). A comparison was requested but cannot be produced." + exit 1 + fi + + - name: Render unified benchmark report + shell: bash + env: + ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id || github.run_id }} + COMPARE_RUN_ID: ${{ inputs.compare_run_id }} + ADDON_VERSION: ${{ steps.addon_ver.outputs.version }} + REPO: ${{ github.repository }} + run: | + if [ -z "$(find combined-reports -type f -name "*.json" ! -name "run-meta.json" ! -name "desktop-meta.json" -print -quit 2>/dev/null)" ]; then # run-meta.json / desktop-meta.json are always uploaded, so they must not satisfy this guard + echo "::error::No benchmark reports found for run $ARTIFACT_RUN_ID. Nothing to render." + exit 1 + fi + mkdir -p benchmark-artifacts + + # Build args as an array so values containing spaces (e.g. the GPU + # device name "Desktop (NVIDIA RTX 3090)") survive intact. + # The desktop device name (incl. GPU) comes from the stamped + # desktop-meta.json inside the sweep artifact, so it's correct on + # re-renders too; no --desktop-device needed here. 
+ ARGS=(--dir combined-reports --output benchmark-artifacts/qwen35-benchmark-findings.md --html benchmark-artifacts/qwen35-benchmark-charts.html --charts-url "__CHARTS_URL__") + if [ -n "$ADDON_VERSION" ]; then + ARGS+=(--addon-version "$ADDON_VERSION") + fi + if [ -d baseline-reports ] && [ -n "$(find baseline-reports -type f -name "*.json" ! -name "run-meta.json" ! -name "desktop-meta.json" -print -quit 2>/dev/null)" ]; then # compare only when real baseline data exists; no 'find | grep -q' pipe that pipefail could fail on SIGPIPE + ARGS+=(--compare-dir baseline-reports) + if [ -n "$COMPARE_RUN_ID" ]; then + ARGS+=(--baseline-run-id "$COMPARE_RUN_ID") + ARGS+=(--baseline-run-url "https://github.com/$REPO/actions/runs/$COMPARE_RUN_ID") + fi + fi + + node packages/llm-llamacpp/benchmarks/performance/render-report.js "${ARGS[@]}" + + # Upload first so the artifact's download URL is known, then substitute it + # into the report's chart-download link before writing the run summary — so + # the link in the summary points straight at the download instead of telling + # readers to scroll to the artifacts section. Falls back to the run page URL + # if the upload didn't yield a URL. + - name: Upload consolidated report + id: upload + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 + with: + name: qwen35-benchmark-findings-${{ github.run_number }} + path: benchmark-artifacts/ + retention-days: 90 + if-no-files-found: ignore + + - name: Add to run summary + if: always() + shell: bash + env: + ARTIFACT_URL: ${{ steps.upload.outputs.artifact-url }} + RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + set +e + MD_FILE="benchmark-artifacts/qwen35-benchmark-findings.md" + if [ -f "$MD_FILE" ]; then + url="$ARTIFACT_URL" + if [ -z "$url" ]; then url="$RUN_URL"; fi + sed "s|__CHARTS_URL__|$url|g" "$MD_FILE" >> "$GITHUB_STEP_SUMMARY" + else + echo "No consolidated benchmark report available." 
>> "$GITHUB_STEP_SUMMARY" + fi diff --git a/.github/workflows/integration-mobile-test-llm-llamacpp.yml b/.github/workflows/integration-mobile-test-llm-llamacpp.yml index 3f0953fe2d..b173bf4a31 100644 --- a/.github/workflows/integration-mobile-test-llm-llamacpp.yml +++ b/.github/workflows/integration-mobile-test-llm-llamacpp.yml @@ -41,6 +41,16 @@ on: type: string required: false default: "" + job_timeout_minutes: + description: "Override the build-and-test job timeout (minutes). Default 150. Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom." + type: number + required: false + default: 150 + artifact_suffix: + description: "Optional suffix appended to the perf-report artifact-name stem (before the platform segment) so multiple invocations in one run (e.g. Benchmark batches) don't collide. Default empty keeps the existing name." + type: string + required: false + default: "" pre_build_script: description: "Optional node script (path under packages/llm-llamacpp) run before the mobile build to bootstrap a benchmark into the framework (stage files + regenerate the test list). Default '' = no-op." type: string @@ -89,7 +99,7 @@ jobs: name: Build ${{ matrix.platform }} and Run E2E Tests runs-on: ${{ matrix.runner }} environment: release - timeout-minutes: 150 + timeout-minutes: ${{ inputs.job_timeout_minutes || 150 }} continue-on-error: true permissions: contents: read @@ -141,6 +151,20 @@ jobs: prebuild-artifact-prefix: 'llama-cpp-' pat-token: ${{ secrets.PAT_TOKEN }} + # The mobile perf benchmark shards (benchmark-perf-*.test.js) are not + # committed — they are generated from test/integration/_benchmark-matrix.js. + # Regenerate them after setup (which provisions Node) but before the + # bundle is built, so the Device Farm app contains them, then hard-fail if + # any are still missing. This makes it impossible to build the bundle + # without the shards present. 
(Runs for every LLM mobile run; for + # non-benchmark runs the generated shards are simply skipped, like any + # other mobile-only test.) + - name: Generate benchmark shards + working-directory: addon/packages/llm-llamacpp + run: | + npm run generate:benchmark-shards + node scripts/generate-benchmark-shards.js --assert-shards + # ── Benchmark bootstrap (additive, opt-in via pre_build_script) ────────── # When a caller passes pre_build_script (only the VLM benchmark does), run the # bootstrap: optionally pull fixture assets from the caller-supplied object-store @@ -266,7 +290,7 @@ jobs: platform: ${{ matrix.platform }} merge: 'true' unzip-customer-artifacts: 'true' - artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }} + artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }} - name: Comment results on PR if: always() && !cancelled() diff --git a/packages/llm-llamacpp/.gitignore b/packages/llm-llamacpp/.gitignore index 6bad13398b..ad88f07c74 100644 --- a/packages/llm-llamacpp/.gitignore +++ b/packages/llm-llamacpp/.gitignore @@ -8,6 +8,9 @@ prebuilds/ test/unit/all.js test/integration/all.js +# Mobile perf benchmark shards — generated from test/integration/_benchmark-matrix.js +# by scripts/generate-benchmark-shards.js. Never commit them. 
+test/integration/benchmark-perf-*.test.js test/model/ test/results/ .npmrc diff --git a/packages/llm-llamacpp/benchmarks/performance/README.md b/packages/llm-llamacpp/benchmarks/performance/README.md index fdac758a7f..4e39109046 100644 --- a/packages/llm-llamacpp/benchmarks/performance/README.md +++ b/packages/llm-llamacpp/benchmarks/performance/README.md @@ -7,6 +7,7 @@ Full-factorial parameter sweep for `@qvac/llm-llamacpp`, measuring TTFT, TPS, an - [Addon Source](#addon-source) - [Setup](#setup) - [Quick Start](#quick-start) +- [CI Workflow (GitHub Actions)](#ci-workflow-github-actions) - [Sweep Flags](#sweep-flags) - [Prompt Cases](#prompt-cases) - [Judge Pass](#judge-pass) @@ -62,22 +63,106 @@ npm run run:param-sweep -- \ npm run run:judge ``` +## CI Workflow (GitHub Actions) + +Everything above runs locally. To run the benchmark on CI runners + AWS Device +Farm (desktop **and** mobile), use the **Benchmark Performance — LLM Parameter +Sweep** workflow (`.github/workflows/benchmark-perf-llm-llamacpp.yml`). + +Trigger it from the GitHub UI: **Actions → Benchmark Performance — LLM Parameter +Sweep → Run workflow**. There is nothing to configure for a normal run — the +matrix (models, quantizations, reasoning-budget, KV-cache types, repeats) is +fixed in the scripts; edit those to change what runs. + +The **mobile** sweep runs one Device Farm session per +`(size, quant, KV-cache)` combination. Those combinations live in a single +source of truth, `test/integration/_benchmark-matrix.js`. The per-combination +test files (`test/integration/benchmark-perf-*.test.js`) and the workflow's +mobile `test_groups` are derived from it and the shard files are **not +committed** — regenerate them with `npm run generate:benchmark-shards` (the CI +mobile job does this automatically before the Device Farm bundle is built, and +fails hard if any shard is missing). 
To change the mobile grid, edit +`_benchmark-matrix.js`, run `npm run generate:benchmark-shards` and +`npm run test:mobile:generate`, then update the workflow groups from +`node scripts/generate-benchmark-shards.js --groups` and commit +`integration.auto.cjs`. `npm run verify:benchmark-shards` checks they are all in +sync. + +### Inputs + +| Input | Default | Purpose | +|-------|---------|---------| +| `ref` | launch branch | Branch/tag/SHA of the benchmark code + addon to build and run | +| `run_desktop` | `true` | Run the desktop sweep (Linux GPU runner) | +| `run_mobile` | `true` | Run the mobile sweep (Android + iOS via Device Farm) | +| `summarize_only` | `false` | Re-render a previous run's report in ~1 min, skipping the ~6 h benchmarks. Needs `artifact_run_id` | +| `artifact_run_id` | — | Previous run ID to re-render (the number in that run's URL). Only with `summarize_only` | +| `compare_run_id` | — | Baseline run ID to diff against — adds Δ TTFT / TPS / ppTPS columns | + +Run IDs are the number in a run's URL (`.../actions/runs/<run_id>`). A fresh run +needs none — leave them blank, or set only `compare_run_id` to diff against a baseline. + +### Recipes + +| Goal | Inputs | +|------|--------| +| Fresh full benchmark (desktop + mobile) | *(all blank)* | +| Desktop only | `run_mobile = false` | +| Mobile only | `run_desktop = false` | +| Benchmark a specific code version | `ref = <branch|tag|sha>` | +| Re-render a finished run's report | `summarize_only = true`, `artifact_run_id = <run_id>` | +| Compare two runs (regression check) | `summarize_only = true`, `artifact_run_id = <run_id>`, `compare_run_id = <baseline_run_id>` | +| Fresh run that also diffs vs a baseline | `compare_run_id = <baseline_run_id>` | + +The comparison downloads both runs' artifacts and prints a `Δ` for every +metric, e.g. `122.37 ± 0.62 | -0.52` (current value ± stddev, then the delta vs +baseline). It works against **any** two runs. + +### What the report contains + +Rendered into the run summary of the `summarize` job and uploaded as the +`qwen35-benchmark-findings-<run_number>` artifact. 
One table per device, identical shape +for desktop and mobile: + +- **Header** — addon version, prompt size, repeats per config (e.g. + `desktop=5, mobile=3`). The version is recorded into the run's artifacts at + benchmark time, so it is always the version that actually ran and a + comparison auto-reads each run's own version (nothing to type, nothing to get + wrong). +- **Columns** — `TTFT (ms) | TPS | ppTPS | Tokens`, each as `mean ± stddev` + across the repeats (plus `Δ` columns when comparing). +- **Desktop device** — shows the detected GPU (e.g. `Desktop (NVIDIA RTX …)`), + preserved on re-renders. +- **`Crashed`** — a configuration that crashed or produced no output on that + device (e.g. quantized KV cache on Adreno GPUs). +- **Best configuration per device** — highest TPS and highest ppTPS. + +> Note: the table shape is identical across desktop and mobile, but the +> generation length differs — desktop caps at `n-predict` 1024 tokens, mobile +> at 512. The rate metrics (TPS, ppTPS) stay comparable; the `Tokens` column +> and absolute TTFT reflect those different caps. + ## Sweep Flags All sweep dimensions accept comma-separated values for full-factorial grid. +Defaults below are the focused set currently pinned in +`llm-parameter-sweep.config.js` (`PARAMETER_SWEEP`). Pass a flag with +comma-separated values to widen any dimension into the full grid. 
+ | Flag | Type | Default | Description | |------|------|---------|-------------| | `--models` | `str` | All in manifest | Comma-separated model IDs | -| `--quantization` | `str` | `Q4_0,Q4_K_M,Q8_0,F16` | Quantization levels | -| `--device` | `str` | `gpu` | `gpu`, `cpu` | +| `--quantization` | `str` | `Q4_0,Q4_1,Q4_K_M,Q6_K,Q8_0` | Quantization levels | +| `--reasoning-budget` | `str` | `-1,0` | Reasoning budget values | +| `--device` | `str` | `gpu` (desktop) | `gpu`, `cpu` | | `--ctx-size` | `str` | `2048` | Context sizes | -| `--batch-size` | `str` | `512,2048` | Batch sizes | -| `--ubatch-size` | `str` | `128,512` | Micro-batch sizes (must be <= batch-size) | -| `--threads` | `str` | `2,4,8` | Thread counts | -| `--flash-attn` | `str` | `off,on` | Flash attention | -| `--cache-type-k` | `str` | `f16,q8_0,q4_0` | KV cache key type | -| `--cache-type-v` | `str` | `f16,q8_0,q4_0` | KV cache value type | +| `--batch-size` | `str` | `512` | Batch sizes | +| `--ubatch-size` | `str` | `512` | Micro-batch sizes (must be <= batch-size) | +| `--threads` | `str` | `4` | Thread counts | +| `--flash-attn` | `str` | `off` | Flash attention | +| `--cache-type-k` | `str` | `f16` | KV cache key type | +| `--cache-type-v` | `str` | `f16` | KV cache value type | | `--repeats` | `int` | `5` | Repeats per case | | `--results-dir` | `str` | `results/parameter-sweep/` | Output directory | | `--prompts-file` | `str` | `test-prompts.json` | Prompts file path | @@ -86,11 +171,14 @@ All sweep dimensions accept comma-separated values for full-factorial grid. ## Prompt Cases -Each parameter combination runs three prompt cases: +The sweep currently runs a single prompt case, `long` (the focused ~512-token +benchmark prompt) — `PROMPT_CASES = ['long']` in `case-runner.js`. The +`ctx-filling` / `span-fill` fixtures below still exist in `test-prompts.json` +and can be re-enabled by extending `PROMPT_CASES`. 
| Case | Description | Prompt Selection | |------|-------------|-----------------| -| `long` | Long-output generation | Static `long` prompt | +| `long` | Long-output generation (active) | Static `long` prompt | | `ctx-filling` | Maximizes context fill | `ctx-filling__ctx={ctx-size}` | | `span-fill` | Spans multiple prefill batches | `batch-spanning__ctx={ctx-size}__bs={batch-size}` | diff --git a/packages/llm-llamacpp/benchmarks/performance/case-runner.js b/packages/llm-llamacpp/benchmarks/performance/case-runner.js index ca149705ac..575d30e51a 100644 --- a/packages/llm-llamacpp/benchmarks/performance/case-runner.js +++ b/packages/llm-llamacpp/benchmarks/performance/case-runner.js @@ -5,7 +5,9 @@ const path = require('bare-path') const { round, average, stddev, cartesianProduct } = require('./math') const { stripSurroundingQuotes, normalizeArgValue } = require('./utils') -const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill'] +// The focused sweep uses a single ~512-token prompt. Add 'ctx-filling' / +// 'span-fill' back to also sweep context-fill and batch-spanning prompts. 
+const PROMPT_CASES = ['long'] const PROMPTS_PER_CASE = 1 const SWEEP_OVERRIDE_KEYS = [ @@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [ 'ubatch-size', 'flash-attn', 'cache-type-k', - 'cache-type-v' + 'cache-type-v', + 'reasoning-budget' ] function splitCsvArg (value, key) { @@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) { const threadsValues = sweep.threads || [] const cacheTypeKValues = sweep['cache-type-k'] || [] const cacheTypeVValues = sweep['cache-type-v'] || [] + const reasoningBudgetValues = sweep['reasoning-budget'] || [] const cases = [] for (const promptCase of PROMPT_CASES) { @@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) { if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 && flashAttnValues.length > 0 && threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) { + const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null] const combos = cartesianProduct([ supportedQuants, devices, @@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) { flashAttnValues, threadsValues, cacheTypeKValues, - cacheTypeVValues + cacheTypeVValues, + rbValues ]) - for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) { + for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) { if (Number(ubatchSize) > Number(batchSize)) { continue // Skip combinations where ubatchSize is greater than batchSize } @@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) { 'cache-type-k': cacheTypeK, 'cache-type-v': cacheTypeV } + if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget - const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}` + const rbSuffix = 
reasoningBudget !== null ? `__rb=${reasoningBudget}` : '' + const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}` for (const promptCase of PROMPT_CASES) { cases.push({ @@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) { const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null) const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null) const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null) + const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null) const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null @@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) { ttftMsStd: round(stddev(ttftMsValues), 3), tpsMean: round(average(tpsValues), 3), tpsStd: round(stddev(tpsValues), 3), + ppTpsMean: round(average(ppTpsValues), 3), + ppTpsStd: round(stddev(ppTpsValues), 3), promptTokens: firstPromptTokens, generatedTokens: firstGeneratedTokens } diff --git a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js index a495ec4a9a..ea2c80e0cc 100644 --- a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js +++ b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js @@ -3,10 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const os = require('bare-os') -const { - DEFAULT_SWEEP_CTX_SIZES, - DEFAULT_SWEEP_BATCH_SIZES -} = require('./utils') const DEFAULT_RESULTS_DIR = path.resolve(__dirname, 'results', 'parameter-sweep') const DEFAULT_MODELS_DIR = path.resolve(__dirname, 'models') @@ -94,17 +90,20 @@ function loadModelsFromManifest () { const 
MODELS = loadModelsFromManifest() -// Parameter sweep: full factorial (cartesian product) +// Parameter sweep (cartesian product). Tuned to the focused sweep: +// only quantization and reasoning-budget vary; every other dimension is +// pinned to a single value. Edit these arrays to sweep more dimensions. const PARAMETER_SWEEP = { - quantization: ['Q4_0', 'Q4_K_M', 'Q8_0', 'F16'], + quantization: ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'], device: getDefaultSweepDevices(), - 'ctx-size': DEFAULT_SWEEP_CTX_SIZES.map(String), - threads: ['2', '4', '8'], - 'batch-size': DEFAULT_SWEEP_BATCH_SIZES.map(String), // max: 10k - 'ubatch-size': ['128', '512'], // must be <= batch-size - 'flash-attn': ['off', 'on'], - 'cache-type-k': ['f16', 'q8_0', 'q4_0'], - 'cache-type-v': ['f16', 'q8_0', 'q4_0'] + 'ctx-size': ['2048'], + threads: ['4'], + 'batch-size': ['512'], + 'ubatch-size': ['512'], + 'flash-attn': ['off'], + 'cache-type-k': ['f16'], + 'cache-type-v': ['f16'], + 'reasoning-budget': ['-1', '0'] // verbosity: fixed at '0' (not swept) } diff --git a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js index c65f81c658..c312b4f88c 100644 --- a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js +++ b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js @@ -91,23 +91,27 @@ async function main () { runStartedAt = new Date().toISOString() } + const writeProgressFile = () => { + try { + fs.writeFileSync(progressFile, JSON.stringify({ + startedAt: runStartedAt, + sweepFingerprint, + completedCases: Array.from(completedCases) + }, null, 2)) + } catch (writeError) { + if (debugEnabled) { + debugLogger.warn(`Failed to write progress: ${writeError.message || String(writeError)}`) + } + } + } + let saveProgressTimeout = null const saveProgress = () => { if (saveProgressTimeout) { clearTimeout(saveProgressTimeout) } saveProgressTimeout = setTimeout(() => { - try { - 
fs.writeFileSync(progressFile, JSON.stringify({ - startedAt: runStartedAt, - sweepFingerprint, - completedCases: Array.from(completedCases) - }, null, 2)) - } catch (writeError) { - if (debugEnabled) { - debugLogger.warn(`Failed to save progress: ${writeError.message || String(writeError)}`) - } - } + writeProgressFile() saveProgressTimeout = null }, 1000) } @@ -117,17 +121,7 @@ async function main () { clearTimeout(saveProgressTimeout) saveProgressTimeout = null } - try { - fs.writeFileSync(progressFile, JSON.stringify({ - startedAt: runStartedAt, - sweepFingerprint, - completedCases: Array.from(completedCases) - }, null, 2)) - } catch (writeError) { - if (debugEnabled) { - debugLogger.warn(`Failed to flush progress: ${writeError.message || String(writeError)}`) - } - } + writeProgressFile() } moduleFlushProgress = flushProgress @@ -331,7 +325,8 @@ async function main () { const caseMetricSamples = { runMs: [], ttftMs: [], - tps: [] + tps: [], + ppTps: [] } let firstPromptTokens = null let firstGeneratedTokens = null @@ -343,6 +338,15 @@ async function main () { let firstOutput = null let promptError = null + // Warmup run (discarded) so the first measured repeat isn't skewed by + // cold-start graph build / GPU kernel warmup. Without it the first run + // is a large outlier that makes the TTFT/ppTPS mean ± stddev + // meaningless. Mirrors the mobile runner's warmup. + try { + const warmup = await model.run(prompt.messages) + await warmup.onUpdate(() => {}).await() + } catch (_) { /* measured runs below surface any real error */ } + for (let repeat = 1; repeat <= repeats; repeat++) { try { const runStart = process.hrtime() @@ -366,6 +370,7 @@ async function main () { unloadMs: null, // Will unload after all prompts ttftMs: round(ttftMs, 3), tps: round(stats.TPS != null ? stats.TPS : null, 3), + ppTps: round(stats.ppTPS != null ? stats.ppTPS : null, 3), promptTokens: stats.promptTokens ?? null, generatedTokens: stats.generatedTokens ?? 
null } @@ -374,6 +379,7 @@ async function main () { caseMetricSamples.runMs.push(metrics.runMs) if (metrics.ttftMs != null) caseMetricSamples.ttftMs.push(metrics.ttftMs) if (metrics.tps != null) caseMetricSamples.tps.push(metrics.tps) + if (metrics.ppTps != null) caseMetricSamples.ppTps.push(metrics.ppTps) if (firstPromptTokens == null && metrics.promptTokens != null) firstPromptTokens = metrics.promptTokens if (firstGeneratedTokens == null && metrics.generatedTokens != null) firstGeneratedTokens = metrics.generatedTokens caseRepeatsAttempted += 1 @@ -537,6 +543,8 @@ async function main () { ttftMsStd: round(stddev(caseMetricSamples.ttftMs), 3), tpsMean: round(average(caseMetricSamples.tps), 3), tpsStd: round(stddev(caseMetricSamples.tps), 3), + ppTpsMean: round(average(caseMetricSamples.ppTps), 3), + ppTpsStd: round(stddev(caseMetricSamples.ppTps), 3), promptTokens: firstPromptTokens, generatedTokens: firstGeneratedTokens } diff --git a/packages/llm-llamacpp/benchmarks/performance/models.manifest.json b/packages/llm-llamacpp/benchmarks/performance/models.manifest.json index ceed7a7dd9..a3f521d212 100644 --- a/packages/llm-llamacpp/benchmarks/performance/models.manifest.json +++ b/packages/llm-llamacpp/benchmarks/performance/models.manifest.json @@ -14,15 +14,19 @@ } }, { - "id": "qwen3-4b", + "id": "qwen3.5-0.8b", "gguf": { - "repo": "unsloth/Qwen3-4B-GGUF", + "repo": "unsloth/Qwen3.5-0.8B-GGUF", "revision": "main", - "quantizations": ["Q4_0", "Q4_K_M", "Q8_0", "F16"] - }, - "pytorch": { - "repo": "Qwen/Qwen3-4B", - "revision": "main" + "quantizations": ["Q4_0", "Q4_1", "Q4_K_M", "Q6_K", "Q8_0"] + } + }, + { + "id": "qwen3.5-2b", + "gguf": { + "repo": "unsloth/Qwen3.5-2B-GGUF", + "revision": "main", + "quantizations": ["Q4_0", "Q4_1", "Q4_K_M", "Q6_K", "Q8_0"] } } ] diff --git a/packages/llm-llamacpp/benchmarks/performance/prepare-models.js b/packages/llm-llamacpp/benchmarks/performance/prepare-models.js index 9cc56b5365..1dfb70396d 100644 --- 
a/packages/llm-llamacpp/benchmarks/performance/prepare-models.js +++ b/packages/llm-llamacpp/benchmarks/performance/prepare-models.js @@ -160,6 +160,39 @@ function downloadFile (url, destination, headers, redirects = 5) { }) } +// Transient-error handling mirrors the addon's integration-test downloader +// (test/integration/utils.js). That helper is Bare-only (bare-https) and can't +// be imported into this Node script, so the semantics are duplicated here +// rather than diverged: same error set, same exponential backoff with jitter. +const TRANSIENT_ERROR_CODES = new Set([ + 'EAI_NODATA', 'EAI_AGAIN', 'ENOTFOUND', 'ETIMEDOUT', + 'ECONNRESET', 'EPIPE', 'ECONNABORTED', 'ESIZE' +]) + +function isTransientError (err) { + if (err && err.code && TRANSIENT_ERROR_CODES.has(err.code)) return true + // downloadFile reports HTTP failures with a numeric `code` (e.g. 500). + const status = (err && err.statusCode) || (err && typeof err.code === 'number' ? err.code : null) + if (status) return status === 408 || status === 429 || status >= 500 + return false +} + +// Retry transient network/HTTP failures so a single HuggingFace blip doesn't +// abort the whole benchmark. 404 and other client errors are re-thrown +// immediately — the caller handles 404 via its filename-candidate fallback. 
+async function downloadFileWithRetry (url, destination, headers, retries = 3) { + for (let attempt = 0; ; attempt++) { + try { + return await downloadFile(url, destination, headers) + } catch (error) { + if (!isTransientError(error) || attempt >= retries) throw error + const delay = Math.min(1000 * Math.pow(2, attempt) + Math.random() * 500, 30_000) + console.log(`[addon] download attempt ${attempt + 1}/${retries + 1} failed (${(error && (error.code || error.message)) || error}), retrying in ${Math.round(delay)}ms...`) + await new Promise((resolve) => setTimeout(resolve, delay)) + } + } +} + async function listRepoGgufFiles (repo, revision, headers) { const encodedRepo = String(repo) .split('/') @@ -259,7 +292,7 @@ async function prepareAddonModels (selectedModels, modelsDir, headers, baseDir) const url = `https://huggingface.co/${repo}/resolve/${revision}/${candidateFilename}` console.log(`[addon] downloading ${modelId}:${quantization} from ${url}`) try { - await downloadFile(url, candidateDestination, headers) + await downloadFileWithRetry(url, candidateDestination, headers) selectedFilename = candidateFilename destination = candidateDestination break @@ -284,7 +317,7 @@ async function prepareAddonModels (selectedModels, modelsDir, headers, baseDir) } else { console.log(`[addon] downloading ${modelId}:${quantization} from resolved filename ${selectedFilename}`) } - await downloadFile(url, destination, headers) + await downloadFileWithRetry(url, destination, headers) } } diff --git a/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js b/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js index 0a70fe49d1..50e1340c7b 100644 --- a/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js +++ b/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js @@ -153,6 +153,10 @@ async function tuneToBudget (model, templateMessages, budget) { } } +// The 'long' prompt is the focused ~512-token benchmark prompt (verified +// against the 
Qwen3.5 tokenizer). Kept in sync with the committed +// test-prompts.json and the PROMPT constant in +// test/integration/_benchmark-perf.js so desktop and mobile measure the same input. function basePrompts () { return [ { @@ -161,11 +165,7 @@ function basePrompts () { { role: 'system', content: 'You are a helpful assistant.' }, { role: 'user', - content: ( - 'You are reviewing an incident report. Write a detailed narrative with sections for timeline, ' + - 'root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, ' + - 'include concrete checkpoints, and avoid bullet points unless needed for clarity. ' - ).repeat(15) + content: 'Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. 
Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience.' 
} ] } diff --git a/packages/llm-llamacpp/benchmarks/performance/render-report.js b/packages/llm-llamacpp/benchmarks/performance/render-report.js new file mode 100644 index 0000000000..4470fe61ba --- /dev/null +++ b/packages/llm-llamacpp/benchmarks/performance/render-report.js @@ -0,0 +1,765 @@ +#!/usr/bin/env node +'use strict' + +// Unified benchmark report renderer for the Qwen3.5 perf benchmark. +// +// Reads perf JSON from --dir (recursively) and renders ONE markdown report: +// - header with addon version, prompt size, runs-per-config, GPU +// - one table per device: Config | TTFT (ms) | TPS | ppTPS | Tokens +// - optional Δ columns when --compare-dir is provided (cross-run regression) +// - a closing "best config per device" summary +// +// Two input schemas are normalised: +// desktop sweep: { models:[{modelId, cases:[{quantization, runtimeConfig, +// metrics:{ttftMsMean,tpsMean,ppTpsMean,promptTokens, +// generatedTokens}, status, isBaseline}]}], repeats, ... } +// mobile report: { addon, device:{name}, results:[{test, metrics:{ttft_ms, +// tps, pp_tps, generated_tokens, prompt_tokens}}] } + +const fs = require('fs') +const path = require('path') +const { matrix, mobileShardKey, SIZES, QUANTS, CACHE_TYPES } = require('../../test/integration/_benchmark-matrix.js') + +function parseArgs (argv) { + const a = { + dir: null, + output: null, + html: null, + chartsUrl: null, + desktopDevice: 'Desktop (linux-x64 GPU)', + addonVersion: null, + compareDir: null, + baselineRunId: null, + baselineRunNumber: null, + baselineRunUrl: null + } + for (let i = 2; i < argv.length; i++) { + const t = argv[i] + if (t === '--dir') a.dir = argv[++i] + else if (t === '--output') a.output = argv[++i] + else if (t === '--html') a.html = argv[++i] + else if (t === '--charts-url') a.chartsUrl = argv[++i] + else if (t === '--desktop-device') a.desktopDevice = argv[++i] + else if (t === '--addon-version') a.addonVersion = argv[++i] + else if (t === '--compare-dir') a.compareDir = 
argv[++i] + else if (t === '--baseline-run-id') a.baselineRunId = argv[++i] + else if (t === '--baseline-run-number') a.baselineRunNumber = argv[++i] + else if (t === '--baseline-run-url') a.baselineRunUrl = argv[++i] + } + if (!a.dir) { + throw new Error( + 'usage: render-report.js --dir [--output ] ' + + '[--desktop-device ] [--addon-version ] [--compare-dir ]' + ) + } + return a +} + +function walkJson (dir) { + const out = [] + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + const p = path.join(dir, entry.name) + if (entry.isDirectory()) out.push(...walkJson(p)) + else if (entry.name.endsWith('.json')) out.push(p) + } + return out +} + +function num (v) { + return typeof v === 'number' && Number.isFinite(v) ? v : null +} + +function int (v) { + const n = num(v) + return n !== null ? Math.round(n) : null +} + +// Collect metadata and rows from all files in a directory. +// Returns { rows, meta } where meta = { addonVersion, repeats, promptTokens }. +function loadDir (dir, desktopDevice) { + const files = walkJson(dir) + // The desktop device name (incl. the detected GPU) is stamped into + // desktop-meta.json at run time, so re-renders show the real GPU even though + // the desktop job didn't run. Falls back to the passed/default name. + let resolvedDesktop = desktopDevice + for (const f of files) { + try { + const d = JSON.parse(fs.readFileSync(f, 'utf8')) + if (d && typeof d.desktopDevice === 'string' && d.desktopDevice) { resolvedDesktop = d.desktopDevice; break } + } catch {} + } + const meta = { addonVersion: null, repeats: null, promptTokens: null, expectedShards: null } + let rows = [] + for (const f of files) { + const r = rowsFromFile(f, resolvedDesktop, meta) + rows.push(...r) + } + rows = aggregate(rows) + return { rows, meta, desktopDevice: resolvedDesktop } +} + +// Normalise any report file into rows: { device, config, ttft, tps, ppTps, tokens, crashed } +// Also fills in meta fields when found. 
/**
 * Parse one JSON file and convert it into renderer rows.
 *
 * Recognised documents:
 *   - run-meta.json      (has a string `addonVersion`)   -> fills `meta`, no rows
 *   - desktop-meta.json  (has a string `desktopDevice`)  -> no rows
 *   - desktop sweep      (`models[].cases[]`)            -> one pre-aggregated row per case
 *   - mobile perf report (`device` + `results[]`)        -> one row per result entry
 *
 * Anything else (unreadable file, invalid JSON, JSON that is not an object,
 * unknown schema) yields an empty array instead of throwing, because the
 * renderer walks every *.json file found in the artifact directory.
 *
 * @param {string} file           Path of the JSON file to read.
 * @param {string} desktopDevice  Device label to stamp on desktop sweep rows.
 * @param {object} meta           Mutable accumulator; `addonVersion`,
 *                                `expectedShards`, `repeats` and `promptTokens`
 *                                are set only while they are still null.
 * @returns {object[]} Rows of shape { device, config, ttft, tps, ppTps, tokens, crashed, ... }.
 */
function rowsFromFile (file, desktopDevice, meta) {
  let doc
  try { doc = JSON.parse(fs.readFileSync(file, 'utf8')) } catch { return [] }
  // JSON.parse returns null / numbers / strings for valid JSON that is not an
  // object; none of the schemas below can match, and `doc.models` would throw
  // on null.
  if (doc === null || typeof doc !== 'object') return []
  const rows = []

  // run-meta.json — the addon version and the expected mobile shard list,
  // stamped into the run's artifacts at benchmark time. Both come from here so a
  // re-render always reflects what THAT run targeted: the version label stays
  // correct after the code moves on, and coverage compares against the run's own
  // matrix rather than the renderer's current one (which may have since grown).
  if (typeof doc.addonVersion === 'string') {
    if (meta.addonVersion === null) meta.addonVersion = doc.addonVersion
    if (meta.expectedShards === null && Array.isArray(doc.expectedShards)) meta.expectedShards = doc.expectedShards
    return rows
  }

  // desktop-meta.json — the desktop device name (resolved in loadDir's first
  // pass); nothing to render from it here.
  if (typeof doc.desktopDevice === 'string') return rows

  // Desktop sweep schema
  const models = doc.models
  if (Array.isArray(models) && models.length && models[0] && Array.isArray(models[0].cases)) {
    if (num(doc.repeats) !== null && meta.repeats === null) meta.repeats = doc.repeats
    for (const model of models) {
      // Schema detection only looked at models[0]; tolerate later entries
      // that are malformed rather than aborting the whole report.
      if (!model || !Array.isArray(model.cases)) continue
      for (const c of model.cases) {
        if (!c || c.isBaseline) continue
        const rc = c.runtimeConfig || {}
        const config = configLabel({
          model: `${model.modelId}-${c.quantization}`,
          backend: rc.device,
          rb: rc['reasoning-budget'],
          ck: rc['cache-type-k'],
          cv: rc['cache-type-v']
        })
        const m = c.metrics || {}
        if (int(m.promptTokens) !== null && meta.promptTokens === null) {
          meta.promptTokens = int(m.promptTokens)
        }
        // 'partial-failure' still carries usable aggregated metrics.
        const crashed = c.status && c.status !== 'ok' && c.status !== 'partial-failure'
        rows.push({
          device: desktopDevice,
          config,
          ttft: num(m.ttftMsMean),
          ttftStd: num(m.ttftMsStd),
          tps: num(m.tpsMean),
          tpsStd: num(m.tpsStd),
          ppTps: num(m.ppTpsMean),
          ppTpsStd: num(m.ppTpsStd),
          tokens: int(m.generatedTokens),
          crashed: !!crashed,
          preAggregated: true,
          sampleCount: int(m.repeats)
        })
      }
    }
    return rows
  }

  // Mobile perf-report schema
  if (doc.device && Array.isArray(doc.results)) {
    // doc.addon is the addon NAME ("llamacpp-llm"), not a version — never use
    // it as the version label. The real version comes from run-meta.json.
    const device = String(doc.device.name || 'unknown').trim()
    for (const r of doc.results) {
      if (!r) continue
      const m = r.metrics || {}
      if (int(m.prompt_tokens) !== null && meta.promptTokens === null) {
        meta.promptTokens = int(m.prompt_tokens)
      }
      // A result counts as crashed when it says so, or when it carries none
      // of the three headline metrics.
      const crashed = (r.status && String(r.status).toLowerCase() === 'crashed') ||
        (num(m.ttft_ms) === null && num(m.tps) === null && num(m.pp_tps) === null)
      rows.push({
        device,
        config: r.test || '(unknown)',
        ttft: num(m.ttft_ms),
        tps: num(m.tps),
        ppTps: num(m.pp_tps),
        tokens: int(m.generated_tokens),
        crashed: !!crashed
      })
    }
    return rows
  }

  return rows
}

/**
 * Build the bracketed config label used as a row key, e.g.
 * "[model-Q4_0] [gpu] [rb=0] [kv=f16]". Segments whose value is absent are
 * omitted; differing K/V cache types render as "[kv=k/v]" with "?" for a
 * missing side.
 */
function configLabel ({ model, backend, rb, ck, cv }) {
  const parts = [`[${model}]`]
  if (backend) parts.push(`[${backend}]`)
  if (rb !== undefined && rb !== null && rb !== '') parts.push(`[rb=${rb}]`)
  if (ck || cv) parts.push(ck === cv ? `[kv=${ck}]` : `[kv=${ck || '?'}/${cv || '?'}]`)
  return parts.join(' ')
}

/**
 * Format a number with a fixed number of decimals; missing values
 * (null, undefined, NaN) render as "-" instead of the text "NaN".
 */
function fmt (v, decimals = 2) {
  if (v === null || v === undefined || Number.isNaN(v)) return '-'
  return (Math.round(v * Math.pow(10, decimals)) / Math.pow(10, decimals)).toFixed(decimals)
}

/**
 * Format a delta with an explicit leading "+" for non-negative values;
 * missing values (null, undefined, NaN) render as "-".
 */
function fmtDelta (v) {
  if (v === null || v === undefined || Number.isNaN(v)) return '-'
  const sign = v >= 0 ? '+' : ''
  return `${sign}${fmt(v)}`
}

// "mean ± std" when there is more than one sample; bare mean otherwise.
/**
 * Render a metric cell: "mean ± std" when a spread is available and more
 * than one sample backs it, otherwise just the mean. A missing mean is "-".
 */
function fmtMS (meanV, stdV, sampleCount) {
  if (meanV == null) return '-'
  const center = fmt(meanV)
  const hasSpread = stdV != null && Boolean(sampleCount) && sampleCount > 1
  return hasSpread ? `${center} ± ${fmt(stdV)}` : center
}

/** Arithmetic mean of a numeric array; null for an empty array. */
function mean (values) {
  const count = values.length
  if (count === 0) return null
  let total = 0
  for (let i = 0; i < count; i++) total = total + values[i]
  return total / count
}

/**
 * Population standard deviation — matches the desktop sweep's math.js stddev.
 * Returns null for an empty array and 0 for a single value.
 */
function stddev (values) {
  const count = values.length
  if (count === 0) return null
  if (count === 1) return 0
  const center = mean(values)
  const squaredSum = values.reduce((acc, value) => {
    const diff = value - center
    return acc + diff * diff
  }, 0)
  return Math.sqrt(squaredSum / count)
}

// Group raw rows by (device, config) and reduce each group to a single row
// carrying mean + stddev per metric. Desktop rows arrive pre-aggregated (they
// already hold *Std fields from the 5-repeat sweep); mobile rows arrive as one
// row per repetition and are aggregated here across the non-crashed samples.
/**
 * Collapse raw rows into one row per (device, config), in first-seen order.
 *
 * Per group:
 *   - a non-crashed pre-aggregated row is passed through untouched;
 *   - a group holding only crashed pre-aggregated rows becomes a crashed placeholder;
 *   - a group with no non-crashed rows becomes a crashed placeholder;
 *   - otherwise the non-crashed rows are reduced to mean + stddev per metric,
 *     with `sampleCount` set to the number of non-crashed rows.
 */
function aggregate (rows) {
  const groups = new Map()
  rows.forEach((row) => {
    const key = `${row.device}@@${row.config}`
    const bucket = groups.get(key)
    if (bucket) bucket.push(row)
    else groups.set(key, [row])
  })

  const crashedRow = (device, config) => ({ device, config, crashed: true, tokens: null })
  const samplesOf = (list, field) => list.map(item => item[field]).filter(v => v !== null)

  return Array.from(groups.values(), (bucket) => {
    const { device, config } = bucket[0]

    const readyMade = bucket.find(item => item.preAggregated && !item.crashed)
    if (readyMade) return readyMade
    if (bucket.some(item => item.preAggregated)) return crashedRow(device, config)

    const healthy = bucket.filter(item => !item.crashed)
    if (healthy.length === 0) return crashedRow(device, config)

    const ttftSamples = samplesOf(healthy, 'ttft')
    const tpsSamples = samplesOf(healthy, 'tps')
    const ppSamples = samplesOf(healthy, 'ppTps')
    return {
      device,
      config,
      crashed: false,
      ttft: mean(ttftSamples),
      ttftStd: stddev(ttftSamples),
      tps: mean(tpsSamples),
      tpsStd: stddev(tpsSamples),
      ppTps: mean(ppSamples),
      ppTpsStd: stddev(ppSamples),
      tokens: healthy.find(item => item.tokens !== null)?.tokens ?? null,
      sampleCount: healthy.length
    }
  })
}

/** Index baseline rows by "device@@config"; a later duplicate replaces an earlier one. */
function buildBaselineMap (baseRows) {
  return new Map(baseRows.map(row => [`${row.device}@@${row.config}`, row]))
}

// Largest mobile sample count across a run's aggregated rows (the mobile
// repetition count, read from the data rather than hard-coded).
function mobileRepeats (rows, desktopDevice) {
  const counts = []
  for (const row of rows) {
    if (row.device === desktopDevice || row.crashed || !row.sampleCount) continue
    counts.push(row.sampleCount)
  }
  if (counts.length === 0) return null
  return counts.reduce((highest, n) => Math.max(highest, n), -Infinity)
}

// Build the "Addon: X · Prompt: Y · Repeats: ..." metadata line for a run.
// meta: run metadata carrying promptTokens and repeats (either may be unset).
// addonVersion: version string to show, or a falsy value to omit it.
// hasDesktopRows: whether the run has any desktop rows; the desktop repeat
//   count is only shown when it does.
// mobileReps: mobile repetition count, or null when unknown.
// Returns the parts joined with ' · ', or '' when there is nothing to show.
function metaLine (meta, addonVersion, hasDesktopRows, mobileReps) {
  const parts = []
  if (addonVersion) parts.push(`**Addon:** \`${addonVersion}\``)
  // `!= null` also rejects undefined, so a missing field never prints as
  // "undefined tokens".
  if (meta.promptTokens != null) parts.push(`**Prompt:** ${meta.promptTokens} tokens`)
  const reps = []
  if (hasDesktopRows && meta.repeats != null) reps.push(`desktop=${meta.repeats}`)
  if (mobileReps != null) reps.push(`mobile=${mobileReps}`)
  // Emit the label only when there is a count to show: a run with a stamped
  // desktop repeat count but no desktop rows and no mobile count would
  // otherwise render a dangling "**Repeats:** ".
  if (reps.length) parts.push(`**Repeats:** ${reps.join(', ')}`)
  return parts.join(' · ')
}

// Shard key for a mobile row, parsed from its "[model] [gpu|cpu] [rb=..]
// [kv=type]" label, to match _benchmark-matrix.js mobileShardKey. Returns
// "model|kv", or null when the label has no leading [model] or no [kv=...].
function shardKeyOf (config) {
  const model = /^\[([^\]]+)\]/.exec(config)
  const kv = /\[kv=([^\]]+)\]/.exec(config)
  return model && kv ? `${model[1]}|${kv[1]}` : null
}

// Human-readable form of a "model|kv" shard key.
function shardLabel (key) {
  const [model, kv] = key.split('|')
  return `${model} [kv=${kv}]`
}

// Per-device coverage of the mobile shard matrix. Every shard that runs emits
// at least a Crashed placeholder row, so a shard with no row at all never ran
// or its data was lost (e.g. a dropped KV-cache batch artifact). Surfacing this
// keeps a partial run from rendering as a complete-looking report.
// Returns the markdown lines of the "## Coverage" section.
function coverageLines (rows, desktopDevice, devices, expectedShards) {
  // Prefer the shard list THIS run stamped into its run-meta; fall back to the
  // renderer's current matrix only for runs predating the stamp. This keeps a
  // re-render of an older run from being scored against a matrix that has since
  // grown (e.g. an old 30-shard run reading 30/70 against today's 70).
  const expected = expectedShards || matrix().map(mobileShardKey)
  const expectedSet = new Set(expected)
  const mobileDevices = devices.filter(d => d !== desktopDevice)
  if (!mobileDevices.length) {
    return [
      '## Coverage',
      '',
      `**Warning: 0 mobile devices reported.** ${expected.length} shards expected per device. ` +
        'If mobile was enabled for this run, its data was lost (failed job or dropped artifacts).',
      ''
    ]
  }

  // Record which expected shards each mobile device reported, and which were
  // reported by at least one device.
  const seenByDevice = new Map(mobileDevices.map(d => [d, new Set()]))
  const seenAll = new Set()
  for (const r of rows) {
    if (r.device === desktopDevice) continue
    const k = shardKeyOf(r.config)
    if (!k || !expectedSet.has(k) || !seenByDevice.has(r.device)) continue
    seenByDevice.get(r.device).add(k)
    seenAll.add(k)
  }

  // Only show the dimension breakdown when the expected set came from the live
  // matrix; a stamped older run may have different dimensions than today's code.
  const dims = expectedShards ? '' : ` (${SIZES.length} sizes x ${QUANTS.length} quants x ${CACHE_TYPES.length} KV-cache types)`
  const lines = ['## Coverage', '']
  lines.push(
    `Mobile matrix: ${expected.length} shards expected per device${dims}. ` +
    `${mobileDevices.length} device(s) reported.`
  )
  lines.push('')
  lines.push('| Device | Shards reported |')
  lines.push('| --- | ---: |')
  for (const d of mobileDevices) lines.push(`| ${d} | ${seenByDevice.get(d).size} / ${expected.length} |`)
  lines.push('')

  // Shards nobody reported are listed once; shards only some devices reported
  // are listed per device that lacks them.
  const missingEverywhere = expected.filter(k => !seenAll.has(k))
  if (missingEverywhere.length) {
    lines.push(`**${missingEverywhere.length} shard(s) produced no data on any device** (likely a dropped batch):`)
    for (const k of missingEverywhere) lines.push(`- ${shardLabel(k)}`)
    lines.push('')
  }
  for (const d of mobileDevices) {
    const miss = expected.filter(k => seenAll.has(k) && !seenByDevice.get(d).has(k))
    if (miss.length) {
      lines.push(`**${d}** is missing ${miss.length} shard(s) other devices reported:`)
      for (const k of miss) lines.push(`- ${shardLabel(k)}`)
      lines.push('')
    }
  }
  return lines
}

// Strip a leading vendor prefix so device names fit on a chart axis.
function shortDevice (name) {
  return name.replace(/^Apple /, '').replace(/^Samsung Galaxy /, '').replace(/^Google /, '')
}

// Lines of a fenced mermaid xychart-beta bar chart. The y-axis runs from 0 to
// 15% above the largest value (never below 1); values are rounded to one decimal.
function mermaidBar (title, ylabel, labels, values) {
  const max = Math.ceil(Math.max(...values, 1) * 1.15)
  return [
    '```mermaid',
    'xychart-beta',
    ` title "${title}"`,
    ` x-axis [${labels.map(l => `"${l}"`).join(', ')}]`,
    ` y-axis "${ylabel}" 0 --> ${max}`,
    ` bar [${values.map(v => Math.round(v * 10) / 10).join(', ')}]`,
    '```'
  ]
}

// One inline at-a-glance bar chart: decode TPS per device at a SINGLE fixed
// configuration — no averaging across backends, sizes or budgets, so every bar
// is one real measured number. xychart-beta is single-series and cannot draw
// error bars, so the per-backend breakdowns by KV-cache type / quantization,
// with 3-rep stddev whiskers, live in the HTML chart artifact.
// Returns [] when fewer than two data points exist at the fixed configuration.
function mermaidSection (rows, desktopDevice, chartsUrl) {
  const held = { backend: 'gpu', rb: CHART_RB, size: CHART_SIZE, quant: CHART_QUANT_HELD, kv: CHART_KV_DEFAULT }
  const pts = atConfig(rows, held).filter(r => r.device !== desktopDevice && !r.crashed && r.tps !== null)
  if (pts.length < 2) return []
  const byDevice = new Map()
  for (const r of pts) if (!byDevice.has(r.device)) byDevice.set(r.device, r.tps)
  // Fastest device first.
  const devices = [...byDevice.keys()].sort((a, b) => byDevice.get(b) - byDevice.get(a))
  const cfg = `Qwen3.5-${CHART_SIZE.toUpperCase()}, ${CHART_QUANT_HELD}, KV ${CHART_KV_DEFAULT}, reasoning on, GPU`
  // The download URL only exists after the artifact is uploaded, so the workflow
  // passes it in post-upload; a local render leaves the artifact name as plain text.
  const artifact = chartsUrl
    ? `[**qwen35-benchmark-findings** artifact](${chartsUrl})`
    : '**qwen35-benchmark-findings** artifact'
  return [
    '## Charts',
    '',
    `> At-a-glance TPS by device at one fixed config: **${cfg}**. ` +
      'Per-backend charts broken down by KV-cache type and quantization, with ±1 stddev over 3 reps, ' +
      `are in the ${artifact} — download and open \`qwen35-benchmark-charts.html\` inside. ` +
      'The full matrix and all sizes are in the tables below.',
    '',
    ...mermaidBar(`TPS by device (${cfg})`, 'TPS', devices.map(shortDevice), devices.map(d => byDevice.get(d))),
    ''
  ]
}

// Render the full markdown report: metadata, optional baseline header, legend,
// coverage, the at-a-glance chart, one results table per device (with Δ columns
// when baselineMap is non-null) and the best-configuration summary.
// Returns the report as a single newline-terminated string.
function render (rows, desktopDevice, meta, addonVersionArg, baselineMap, baseline, chartsUrl) {
  const byDevice = new Map()
  for (const r of rows) {
    if (!byDevice.has(r.device)) byDevice.set(r.device, [])
    byDevice.get(r.device).push(r)
  }
  // Desktop first, then the remaining devices alphabetically.
  const devices = [...byDevice.keys()].sort((a, b) => {
    if (a === desktopDevice) return -1
    if (b === desktopDevice) return 1
    return a.localeCompare(b)
  })

  // Stamped version (from run-meta) wins over any manually-passed value.
  const addonVersion = meta.addonVersion || addonVersionArg || null
  const comparing = baselineMap !== null

  const lines = []
  lines.push('# Qwen3.5 Benchmark Results')
  lines.push('')

  // Current-run metadata block
  const hasDesktopRows = rows.some(r => r.device === desktopDevice)
  const curLine = metaLine(meta, addonVersion, hasDesktopRows, mobileRepeats(rows, desktopDevice))
  if (curLine) {
    lines.push(curLine)
    lines.push('')
  }

  // Baseline metadata block — same fields as the current run, plus the run
  // link. Classify its rows by the BASELINE's own desktop device name (which
  // may differ from the current run's, e.g. a different GPU).
  if (comparing && baseline) {
    const bDesktop = baseline.desktopDevice || desktopDevice
    const bHasDesktop = baseline.rows.some(r => r.device === bDesktop)
    const bAddon = baseline.meta.addonVersion || null
    const bLine = metaLine(baseline.meta, bAddon, bHasDesktop, mobileRepeats(baseline.rows, bDesktop))
    const idParts = []
    if (baseline.runNumber) idParts.push(`run #${baseline.runNumber}`)
    if (baseline.runId) idParts.push(`run ID ${baseline.runId}`)
    const heading = idParts.length ? idParts.join(', ') : 'previous run'
    lines.push(`> **Comparing against baseline (${heading}):**`)
    if (bLine) lines.push('> ' + bLine)
    if (baseline.runUrl) {
      lines.push(`> [View baseline run](${baseline.runUrl})`)
    }
    lines.push('')
  }

  lines.push(
    'Metrics are addon `runtimeStats`: ' +
    'TTFT = time to first token (ms), TPS = decode tokens/sec, ' +
    'ppTPS = prefill tokens/sec, Tokens = generated tokens.' +
    (comparing ? ' Δ = current minus baseline (positive = improvement for TPS/ppTPS, negative = improvement for TTFT).' : '') +
    ' `Crashed` = configuration crashed or produced no output.'
  )
  lines.push('')
  lines.push(
    'Config labels read `[model] [gpu|cpu] [rb=N] [kv=type]`, where `rb` is the ' +
    'reasoning budget (-1 leaves the model\'s reasoning channel on, 0 disables it) ' +
    'and `kv` is the KV-cache type.'
  )
  lines.push('')

  for (const l of coverageLines(rows, desktopDevice, devices, meta.expectedShards)) lines.push(l)

  for (const l of mermaidSection(rows, desktopDevice, chartsUrl)) lines.push(l)

  // The Tokens column is shown only when at least one row carries a token count.
  const hasTokens = rows.some(r => r.tokens !== null)

  // Every table line is built from a cell list. Cells go through String() so a
  // missing value prints exactly as template interpolation would, instead of
  // being blanked by Array.prototype.join.
  const tableRow = cells => `| ${cells.map(String).join(' | ')} |`
  const metrics = [['ttft', 'ttftStd'], ['tps', 'tpsStd'], ['ppTps', 'ppTpsStd']]
  const headerCells = comparing
    ? ['Config', 'TTFT (ms)', 'Δ TTFT', 'TPS', 'Δ TPS', 'ppTPS', 'Δ ppTPS']
    : ['Config', 'TTFT (ms)', 'TPS', 'ppTPS']
  if (hasTokens) headerCells.push('Tokens')
  const header = tableRow(headerCells)
  const separator = tableRow(['---', ...headerCells.slice(1).map(() => '---:')])

  for (const device of devices) {
    const items = byDevice.get(device).slice().sort((a, b) => a.config.localeCompare(b.config))
    lines.push(`## ${device}`)
    lines.push('')

    if (comparing && !items.some(r => baselineMap.has(`${r.device}@@${r.config}`))) {
      lines.push('> No baseline data for this device (baseline ran on different hardware).', '')
    }

    lines.push(header)
    lines.push(separator)
    for (const r of items) {
      const cells = [r.config]
      if (r.crashed) {
        for (let i = 0; i < metrics.length; i++) {
          cells.push('Crashed')
          if (comparing) cells.push('-')
        }
        if (hasTokens) cells.push('-')
      } else {
        const b = comparing ? baselineMap.get(`${r.device}@@${r.config}`) : undefined
        for (const [metric, stdKey] of metrics) {
          cells.push(fmtMS(r[metric], r[stdKey], r.sampleCount))
          if (comparing) {
            // A delta needs a non-crashed baseline row with both values present.
            const delta = (b && !b.crashed && r[metric] !== null && b[metric] !== null)
              ? r[metric] - b[metric]
              : null
            cells.push(fmtDelta(delta))
          }
        }
        if (hasTokens) cells.push(r.tokens !== null ? r.tokens : '-')
      }
      lines.push(tableRow(cells))
    }
    lines.push('')
  }

  lines.push('## Best configuration per device')
  lines.push('')
  lines.push('| Device | Highest TPS | Highest ppTPS |')
  lines.push('| --- | --- | --- |')
  for (const device of devices) {
    const ok = byDevice.get(device).filter(r => !r.crashed)
    const bestTps = ok.filter(r => r.tps !== null).sort((a, b) => b.tps - a.tps)[0]
    const bestPp = ok.filter(r => r.ppTps !== null).sort((a, b) => b.ppTps - a.ppTps)[0]
    const tpsCell = bestTps ? `${bestTps.config} — ${fmt(bestTps.tps)}` : '-'
    const ppCell = bestPp ? `${bestPp.config} — ${fmt(bestPp.ppTps)}` : '-'
    lines.push(`| ${device} | ${tpsCell} | ${ppCell} |`)
  }
  lines.push('')
  return lines.join('\n') + '\n'
}

// ── Visual HTML report: self-contained inline SVG bar charts, no deps or CDN ──
const CHART_COLORS = ['#2563eb', '#dc2626', '#16a34a', '#d97706', '#7c3aed', '#0891b2', '#db2777']

// The row* helpers below each pull one axis value out of a config label and
// return null when the label does not carry that axis.

// Weight quantization: the segment after the size in a leading "[qwen<ver>-<size>-<quant>]".
function rowQuant (config) {
  const m = /^\[qwen[\d.]+-[^-\]]+-([^\]]+)\]/i.exec(config)
  return m ? m[1] : null
}

// KV-cache type, from "[kv=...]".
function rowKv (config) {
  const m = /\[kv=([^\]]+)\]/.exec(config)
  return m ? m[1] : null
}

// Backend, from "[gpu]" or "[cpu]".
function rowBackend (config) {
  const m = /\[(gpu|cpu)\]/.exec(config)
  return m ? m[1] : null
}

// Reasoning budget, from "[rb=N]", returned as a string (e.g. "-1").
function rowRb (config) {
  const m = /\[rb=(-?\d+)\]/.exec(config)
  return m ? m[1] : null
}

// Model size: the segment after the version in a leading "[qwen<ver>-<size>-...".
function rowSize (config) {
  const m = /^\[qwen[\d.]+-([^-\]]+)-/i.exec(config)
  return m ? m[1] : null
}

// Charts hold every axis but the one on the x-axis at a single value, so each
// bar is one real measured configuration rather than an average across gpu/cpu,
// model sizes or reasoning budgets. reasoning-budget -1 is the model's default
// (reasoning channel on; 0 disables it) and KV f16 is llama.cpp's default; the
// featured size and held quant are stated in every chart.
gpu and cpu are +// charted separately and never blended. +const CHART_BACKENDS = ['gpu', 'cpu'] +const CHART_RB = '-1' +const CHART_SIZE = '2b' +const CHART_KV_DEFAULT = 'f16' +const CHART_QUANT_HELD = 'Q4_K_M' + +// Keep only rows sitting at the given fixed point of the matrix; an axis left +// undefined is the one being varied on the x-axis. +function atConfig (rows, { backend, rb, size, quant, kv }) { + return rows.filter(r => + (backend == null || rowBackend(r.config) === backend) && + (rb == null || rowRb(r.config) === rb) && + (size == null || rowSize(r.config) === size) && + (quant == null || rowQuant(r.config) === quant) && + (kv == null || rowKv(r.config) === kv) + ) +} + +// One bar per device for each category of the varied axis. The caller passes +// rows already pinned to a single point on every OTHER axis (via atConfig), so +// each (device, category) is exactly one measured config: the bar is that row's +// mean and the whisker its own measured 3-rep stddev (stdKey) — never a spread +// recomputed across blended configs. +function chartSeries (rows, desktopDevice, byKey, order, metric, stdKey) { + const pts = rows.filter(r => r.device !== desktopDevice && !r.crashed && r[metric] !== null && byKey(r.config) !== null) + const present = new Set(pts.map(r => byKey(r.config))) + const cats = order.filter(c => present.has(c)) + const devices = [...new Set(pts.map(r => r.device))].sort() + const series = devices.map((dev, i) => ({ + name: dev, + color: CHART_COLORS[i % CHART_COLORS.length], + cells: cats.map(cat => { + const row = pts.find(r => r.device === dev && byKey(r.config) === cat) + return row ? { mean: row[metric], std: row[stdKey] != null ? 
row[stdKey] : null } : null + }) + })) + return { cats, series } +} + +function svgBarChart (title, unit, cats, series, maxOverride) { + const W = 860; const H = 360 + const m = { l: 64, r: 16, t: 16, b: 70 } + const pw = W - m.l - m.r; const ph = H - m.t - m.b + let max = maxOverride || 0 + if (!maxOverride) for (const s of series) for (const c of s.cells) if (c) max = Math.max(max, c.mean + (c.std || 0)) + const niceMax = (max > 0 ? max : 1) * 1.1 + const y = v => m.t + ph - (v / niceMax) * ph + const out = [''] + for (let g = 0; g <= 4; g++) { + const v = (niceMax / 4) * g; const yy = y(v) + out.push(``) + out.push(`${fmt(v, v < 10 ? 1 : 0)}`) + } + const groupW = pw / cats.length + const barW = (groupW * 0.74) / series.length + cats.forEach((cat, ci) => { + const gx = m.l + ci * groupW + groupW * 0.13 + series.forEach((s, si) => { + const cell = s.cells[ci] + if (!cell) return + const bx = gx + si * barW + const by = y(cell.mean) + out.push(`${s.name} | ${cat}: ${fmt(cell.mean)} ${unit}`) + if (cell.std) { + const cx = (bx + barW * 0.46).toFixed(1) + out.push(``) + } + }) + out.push(`${cat}`) + }) + out.push('') + return `
${title} (${unit})
${out.join('')}
` +} + +function renderHtml (rows, desktopDevice, meta, addonVersionArg) { + const addonVersion = meta.addonVersion || addonVersionArg || '' + const mobile = rows.filter(r => r.device !== desktopDevice) + const devices = [...new Set(mobile.filter(r => !r.crashed).map(r => r.device))].sort() + const legend = devices.map((d, i) => `${d}`).join('') + const KVO = ['f16', 'q8_0', 'q4_0', 'tbq3_0/pq3_0', 'tbq4_0/pq4_0', 'pq3_0', 'pq4_0'] + const QO = ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'] + // [title, unit, byKey, order, metric, stdKey, held]. Each chart varies one axis + // and holds the rest at a fixed point (size, reasoning budget, and the other + // categorical). gpu and cpu are rendered as separate charts sharing one y-scale + // per metric so the backend gap is read correctly. + const defs = [ + ['TPS by KV-cache type', 'TPS, tokens/sec', rowKv, KVO, 'tps', 'tpsStd', { quant: CHART_QUANT_HELD }], + ['TPS by quantization', 'TPS, tokens/sec', rowQuant, QO, 'tps', 'tpsStd', { kv: CHART_KV_DEFAULT }], + ['ppTPS by KV-cache type', 'ppTPS, tokens/sec', rowKv, KVO, 'ppTps', 'ppTpsStd', { quant: CHART_QUANT_HELD }], + ['TTFT by KV-cache type', 'TTFT, ms (lower is better)', rowKv, KVO, 'ttft', 'ttftStd', { quant: CHART_QUANT_HELD }] + ] + let charts = '' + for (const [title, unit, byKey, order, metric, stdKey, extra] of defs) { + const perBackend = CHART_BACKENDS.map(backend => { + const subset = atConfig(mobile, { backend, rb: CHART_RB, size: CHART_SIZE, ...extra }) + return { backend, ...chartSeries(subset, desktopDevice, byKey, order, metric, stdKey) } + }).filter(b => b.cats.length) + if (!perBackend.length) continue + let smax = 0 + for (const b of perBackend) for (const s of b.series) for (const c of s.cells) if (c) smax = Math.max(smax, c.mean + (c.std || 0)) + for (const b of perBackend) charts += svgBarChart(`${title} — ${b.backend.toUpperCase()}`, unit, b.cats, b.series, smax) + } + const metaBits = [addonVersion && `Addon ${addonVersion}`, meta.promptTokens 
&& `Prompt ${meta.promptTokens} tok`].filter(Boolean).join(' · ') + const caption = `Each bar is one measured configuration: Qwen3.5-${CHART_SIZE.toUpperCase()}, reasoning on (rb=-1). KV-cache charts hold the weights at ${CHART_QUANT_HELD}; the quantization chart holds the KV-cache at ${CHART_KV_DEFAULT} (llama.cpp default). GPU and CPU are shown separately and never averaged; each metric's gpu/cpu pair shares one y-scale. Whiskers are ±1 stddev over 3 reps. A missing bar means that configuration crashed or is unsupported on that device. The full matrix and all model sizes are in the report tables.` + return `Qwen3.5 Benchmark Charts

Qwen3.5 Benchmark Charts

${metaBits}

${caption}

${legend}
${charts || '

No mobile data to chart.

'}` +} + +function main () { + const args = parseArgs(process.argv) + + const { rows, meta, desktopDevice } = loadDir(args.dir, args.desktopDevice) + + let baselineMap = null + let baseline = null + if (args.compareDir) { + const { rows: baseRows, meta: baseMeta, desktopDevice: baselineDesktop } = loadDir(args.compareDir, desktopDevice) + baselineMap = buildBaselineMap(baseRows) + baseline = { + rows: baseRows, + meta: baseMeta, + desktopDevice: baselineDesktop, + runId: args.baselineRunId, + runNumber: args.baselineRunNumber, + runUrl: args.baselineRunUrl + } + } + + if (rows.length === 0) { + // Metadata-only artifacts (run-meta/desktop-meta) are valid JSON but carry no + // rows; the workflow's "any JSON present" precheck cannot tell them apart from + // real results. Fail so a run that produced no benchmark data never renders as + // a green, complete-looking report. + const msg = 'No benchmark results found.\n' + if (args.output) fs.writeFileSync(args.output, msg) + else process.stdout.write(msg) + process.exitCode = 1 + return + } + + if (args.compareDir && baseline.rows.length === 0) { + // A comparison was requested (compare_run_id) but the baseline produced no + // benchmark rows (e.g. only run-meta/desktop-meta metadata was downloaded + // for it). There is nothing to compare against, so fail rather than render a + // delta-less report. This is distinct from a baseline that has rows but none + // matching the current devices, which renders a per-device note instead. 
+ const msg = `No baseline benchmark data for the requested comparison (${args.compareDir}).\n` + if (args.output) fs.writeFileSync(args.output, msg) + else process.stdout.write(msg) + process.exitCode = 1 + return + } + + const md = render(rows, desktopDevice, meta, args.addonVersion, baselineMap, baseline, args.chartsUrl) + if (args.output) fs.writeFileSync(args.output, md) + else process.stdout.write(md) + + if (args.html) fs.writeFileSync(args.html, renderHtml(rows, desktopDevice, meta, args.addonVersion)) +} + +main() diff --git a/packages/llm-llamacpp/benchmarks/performance/reporters.js b/packages/llm-llamacpp/benchmarks/performance/reporters.js index 9293b29e5c..138461bb34 100644 --- a/packages/llm-llamacpp/benchmarks/performance/reporters.js +++ b/packages/llm-llamacpp/benchmarks/performance/reporters.js @@ -13,6 +13,13 @@ function tsFileStamp () { return `${yyyy}${mm}${dd}-${hh}${mi}${ss}` } +// Baseline rows render every config column as 'default'; otherwise show the +// runtime value (stringified) or blank when unset. +function cfgCell (isBaseline, value) { + if (isBaseline) return 'default' + return value != null ? 
String(value) : '' +} + function compactPromptErrors (promptResults) { if (!Array.isArray(promptResults)) return [] const out = [] @@ -45,34 +52,33 @@ function toMarkdown (report) { lines.push('') for (const model of report.models) { lines.push(`## Model: ${model.modelId}`) - lines.push('| Quantization | Device | Ctx Size | Batch Size | Ubatch Size | Flash Attn | Threads | Cache K | Cache V | Prompt Case | Status | Load Mean | Load Std | Run Mean | Run Std | TTFT Mean | TTFT Std | TPS Mean | TPS Std | Unload Mean | Unload Std | Prompt Tokens | Generated Tokens | Quality Match | Error |') - lines.push('|---|---|---:|---:|---:|---|---:|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|') + lines.push('| Quantization | Reasoning Budget | Device | Ctx Size | Batch Size | Ubatch Size | Flash Attn | Threads | Cache K | Cache V | Prompt Case | Status | TTFT Mean | TTFT Std | TPS Mean | TPS Std | ppTPS Mean | ppTPS Std | Load Mean | Load Std | Run Mean | Run Std | Unload Mean | Unload Std | Prompt Tokens | Generated Tokens | Quality Match | Error |') + lines.push('|---|---|---|---:|---:|---:|---|---:|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|') for (const item of model.cases) { const runtimeConfig = item.runtimeConfig || {} - const quality = item.qualityMatch != null ? item.qualityMatch.toFixed(3) : '' - const quantizationCell = item.isBaseline ? 'default' : (item.quantization ?? '') - const deviceCell = item.isBaseline ? 'default' : (runtimeConfig.device != null ? String(runtimeConfig.device) : '') - const ctxSizeCell = item.isBaseline ? 'default' : (runtimeConfig['ctx-size'] != null ? String(runtimeConfig['ctx-size']) : '') - const batchSizeCell = item.isBaseline ? 'default' : (runtimeConfig['batch-size'] != null ? String(runtimeConfig['batch-size']) : '') - const ubatchSizeCell = item.isBaseline ? 'default' : (runtimeConfig['ubatch-size'] != null ? 
String(runtimeConfig['ubatch-size']) : '') - const flashAttnCell = item.isBaseline - ? 'default' - : (runtimeConfig['flash-attn'] != null ? String(runtimeConfig['flash-attn']) : '') - const threadsCell = item.isBaseline ? 'default' : (runtimeConfig.threads != null ? String(runtimeConfig.threads) : '') - const cacheKCell = item.isBaseline ? 'default' : (runtimeConfig['cache-type-k'] != null ? String(runtimeConfig['cache-type-k']) : '') - const cacheVCell = item.isBaseline ? 'default' : (runtimeConfig['cache-type-v'] != null ? String(runtimeConfig['cache-type-v']) : '') + const quantizationCell = cfgCell(item.isBaseline, item.quantization) + const rbCell = cfgCell(item.isBaseline, runtimeConfig['reasoning-budget']) + const deviceCell = cfgCell(item.isBaseline, runtimeConfig.device) + const ctxSizeCell = cfgCell(item.isBaseline, runtimeConfig['ctx-size']) + const batchSizeCell = cfgCell(item.isBaseline, runtimeConfig['batch-size']) + const ubatchSizeCell = cfgCell(item.isBaseline, runtimeConfig['ubatch-size']) + const flashAttnCell = cfgCell(item.isBaseline, runtimeConfig['flash-attn']) + const threadsCell = cfgCell(item.isBaseline, runtimeConfig.threads) + const cacheKCell = cfgCell(item.isBaseline, runtimeConfig['cache-type-k']) + const cacheVCell = cfgCell(item.isBaseline, runtimeConfig['cache-type-v']) const errorCell = item.error && item.error.message ? truncateText(item.error.message, 120) : '' lines.push( - `| ${quantizationCell} | ${deviceCell} | ${ctxSizeCell} | ${batchSizeCell} | ${ubatchSizeCell} | ${flashAttnCell} | ${threadsCell} | ${cacheKCell} | ${cacheVCell} | ${item.promptCase ?? ''} | ${item.status ?? ''}` + - ` | ${item.metrics?.loadMsMean ?? ''} | ${item.metrics?.loadMsStd ?? ''}` + - ` | ${item.metrics?.runMsMean ?? ''} | ${item.metrics?.runMsStd ?? 
''}` + + `| ${quantizationCell} | ${rbCell} | ${deviceCell} | ${ctxSizeCell} | ${batchSizeCell} | ${ubatchSizeCell} | ${flashAttnCell} | ${threadsCell} | ${cacheKCell} | ${cacheVCell} | ${item.promptCase ?? ''} | ${item.status ?? ''}` + ` | ${item.metrics?.ttftMsMean ?? ''} | ${item.metrics?.ttftMsStd ?? ''}` + ` | ${item.metrics?.tpsMean ?? ''} | ${item.metrics?.tpsStd ?? ''}` + + ` | ${item.metrics?.ppTpsMean ?? ''} | ${item.metrics?.ppTpsStd ?? ''}` + + ` | ${item.metrics?.loadMsMean ?? ''} | ${item.metrics?.loadMsStd ?? ''}` + + ` | ${item.metrics?.runMsMean ?? ''} | ${item.metrics?.runMsStd ?? ''}` + ` | ${item.metrics?.unloadMsMean ?? ''} | ${item.metrics?.unloadMsStd ?? ''}` + ` | ${item.metrics?.promptTokens ?? ''} | ${item.metrics?.generatedTokens ?? ''}` + - ` | ${quality} | ${errorCell} |` + ` | ${item.qualityMatch != null ? item.qualityMatch.toFixed(3) : ''} | ${errorCell} |` ) } lines.push('') diff --git a/packages/llm-llamacpp/benchmarks/performance/test-prompts.json b/packages/llm-llamacpp/benchmarks/performance/test-prompts.json index d8bce45bbb..abeffddb47 100644 --- a/packages/llm-llamacpp/benchmarks/performance/test-prompts.json +++ b/packages/llm-llamacpp/benchmarks/performance/test-prompts.json @@ -8,7 +8,7 @@ }, { "role": "user", - "content": "You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. 
Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. 
Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. " + "content": "Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. 
This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. 
Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience." } ] }, diff --git a/packages/llm-llamacpp/package.json b/packages/llm-llamacpp/package.json index 4e804a8636..b373feb245 100644 --- a/packages/llm-llamacpp/package.json +++ b/packages/llm-llamacpp/package.json @@ -13,10 +13,12 @@ "quickstart": "node -e \"const fs=require('fs');const path=require('path');const {execSync}=require('child_process');const prebuilds=path.join(process.cwd(),'prebuilds');const source=path.join(process.cwd(),'node_modules','@qvac','llm-llamacpp','prebuilds');if(!fs.existsSync(prebuilds)){if(!fs.existsSync(source)){execSync('npm install @qvac/llm-llamacpp@latest',{stdio:'inherit'});}if(!fs.existsSync(source)){throw new Error('Prebuilds not found after install.');}fs.cpSync(source,prebuilds,{recursive:true});}execSync('bare examples/quickstart.js',{stdio:'inherit'});\"", "update:quickstart-section": "node ./scripts/quickstart-testing/update_readme.js", "test:integration": "npm run test:integration:generate && bare test/integration/all.js --exit", - "test:mobile:generate": "bare ./scripts/generate-mobile-integration-tests.js", - "test:mobile:validate": "node scripts/validate-mobile-tests.js", + "test:mobile:generate": "npm run generate:benchmark-shards && bare 
./scripts/generate-mobile-integration-tests.js", + "test:mobile:validate": "npm run generate:benchmark-shards && node scripts/validate-mobile-tests.js", "test:dts": "tsc -p tsconfig.dts.json", - "test:integration:generate": "brittle -r test/integration/all.js test/integration/*.test.js && npm run test:mobile:generate", + "generate:benchmark-shards": "node scripts/generate-benchmark-shards.js", + "verify:benchmark-shards": "node scripts/generate-benchmark-shards.js --check", + "test:integration:generate": "npm run generate:benchmark-shards && brittle -r test/integration/all.js test/integration/*.test.js && npm run test:mobile:generate", "test:unit:generate": "brittle -r test/unit/all.js test/unit/*.test.js", "test:unit": "npm run test:unit:generate && bare test/unit/all.js --exit", "test:cpp:build": "bare-make generate -D BUILD_TESTING=ON && bare-make build --target addon-test", diff --git a/packages/llm-llamacpp/scripts/generate-benchmark-shards.js b/packages/llm-llamacpp/scripts/generate-benchmark-shards.js new file mode 100644 index 0000000000..12b428aa24 --- /dev/null +++ b/packages/llm-llamacpp/scripts/generate-benchmark-shards.js @@ -0,0 +1,169 @@ +'use strict' + +// Generates the mobile perf benchmark shard files from the single source of +// truth, test/integration/_benchmark-matrix.js. The shard files are NOT +// committed (see .gitignore) — they are regenerated wherever the benchmark is +// built or run, so the matrix is the only place the 2 x 5 x 7 grid is defined. 
+// +// node scripts/generate-benchmark-shards.js # (re)write shards + prune orphans +// node scripts/generate-benchmark-shards.js --check # verify committed artifacts vs matrix +// node scripts/generate-benchmark-shards.js --assert-shards # fail unless every shard exists on disk +// node scripts/generate-benchmark-shards.js --groups # print workflow test_groups JSON +// +// --check needs no shard files on disk: it verifies the committed workflow +// test_groups and the committed integration.auto.cjs references both match the +// matrix. --assert-shards is the hard pre-bundle gate: it makes it impossible +// to build the Device Farm bundle without all shards present. + +const fs = require('fs') +const path = require('path') +const { + matrix, + shardFileName, + runFunctionName, + shardContents, + workflowBatches +} = require('../test/integration/_benchmark-matrix.js') + +const integrationDir = path.resolve(__dirname, '..', 'test', 'integration') +const mobileAutoFile = path.resolve(__dirname, '..', 'test', 'mobile', 'integration.auto.cjs') +const workflowFile = path.resolve(__dirname, '..', '..', '..', '.github', 'workflows', 'benchmark-perf-llm-llamacpp.yml') + +const mode = process.argv.includes('--check') + ? 'check' + : process.argv.includes('--assert-shards') + ? 'assert' + : process.argv.includes('--groups') + ? 'groups' + : 'write' + +const SHARD_PREFIX = 'benchmark-perf-' + +// Verify the committed workflow test_groups match the matrix-derived batches. +function checkGroups () { + if (!fs.existsSync(workflowFile)) { + console.error(`MISMATCH: benchmark workflow not found at ${workflowFile}`) + return 1 + } + const yaml = fs.readFileSync(workflowFile, 'utf8') + // Parse each committed groups value and compare canonically, so reformatting + // the inline JSON (extra whitespace etc.) doesn't trip a false mismatch. 
+ const committed = [...yaml.matchAll(/groups:\s*'(.+)'/g)].map((m) => { + try { return JSON.stringify(JSON.parse(m[1])) } catch { return m[1] } + }) + const expected = workflowBatches().map((b) => JSON.stringify(b.groups)) + let bad = 0 + if (committed.length !== expected.length) { + console.error(`MISMATCH: workflow has ${committed.length} group batches, matrix yields ${expected.length}`) + bad++ + } + for (let i = 0; i < expected.length; i++) { + if (committed[i] !== expected[i]) { + bad++ + console.error(`MISMATCH: workflow group batch ${i} differs from matrix`) + } + } + return bad +} + +// Verify the committed integration.auto.cjs is in lockstep with the matrix on +// BOTH axes the benchmark depends on: +// - the shard files it loads (runIntegrationModule('../integration/{shard}')) +// - the run-function NAMES it defines, which the workflow test_groups grep +// against. Those names come from toFunctionName in +// generate-mobile-integration-tests.js; matching them here (rather than the +// matrix's runFunctionName matching a convention) means a change to that +// generator that desyncs the grep fails the gate instead of silently +// running 0 tests. +function bidiDiff (label, expected, actual) { + let bad = 0 + for (const v of expected) { + if (!actual.has(v)) { bad++; console.error(`MISMATCH: integration.auto.cjs missing ${label} ${v}`) } + } + for (const v of actual) { + if (!expected.has(v)) { bad++; console.error(`MISMATCH: integration.auto.cjs has stale ${label} ${v}`) } + } + return bad +} + +function checkMobileAuto () { + if (!fs.existsSync(mobileAutoFile)) { + console.error(`MISMATCH: integration.auto.cjs not found at ${mobileAutoFile}. 
Run: npm run test:mobile:generate`) + return 1 + } + const content = fs.readFileSync(mobileAutoFile, 'utf8') + const cells = matrix() + + const referencedFiles = new Set( + [...content.matchAll(/runIntegrationModule\('\.\.\/integration\/([^']+)'/g)] + .map((m) => m[1]) + .filter((f) => f.startsWith(SHARD_PREFIX)) + ) + const definedFns = new Set( + [...content.matchAll(/function\s+(run\w+)\s*\(/g)] + .map((m) => m[1]) + .filter((n) => n.startsWith('runBenchmarkPerf')) + ) + + return ( + bidiDiff('shard', new Set(cells.map(shardFileName)), referencedFiles) + + bidiDiff('run-function', new Set(cells.map(runFunctionName)), definedFns) + ) +} + +// Hard gate: every matrix shard file must exist on disk (so the bundle that +// goes to Device Farm contains them). Makes it impossible to run the benchmark +// without shards. +function assertShards () { + let missing = 0 + for (const cell of matrix()) { + if (!fs.existsSync(path.join(integrationDir, shardFileName(cell)))) { + missing++ + console.error(`MISSING shard: ${shardFileName(cell)}`) + } + } + return missing +} + +// Write every matrix shard, then prune any benchmark-perf-*.test.js the matrix +// no longer produces, so shrinking the matrix never leaves orphans behind. +function writeShards () { + const expected = new Set(matrix().map(shardFileName)) + let written = 0 + for (const cell of matrix()) { + fs.writeFileSync(path.join(integrationDir, shardFileName(cell)), shardContents(cell)) + written++ + } + let pruned = 0 + for (const entry of fs.readdirSync(integrationDir)) { + if (entry.startsWith(SHARD_PREFIX) && entry.endsWith('.test.js') && !expected.has(entry)) { + fs.unlinkSync(path.join(integrationDir, entry)) + pruned++ + } + } + console.log(`Wrote ${written} shard files from the matrix${pruned ? 
`, pruned ${pruned} orphan(s)` : ''}.`) +} + +if (mode === 'groups') { + for (const batch of workflowBatches()) { + console.log(`# cache: ${batch.cache}`) + console.log(JSON.stringify(batch.groups)) + } +} else if (mode === 'check') { + const bad = checkGroups() + checkMobileAuto() + if (bad) { + console.error('\nCommitted benchmark artifacts are out of sync with _benchmark-matrix.js.') + console.error('Run: npm run generate:benchmark-shards && npm run test:mobile:generate, then commit integration.auto.cjs + the workflow groups.') + process.exit(1) + } + console.log(`OK: workflow test_groups and integration.auto.cjs both match the matrix (${matrix().length} shards).`) +} else if (mode === 'assert') { + const missing = assertShards() + if (missing) { + console.error(`\n${missing} shard file(s) missing. Run: npm run generate:benchmark-shards`) + process.exit(1) + } + console.log(`OK: all ${matrix().length} shard files present on disk.`) +} else { + writeShards() +} diff --git a/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js b/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js index e55a53d003..389e31483e 100644 --- a/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js +++ b/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js @@ -2,6 +2,7 @@ const fs = require('bare-fs') const path = require('bare-path') +const { matrix, shardFileName } = require('../test/integration/_benchmark-matrix.js') const repoRoot = path.resolve(__dirname, '..') const integrationDir = path.join(repoRoot, 'test', 'integration') @@ -12,6 +13,26 @@ const mobileExcludedTests = new Set([ 'continuous-batching.test.js' ]) +// The benchmark-perf-*.test.js shards are generated, not committed (see +// .gitignore), but the committed integration.auto.cjs references them. 
Enumerating +// the directory without them on disk would silently regenerate this file with the +// benchmark runners dropped, leaving the Benchmark Performance workflow to grep for +// functions that no longer exist and schedule zero tests. Refuse to run unless every +// shard is present. `npm run test:mobile:generate` writes them first; a bare +// invocation must run `npm run generate:benchmark-shards` beforehand. +function assertBenchmarkShardsPresent () { + const missing = matrix() + .map(shardFileName) + .filter(name => !fs.existsSync(path.join(integrationDir, name))) + if (missing.length) { + throw new Error( + `Refusing to regenerate mobile tests: ${missing.length} benchmark shard(s) absent ` + + `(e.g. ${missing[0]}). Run \`npm run generate:benchmark-shards\` first, or use ` + + '`npm run test:mobile:generate`, which does it for you.' + ) + } +} + function getIntegrationFiles () { if (!fs.existsSync(integrationDir)) { throw new Error(`Integration directory not found: ${integrationDir}`) @@ -70,9 +91,15 @@ function validateGroups (functionNames) { } const groups = JSON.parse(fs.readFileSync(groupsFile, 'utf-8')) const nameSet = new Set(functionNames) + // Benchmark shards (benchmark-perf-*.test.js -> runBenchmarkPerf*) are + // scheduled only by the Benchmark Performance workflow via an explicit + // test_groups override, and are deliberately absent from test-groups.json + // so normal mobile integration runs never trigger the heavy benchmark. + // Exclude them from the group-coverage requirement. 
+ const isOverrideOnly = (n) => n.startsWith('runBenchmarkPerf') for (const [platform, splits] of Object.entries(groups)) { const covered = new Set(Object.values(splits).flat()) - const missing = functionNames.filter(n => !covered.has(n)) + const missing = functionNames.filter(n => !covered.has(n) && !isOverrideOnly(n)) const extra = [...covered].filter(n => !nameSet.has(n)) if (missing.length) { throw new Error( @@ -91,6 +118,7 @@ function validateGroups (functionNames) { } function main () { + assertBenchmarkShardsPresent() const files = getIntegrationFiles() if (files.length === 0) { throw new Error(`No integration test files found inside ${integrationDir}`) diff --git a/packages/llm-llamacpp/test/integration/_benchmark-matrix.js b/packages/llm-llamacpp/test/integration/_benchmark-matrix.js new file mode 100644 index 0000000000..751c013580 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/_benchmark-matrix.js @@ -0,0 +1,122 @@ +'use strict' + +// Single source of truth for the mobile perf benchmark matrix. The per-shard +// test files (benchmark-perf-{size}-{quant}-{cache}.test.js) and the Benchmark +// Performance workflow's test_groups override are both generated from this +// list by scripts/generate-benchmark-shards.js, so the 2 x 5 x 7 matrix is +// defined in exactly one place. Underscore prefix keeps it out of the test +// globs (it is not a *.test.js file). + +const SIZES = ['0.8B', '2B'] +const QUANTS = ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'] +// KV-cache types as (k, v) pairs. f16/q8_0/q4_0 are symmetric (k === v); the +// TurboQuant/PolarQuant schemes pair a TBQ or PQ key with a PQ value, so k may +// differ from v. TBQ/PQ ship Vulkan + CPU kernels only, so they are reported as +// Crashed on Metal (iOS) and on GPUs that lack support (e.g. Samsung). 
+const CACHE_TYPES = [ + { k: 'f16', v: 'f16' }, + { k: 'q8_0', v: 'q8_0' }, + { k: 'q4_0', v: 'q4_0' }, + { k: 'tbq3_0', v: 'pq3_0' }, + { k: 'tbq4_0', v: 'pq4_0' }, + { k: 'pq3_0', v: 'pq3_0' }, + { k: 'pq4_0', v: 'pq4_0' } +] + +// Full cross-product, size outer / quant middle / cache inner. +function matrix () { + const out = [] + for (const size of SIZES) { + for (const quant of QUANTS) { + for (const cache of CACHE_TYPES) { + out.push({ size, quant, cache }) + } + } + } + return out +} + +// Filename slug: lowercase, drop dots, underscores -> dashes. +// '0.8B' -> '08b', 'Q4_K_M' -> 'q4-k-m', 'q8_0' -> 'q8-0'. +function slug (value) { + return value.toLowerCase().replace(/\./g, '').replace(/_/g, '-') +} + +// Single slash-free token identifying a KV-cache type (filesystem and artifact +// safe): the cache type when k === v, else 'k-v'. Used for shard filenames and +// the workflow batch / artifact-suffix name. +function cacheId (cache) { + return cache.k === cache.v ? cache.k : `${cache.k}-${cache.v}` +} + +// Display label for a KV-cache type in report rows: the cache type when +// k === v, else 'k/v', matching the renderer's [kv=...] tag. +function cacheLabel (cache) { + return cache.k === cache.v ? cache.k : `${cache.k}/${cache.v}` +} + +function shardFileName (cell) { + return `benchmark-perf-${slug(cell.size)}-${slug(cell.quant)}-${slug(cacheId(cell.cache))}.test.js` +} + +// HuggingFace model id for a cell, e.g. {size:'0.8B',quant:'Q4_0'} -> 'qwen3.5-0.8b-Q4_0'. +// Single source for the id used by the on-device benchmark (modelSpec) and by +// the report renderer's coverage check, so both agree on shard identity. +function modelId (size, quant) { + return `qwen3.5-${size.toLowerCase()}-${quant}` +} + +// Stable per-shard key matching the renderer's "[{modelId}] ... [kv={cacheLabel}]" +// row label, so coverage can be reconciled against the matrix. 
+function mobileShardKey (cell) { + return `${modelId(cell.size, cell.quant)}|${cacheLabel(cell.cache)}` +} + +// Mirrors toFunctionName in scripts/generate-mobile-integration-tests.js: +// split the base name on non-alphanumerics, capitalize each part, prefix run. +function runFunctionName (cell) { + const base = shardFileName(cell).replace(/\.js$/, '') + const parts = base.split(/[^a-zA-Z0-9]+/).filter(Boolean) + const suffix = parts.map((p) => p.charAt(0).toUpperCase() + p.slice(1)).join('') + return `run${suffix}` +} + +// The exact lines + trailing newline each generated shard file holds. +function shardContents (cell) { + return [ + "'use strict'", + "const { benchmarkModel } = require('./_benchmark-perf.js')", + `benchmarkModel('${cell.size}', '${cell.quant}', '${cell.cache.k}', '${cell.cache.v}')`, + '' + ].join('\n') +} + +// One workflow matrix entry per KV-cache type, each carrying its 10 groups in +// size -> quant order, matching the mobile-benchmark job's test_groups. +function workflowBatches () { + return CACHE_TYPES.map((cache) => ({ + cache: cacheId(cache), + groups: SIZES.flatMap((size) => + QUANTS.map((quant) => { + const grep = runFunctionName({ size, quant, cache }) + return { name: grep.slice(3).replace(/Test$/, ''), grep } + }) + ) + })) +} + +module.exports = { + SIZES, + QUANTS, + CACHE_TYPES, + matrix, + slug, + cacheId, + cacheLabel, + shardFileName, + modelId, + mobileShardKey, + runFunctionName, + shardContents, + workflowBatches +} diff --git a/packages/llm-llamacpp/test/integration/_benchmark-perf.js b/packages/llm-llamacpp/test/integration/_benchmark-perf.js new file mode 100644 index 0000000000..2f9ab7b4d9 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/_benchmark-perf.js @@ -0,0 +1,185 @@ +'use strict' + +// Shared runner for the mobile perf benchmark. 
Sharded into one test file per +// (model x KV-cache type) (benchmark-perf---.test.js) +// so each Device Farm session finishes inside the fixed 20-minute iOS per-test +// ceiling; this module holds the logic they all share. Underscore prefix keeps +// it out of the mobile test generator (it is not a *.test.js file). +// +// Each shard sweeps its model across both devices (gpu, cpu) and both +// reasoning-budget values (-1, 0), recording TTFT / TPS / ppTPS. The full +// matrix (2 sizes x 5 quants x 7 KV-cache types x 2 devices x 2 budgets) is +// split across the shard files; nothing here reduces it. + +const path = require('bare-path') +const LlmLlamacpp = require('../../index.js') +const { ensureModel, safeTest } = require('./utils') +const { attachSpecLogger } = require('./spec-logger') +const { recordPerformance, isMobile } = require('./_perf-helper.js') +const { modelId } = require('./_benchmark-matrix.js') +const os = require('bare-os') + +const DEVICES = ['gpu', 'cpu'] +const REASONING_BUDGETS = ['-1', '0'] + +const RUNTIME = { + gpu_layers: '999', + ctx_size: '2048', + n_predict: '512', + temp: '0.1', + seed: '42', + verbosity: '0' +} + +// ~512-token prompt (verified against the Qwen3.5 tokenizer at 518 templated tokens). +const PROMPT = [ + { role: 'system', content: 'You are a helpful assistant.' }, + { + role: 'user', + content: 'Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. 
This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. 
Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience.' + } +] + +function _envInt (key, fallback) { + let raw = '' + if (typeof os.getEnv === 'function') raw = os.getEnv(key) || '' + if (!raw && typeof process !== 'undefined' && process.env) raw = process.env[key] || '' + const v = parseInt(raw, 10) + return Number.isFinite(v) && v > 0 ? v : fallback +} +// 3 measured repetitions per config so the renderer can report mean + stddev +// (matches the desktop sweep, which repeats 5x). Overridable via QVAC_PERF_RUNS. 
+const PERF_RUNS = _envInt('QVAC_PERF_RUNS', 3) +const PERF_WARMUP_RUNS = _envInt('QVAC_PERF_WARMUP_RUNS', 1) + +function modelSpec (size, quant) { + return { + id: modelId(size, quant), + name: `Qwen3.5-${size}-${quant}.gguf`, + url: `https://huggingface.co/unsloth/Qwen3.5-${size}-GGUF/resolve/main/Qwen3.5-${size}-${quant}.gguf` + } +} + +async function runInference (addon, prompt, reasoningBudget) { + const startTime = Date.now() + const response = await addon.run(prompt, { + generationParams: { reasoning_budget: parseInt(reasoningBudget, 10) } + }) + const chunks = [] + let error = null + response + .onUpdate(data => { chunks.push(data) }) + .onError(err => { error = err }) + await response.await() + if (error) throw new Error('inference failed: ' + error) + return { output: chunks.join('').trim(), startTime, endTime: Date.now(), stats: response.stats || null } +} + +// Records a placeholder row with no metrics. The renderer shows any row +// without TTFT/TPS/ppTPS as `Crashed`. We emit one up-front for every combo +// BEFORE loading/running it, so a hard native crash that kills the Device +// Farm session still leaves a `Crashed` row in the logs (the mobile reporter +// flushes each record to console immediately). A successful run records the +// real metrics afterwards, which supersedes the placeholder in the renderer. +function recordCrashedPlaceholder (label, device, model) { + recordPerformance(label, 0, { stats: null, deviceId: device, scenario: 'benchmark-perf', model }) +} + +// Registers the benchmark test for one (model x quant x kv-cache type), +// sweeping device x reasoning-budget. One Device Farm session per call. +// kv-cache type is set as cache-type-k/v at load time. Adreno devices don't +// support quantized KV cache, and TurboQuant/PolarQuant (tbq*/pq*) ship Vulkan +// + CPU kernels only (rejected on Metal/iOS, unsupported on some GPUs), so +// those combos may crash or fail to load — reported as Crashed. 
+function benchmarkModel (size, quant, cacheK, cacheV) { + const spec = modelSpec(size, quant) + // kvLabel uses the k/v form when key and value differ (e.g. TurboQuant + // tbq3_0/pq3_0), matching the renderer's [kv=...] tag. kvId is the + // slash-free token used for the model id and per-run identifiers. + const kvLabel = cacheK === cacheV ? cacheK : `${cacheK}/${cacheV}` + const kvId = cacheK === cacheV ? cacheK : `${cacheK}-${cacheV}` + const id = `${spec.id}-${kvId}` + safeTest(`Mobile perf benchmark: ${id} (TTFT / TPS / ppTPS)`, { + timeout: 1_800_000, + skip: !isMobile + }, async t => { + const specLogger = attachSpecLogger({ forwardToConsole: true }) + try { + const [modelName, dirPath] = await ensureModel({ modelName: spec.name, downloadUrl: spec.url }) + const modelPath = path.join(dirPath, modelName) + + // Up-front Crashed placeholders for EVERY combo across BOTH devices before + // any load/run, so a hard native crash during the first device's pass still + // leaves rows for the other device. Real metrics supersede these. + for (const device of DEVICES) { + for (const rb of REASONING_BUDGETS) { + recordCrashedPlaceholder(`[${spec.id}] [${device}] [rb=${rb}] [kv=${kvLabel}]`, device, `${id}-${device}-rb${rb}`) + } + } + + for (const device of DEVICES) { + const labelFor = rb => `[${spec.id}] [${device}] [rb=${rb}] [kv=${kvLabel}]` + const modelFor = rb => `${id}-${device}-rb${rb}` + + let addon = null + try { + addon = new LlmLlamacpp({ + files: { model: [modelPath] }, + config: { ...RUNTIME, device, 'cache-type-k': cacheK, 'cache-type-v': cacheV }, + logger: { error: () => {}, warn: () => {}, info: () => {}, debug: () => {} }, + opts: { stats: true } + }) + await addon.load() + } catch (loadErr) { + // Load failed (e.g. unsupported quantized KV cache) — placeholders + // remain Crashed for this device's combos. Move on. + t.comment(`[${id}] [${device}] load failed (reported as Crashed): ${loadErr && loadErr.message ? 
loadErr.message : loadErr}`) + await (addon && addon.unload && addon.unload().catch(() => {})) + continue + } + + try { + // Warm up once per backend, not per reasoning budget. The warm-up + // primes the GPU kernels/caches for this loaded model; reasoning + // budget is a per-call generation param that does not change the + // compute kernels, so one warm-up covers both budgets. It is + // discarded, never a measured rep, so the 3 reps and their stddev + // are unaffected. + try { + for (let w = 1; w <= PERF_WARMUP_RUNS; w++) { + const { endTime, startTime } = await runInference(addon, PROMPT, REASONING_BUDGETS[0]) + t.comment(`[${id}] [${device}] warmup ${w}/${PERF_WARMUP_RUNS} (${endTime - startTime}ms) - perf NOT recorded`) + } + } catch (warmErr) { + t.comment(`[${id}] [${device}] warmup failed: ${warmErr && warmErr.message ? warmErr.message : warmErr}`) + } + for (const rb of REASONING_BUDGETS) { + const label = labelFor(rb) + try { + for (let run = 1; run <= PERF_RUNS; run++) { + const { output, startTime, endTime, stats } = await runInference(addon, PROMPT, rb) + // Real metrics supersede the Crashed placeholder in the renderer. + t.comment(recordPerformance(label, endTime - startTime, { + stats, + deviceId: device, + scenario: 'benchmark-perf', + model: modelFor(rb) + })) + t.ok(output.length > 0, `${label} run ${run}/${PERF_RUNS} produced output`) + } + } catch (runErr) { + // Catchable run failure — placeholder stays Crashed for this combo. + t.comment(`${label} run failed (reported as Crashed): ${runErr && runErr.message ? 
runErr.message : runErr}`) + } + } + } finally { + await addon.unload().catch(() => {}) + } + } + } finally { + specLogger.release() + } + }) +} + +module.exports = { benchmarkModel, modelSpec } diff --git a/packages/llm-llamacpp/test/integration/_perf-helper.js b/packages/llm-llamacpp/test/integration/_perf-helper.js index 86d2bc9681..ffa71ab0b8 100644 --- a/packages/llm-llamacpp/test/integration/_perf-helper.js +++ b/packages/llm-llamacpp/test/integration/_perf-helper.js @@ -277,6 +277,7 @@ function recordPerformance (label, totalTime, extra) { const ttftMs = stats ? _num(stats.TTFT) : null const tps = stats ? _num(stats.TPS) : null + const ppTps = stats ? _num(stats.ppTPS) : null const generatedTokens = stats ? _num(stats.generatedTokens) : null const promptTokens = stats ? _num(stats.promptTokens) : null @@ -309,7 +310,8 @@ function recordPerformance (label, totalTime, extra) { ttft_ms: ttftMs !== null ? Math.round(ttftMs) : null, generated_tokens: generatedTokens, prompt_tokens: promptTokens, - tps: tps !== null ? Number(tps.toFixed(2)) : null + tps: tps !== null ? Number(tps.toFixed(2)) : null, + pp_tps: ppTps !== null ? Number(ppTps.toFixed(2)) : null }, { scenario: (extra && extra.scenario) || 'default', model: (extra && extra.model) || null, @@ -342,6 +344,7 @@ function recordPerformance (label, totalTime, extra) { ` - Prefill / TTFT: ${ttftMs !== null ? Math.round(ttftMs) + 'ms' : 'n/a'}`, ` - Decode: ${decodeMs !== null ? decodeMs + 'ms' : 'n/a'}`, ` - TPS: ${tps !== null ? tps.toFixed(2) : 'n/a'}`, + ` - ppTPS: ${ppTps !== null ? ppTps.toFixed(2) : 'n/a'}`, ` - Tokens: ${generatedTokens !== null ? generatedTokens : 'n/a'} gen / ${promptTokens !== null ? 
promptTokens : 'n/a'} prompt` ] return lines.join('\n') diff --git a/packages/llm-llamacpp/test/mobile/integration.auto.cjs b/packages/llm-llamacpp/test/mobile/integration.auto.cjs index 987ae1ddb1..64ec38ab40 100644 --- a/packages/llm-llamacpp/test/mobile/integration.auto.cjs +++ b/packages/llm-llamacpp/test/mobile/integration.auto.cjs @@ -16,6 +16,356 @@ async function runApiBehaviorTest (options = {}) { // eslint-disable-line no-unu return runIntegrationModule('../integration/api-behavior.test.js', options) } +async function runBenchmarkPerf08bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ40Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Q80Test')) return 
__FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ40Tbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Tbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ40Tbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Tbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ41Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Q40Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-08b-q4-1-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41Tbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Tbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41Tbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Tbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMPq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMPq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMPq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMPq40Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMTbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMTbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMTbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMTbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ6KPq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KPq30Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-08b-q6-k-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KPq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KPq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KTbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KTbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KTbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KTbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80F16Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-08b-q8-0-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ80Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80Tbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Tbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80Tbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Tbq40Pq40Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-08b-q8-0-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ40Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40Tbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Tbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-tbq3-0-pq3-0.test.js', options) +} 
+ +async function runBenchmarkPerf2bQ40Tbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Tbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ41Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41Tbq30Pq30Test (options = {}) { // eslint-disable-line 
no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Tbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41Tbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Tbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMPq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMPq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMPq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMPq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && 
!__shouldRunTest('runBenchmarkPerf2bQ4KMQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMTbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMTbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMTbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMTbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ6KPq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KPq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KPq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KPq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KQ40Test')) return 
__FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KTbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KTbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KTbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KTbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-tbq4-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ80Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Pq40Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-2b-q8-0-pq4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80Tbq30Pq30Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Tbq30Pq30Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-tbq3-0-pq3-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80Tbq40Pq40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Tbq40Pq40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-tbq4-0-pq4-0.test.js', options) +} + async function runBitnetTest (options = {}) { // eslint-disable-line no-unused-vars if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBitnetTest')) return __FILTERED return runIntegrationModule('../integration/bitnet.test.js', options)