diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml index 3cb312e98b..b357e05fbd 100644 --- a/.github/workflows/benchmark-perf-llm-llamacpp.yml +++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml @@ -6,7 +6,8 @@ name: Benchmark Performance — LLM Parameter Sweep # # To change what runs, edit: # desktop: models.manifest.json (models) + llm-parameter-sweep.config.js (sweep dims) -# mobile: mobile.config.json +# mobile: test/integration/_benchmark-perf.js (shared runner) + the +# benchmark-perf-*.test.js shards (one per model x KV-cache type) on: workflow_dispatch: @@ -25,6 +26,19 @@ on: required: false default: true type: boolean + summarize_only: + description: "Re-render report from a previous run's artifacts (skips benchmarks)" + required: false + default: false + type: boolean + artifact_run_number: + description: "Run number to pull artifacts from when summarize_only=true (e.g. 9)" + required: false + type: string + compare_run_number: + description: "Run number of a baseline run to diff against (shows Δ TPS / Δ TTFT columns)" + required: false + type: string permissions: contents: read @@ -90,7 +104,7 @@ jobs: needs: - context - label-gate - if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop + if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop && !inputs.summarize_only name: Desktop Parameter Sweep runs-on: ai-run-linux-gpu timeout-minutes: 360 @@ -123,6 +137,12 @@ jobs: - name: Setup LLVM uses: tetherto/qvac/.github/actions/setup-llvm@98a6a6b6e8f3866dfdd75052a4071269ce85dc41 + - name: Setup Vulkan SDK + uses: tetherto/qvac/.github/actions/setup-vulkan-sdk@0bbdca93da303a0b1634ba14a89cec085621078d + with: + platform: linux + arch: x64 + - name: Build addon from source working-directory: packages/llm-llamacpp run: | @@ -157,21 +177,6 @@ jobs: sudo apt-get update sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils - echo "Installing Vulkan 
SDK (latest)..." - wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz - mkdir -p "$HOME/vulkan" && cd "$HOME/vulkan" - tar xf /tmp/vulkansdk.tar.xz --strip-components=1 - export VULKAN_SDK="$HOME/vulkan/x86_64" - export PATH="$VULKAN_SDK/bin:$PATH" - export LD_LIBRARY_PATH="$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" - export VK_ADD_LAYER_PATH="$VULKAN_SDK/share/vulkan/explicit_layer.d" - export PKG_CONFIG_PATH="$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}" - echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV - echo "PATH=$PATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "VK_ADD_LAYER_PATH=$VK_ADD_LAYER_PATH" >> $GITHUB_ENV - echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH" >> $GITHUB_ENV - cd "$GITHUB_WORKSPACE/packages/llm-llamacpp" npm install @@ -186,6 +191,13 @@ jobs: echo "=== Build complete ===" + - name: Detect GPU + id: gpu + shell: bash + run: | + gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | tr -d '\r' || echo "GPU") + echo "name=$gpu_name" >> "$GITHUB_OUTPUT" + - name: Install benchmark dependencies working-directory: packages/llm-llamacpp/benchmarks/performance run: npm install @@ -199,24 +211,8 @@ jobs: working-directory: packages/llm-llamacpp/benchmarks/performance run: bare ./llm-parameter-sweep.js --addon-source local - - name: Add job summary - if: always() - working-directory: packages/llm-llamacpp/benchmarks/performance - shell: bash - run: | - LATEST_MD=$(find results/parameter-sweep -name "*.md" -type f 2>/dev/null | sort | tail -1) - { - echo "## LLM Parameter Sweep — Desktop" - echo "" - echo "ref: \`${{ needs.context.outputs.ref }}\`" - echo "" - if [ -n "${LATEST_MD:-}" ]; then - cat "$LATEST_MD" - else - echo "No results file found." 
- fi - } >> "$GITHUB_STEP_SUMMARY" - + # The run summary is rendered by the summarize job (unified desktop + + # mobile view); this job just uploads the raw sweep results. - name: Upload results if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 @@ -225,24 +221,159 @@ jobs: path: packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep/ retention-days: 90 if-no-files-found: ignore + env: + DESKTOP_GPU: ${{ steps.gpu.outputs.name }} + # Mobile is sharded one group per (model x KV-cache type) = 30 groups so each + # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling. + # All 30 in one reused-workflow call do NOT fit: Android serializes the runs + # against its device pool (>240 min) and the macOS runner fills its disk + # collecting 30 runs' logs. So we split the groups into three batches by + # KV-cache type (10 each — the proven in-budget load) and run them + # sequentially (max-parallel: 1) to avoid Device Farm pool contention. Each + # batch raises the job timeout to 180 for headroom (proven 10-shard wall ~119 + # min) and gets a distinct artifact_suffix so its perf-report doesn't collide; + # summarize aggregates all three. These wrappers are deliberately absent from + # the addon's test-groups.json, so this override is the only path that runs + # them — normal mobile integration runs never trigger the benchmark. 
mobile-benchmark: needs: - context - prebuild - label-gate - if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile + if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile && !inputs.summarize_only permissions: contents: read packages: read pull-requests: write id-token: write + strategy: + fail-fast: false + max-parallel: 1 + matrix: + include: + - cache: f16 + groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]' + - cache: q8_0 + groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]' + - cache: q4_0 + groups: 
'[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]' uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml secrets: inherit with: repository: ${{ github.repository }} ref: ${{ needs.context.outputs.ref }} - # Schedule only the benchmark group. runBenchmarkPerfTest is deliberately - # absent from the addon's test-groups.json, so this override is the only - # path that runs it — normal mobile integration runs never trigger it. - test_groups: '[{"name":"benchmarkPerf","grep":"runBenchmarkPerfTest"}]' + job_timeout_minutes: 180 + test_groups: ${{ matrix.groups }} + artifact_suffix: ${{ matrix.cache }}- + + # Aggregates desktop + mobile artifacts into one unified markdown report. + # Runs after benchmarks finish, or standalone when summarize_only=true + # (pass artifact_run_number to pull results from a previous run). + # Pass compare_run_number to show Δ TPS / Δ TTFT regression columns. 
+ summarize: + needs: + - context + - label-gate + - desktop-benchmark + - mobile-benchmark + if: needs.label-gate.outputs.authorised == 'true' && always() && needs.context.result == 'success' + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + env: + # Use artifact_run_number when summarize_only, otherwise current run + ARTIFACT_RUN: ${{ inputs.artifact_run_number || github.run_number }} + steps: + - name: Checkout aggregator + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + with: + ref: ${{ needs.context.outputs.ref }} + sparse-checkout: | + packages/llm-llamacpp/benchmarks/performance/render-report.js + + - name: Setup Node.js + uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 + with: + node-version: lts/* + + - name: Get addon version + id: addon_ver + shell: bash + run: | + ver=$(node -e "process.stdout.write(require('./packages/llm-llamacpp/package.json').version)" 2>/dev/null || true) + echo "version=${ver:+@qvac/llm-llamacpp@$ver}" >> "$GITHUB_OUTPUT" + + - name: Download desktop sweep artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: llm-param-sweep-desktop-${{ env.ARTIFACT_RUN }} + path: combined-reports + continue-on-error: true + + - name: Download mobile perf-report artifacts + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: perf-report-llamacpp-llm-*-${{ env.ARTIFACT_RUN }} + path: combined-reports + continue-on-error: true + + - name: Download baseline artifacts for comparison + if: inputs.compare_run_number != '' + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: llm-param-sweep-desktop-${{ inputs.compare_run_number }} + path: baseline-reports + continue-on-error: true + + - name: Download baseline mobile artifacts for comparison + if: inputs.compare_run_number != '' + uses: 
actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: perf-report-llamacpp-llm-*-${{ inputs.compare_run_number }} + path: baseline-reports + continue-on-error: true + + - name: Render unified benchmark report + shell: bash + run: | + if ! find combined-reports -name "*.json" -type f 2>/dev/null | grep -q .; then + echo "No benchmark reports found." + exit 0 + fi + mkdir -p benchmark-artifacts + + EXTRA_ARGS="" + if [ -n "${{ steps.addon_ver.outputs.version }}" ]; then + EXTRA_ARGS="$EXTRA_ARGS --addon-version ${{ steps.addon_ver.outputs.version }}" + fi + if [ -d baseline-reports ] && find baseline-reports -name "*.json" -type f 2>/dev/null | grep -q .; then + EXTRA_ARGS="$EXTRA_ARGS --compare-dir baseline-reports" + fi + + node packages/llm-llamacpp/benchmarks/performance/render-report.js \ + --dir combined-reports \ + --output benchmark-artifacts/qwen35-benchmark-findings.md \ + $EXTRA_ARGS + + - name: Add to run summary + if: always() + shell: bash + run: | + set +e + MD_FILE="benchmark-artifacts/qwen35-benchmark-findings.md" + if [ -f "$MD_FILE" ]; then + cat "$MD_FILE" >> "$GITHUB_STEP_SUMMARY" + else + echo "No consolidated benchmark report available." >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Upload consolidated report + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 + with: + name: qwen35-benchmark-findings-${{ github.run_number }} + path: benchmark-artifacts/ + retention-days: 90 + if-no-files-found: ignore diff --git a/.github/workflows/integration-mobile-test-llm-llamacpp.yml b/.github/workflows/integration-mobile-test-llm-llamacpp.yml index a7df080202..3e53e42c8b 100644 --- a/.github/workflows/integration-mobile-test-llm-llamacpp.yml +++ b/.github/workflows/integration-mobile-test-llm-llamacpp.yml @@ -41,6 +41,16 @@ on: type: string required: false default: "" + job_timeout_minutes: + description: "Override the build-and-test job timeout (minutes). Default 120. 
Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom." + type: number + required: false + default: 120 + artifact_suffix: + description: "Optional prefix inserted into the perf-report artifact name so multiple invocations in one run (e.g. Benchmark batches) don't collide. Default empty keeps the existing name." + type: string + required: false + default: "" workflow_dispatch: inputs: ref: @@ -74,7 +84,7 @@ jobs: name: Build ${{ matrix.platform }} and Run E2E Tests runs-on: ${{ matrix.runner }} environment: release - timeout-minutes: 120 + timeout-minutes: ${{ inputs.job_timeout_minutes || 120 }} continue-on-error: true permissions: contents: read @@ -217,7 +227,7 @@ jobs: platform: ${{ matrix.platform }} merge: 'true' unzip-customer-artifacts: 'true' - artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }} + artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }} - name: Comment results on PR if: always() && !cancelled() diff --git a/packages/llm-llamacpp/benchmarks/performance/case-runner.js b/packages/llm-llamacpp/benchmarks/performance/case-runner.js index ca149705ac..462aa62700 100644 --- a/packages/llm-llamacpp/benchmarks/performance/case-runner.js +++ b/packages/llm-llamacpp/benchmarks/performance/case-runner.js @@ -5,7 +5,9 @@ const path = require('bare-path') const { round, average, stddev, cartesianProduct } = require('./math') const { stripSurroundingQuotes, normalizeArgValue } = require('./utils') -const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill'] +// Focused WB run uses a single ~512-token prompt. Add 'ctx-filling' / +// 'span-fill' back to also sweep context-fill and batch-spanning prompts. 
+const PROMPT_CASES = ['long'] const PROMPTS_PER_CASE = 1 const SWEEP_OVERRIDE_KEYS = [ @@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [ 'ubatch-size', 'flash-attn', 'cache-type-k', - 'cache-type-v' + 'cache-type-v', + 'reasoning-budget' ] function splitCsvArg (value, key) { @@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) { const threadsValues = sweep.threads || [] const cacheTypeKValues = sweep['cache-type-k'] || [] const cacheTypeVValues = sweep['cache-type-v'] || [] + const reasoningBudgetValues = sweep['reasoning-budget'] || [] const cases = [] for (const promptCase of PROMPT_CASES) { @@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) { if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 && flashAttnValues.length > 0 && threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) { + const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null] const combos = cartesianProduct([ supportedQuants, devices, @@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) { flashAttnValues, threadsValues, cacheTypeKValues, - cacheTypeVValues + cacheTypeVValues, + rbValues ]) - for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) { + for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) { if (Number(ubatchSize) > Number(batchSize)) { continue // Skip combinations where ubatchSize is greater than batchSize } @@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) { 'cache-type-k': cacheTypeK, 'cache-type-v': cacheTypeV } + if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget - const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}` + const rbSuffix = 
reasoningBudget !== null ? `__rb=${reasoningBudget}` : '' + const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}` for (const promptCase of PROMPT_CASES) { cases.push({ @@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) { const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null) const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null) const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null) + const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null) const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null @@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) { ttftMsStd: round(stddev(ttftMsValues), 3), tpsMean: round(average(tpsValues), 3), tpsStd: round(stddev(tpsValues), 3), + ppTpsMean: round(average(ppTpsValues), 3), + ppTpsStd: round(stddev(ppTpsValues), 3), promptTokens: firstPromptTokens, generatedTokens: firstGeneratedTokens } diff --git a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js index a495ec4a9a..5cf59638d3 100644 --- a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js +++ b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js @@ -3,10 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const os = require('bare-os') -const { - DEFAULT_SWEEP_CTX_SIZES, - DEFAULT_SWEEP_BATCH_SIZES -} = require('./utils') const DEFAULT_RESULTS_DIR = path.resolve(__dirname, 'results', 'parameter-sweep') const DEFAULT_MODELS_DIR = path.resolve(__dirname, 'models') @@ -94,17 +90,20 @@ function loadModelsFromManifest () { const 
MODELS = loadModelsFromManifest() -// Parameter sweep: full factorial (cartesian product) +// Parameter sweep (cartesian product). Tuned to the focused WB run: +// only quantization and reasoning-budget vary; every other dimension is +// pinned to a single value. Edit these arrays to sweep more dimensions. const PARAMETER_SWEEP = { - quantization: ['Q4_0', 'Q4_K_M', 'Q8_0', 'F16'], + quantization: ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'], device: getDefaultSweepDevices(), - 'ctx-size': DEFAULT_SWEEP_CTX_SIZES.map(String), - threads: ['2', '4', '8'], - 'batch-size': DEFAULT_SWEEP_BATCH_SIZES.map(String), // max: 10k - 'ubatch-size': ['128', '512'], // must be <= batch-size - 'flash-attn': ['off', 'on'], - 'cache-type-k': ['f16', 'q8_0', 'q4_0'], - 'cache-type-v': ['f16', 'q8_0', 'q4_0'] + 'ctx-size': ['2048'], + threads: ['4'], + 'batch-size': ['512'], + 'ubatch-size': ['512'], + 'flash-attn': ['off'], + 'cache-type-k': ['f16'], + 'cache-type-v': ['f16'], + 'reasoning-budget': ['-1', '0'] // verbosity: fixed at '0' (not swept) } diff --git a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js index c65f81c658..b82f8fbdb1 100644 --- a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js +++ b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js @@ -331,7 +331,8 @@ async function main () { const caseMetricSamples = { runMs: [], ttftMs: [], - tps: [] + tps: [], + ppTps: [] } let firstPromptTokens = null let firstGeneratedTokens = null @@ -366,6 +367,7 @@ async function main () { unloadMs: null, // Will unload after all prompts ttftMs: round(ttftMs, 3), tps: round(stats.TPS != null ? stats.TPS : null, 3), + ppTps: round(stats.ppTPS != null ? stats.ppTPS : null, 3), promptTokens: stats.promptTokens ?? null, generatedTokens: stats.generatedTokens ?? 
null } @@ -374,6 +376,7 @@ async function main () { caseMetricSamples.runMs.push(metrics.runMs) if (metrics.ttftMs != null) caseMetricSamples.ttftMs.push(metrics.ttftMs) if (metrics.tps != null) caseMetricSamples.tps.push(metrics.tps) + if (metrics.ppTps != null) caseMetricSamples.ppTps.push(metrics.ppTps) if (firstPromptTokens == null && metrics.promptTokens != null) firstPromptTokens = metrics.promptTokens if (firstGeneratedTokens == null && metrics.generatedTokens != null) firstGeneratedTokens = metrics.generatedTokens caseRepeatsAttempted += 1 @@ -537,6 +540,8 @@ async function main () { ttftMsStd: round(stddev(caseMetricSamples.ttftMs), 3), tpsMean: round(average(caseMetricSamples.tps), 3), tpsStd: round(stddev(caseMetricSamples.tps), 3), + ppTpsMean: round(average(caseMetricSamples.ppTps), 3), + ppTpsStd: round(stddev(caseMetricSamples.ppTps), 3), promptTokens: firstPromptTokens, generatedTokens: firstGeneratedTokens } diff --git a/packages/llm-llamacpp/benchmarks/performance/models.manifest.json b/packages/llm-llamacpp/benchmarks/performance/models.manifest.json index ceed7a7dd9..a3f521d212 100644 --- a/packages/llm-llamacpp/benchmarks/performance/models.manifest.json +++ b/packages/llm-llamacpp/benchmarks/performance/models.manifest.json @@ -14,15 +14,19 @@ } }, { - "id": "qwen3-4b", + "id": "qwen3.5-0.8b", "gguf": { - "repo": "unsloth/Qwen3-4B-GGUF", + "repo": "unsloth/Qwen3.5-0.8B-GGUF", "revision": "main", - "quantizations": ["Q4_0", "Q4_K_M", "Q8_0", "F16"] - }, - "pytorch": { - "repo": "Qwen/Qwen3-4B", - "revision": "main" + "quantizations": ["Q4_0", "Q4_1", "Q4_K_M", "Q6_K", "Q8_0"] + } + }, + { + "id": "qwen3.5-2b", + "gguf": { + "repo": "unsloth/Qwen3.5-2B-GGUF", + "revision": "main", + "quantizations": ["Q4_0", "Q4_1", "Q4_K_M", "Q6_K", "Q8_0"] } } ] diff --git a/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js b/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js index 0a70fe49d1..44fd58eb1f 100644 --- 
a/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js +++ b/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js @@ -153,6 +153,10 @@ async function tuneToBudget (model, templateMessages, budget) { } } +// The 'long' prompt is the focused ~512-token benchmark prompt (verified +// against the Qwen3.5 tokenizer). Kept in sync with the committed +// test-prompts.json and benchmarks/performance/mobile.config.json so desktop +// and mobile runs measure the same input. function basePrompts () { return [ { @@ -161,11 +165,7 @@ function basePrompts () { { role: 'system', content: 'You are a helpful assistant.' }, { role: 'user', - content: ( - 'You are reviewing an incident report. Write a detailed narrative with sections for timeline, ' + - 'root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, ' + - 'include concrete checkpoints, and avoid bullet points unless needed for clarity. ' - ).repeat(15) + content: 'Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. 
This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. 
Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience.' } ] } diff --git a/packages/llm-llamacpp/benchmarks/performance/render-report.js b/packages/llm-llamacpp/benchmarks/performance/render-report.js new file mode 100644 index 0000000000..13a4343d79 --- /dev/null +++ b/packages/llm-llamacpp/benchmarks/performance/render-report.js @@ -0,0 +1,316 @@ +#!/usr/bin/env node +'use strict' + +// Unified benchmark report renderer for the Qwen3.5 perf benchmark. +// +// Reads perf JSON from --dir (recursively) and renders ONE markdown report: +// - header with addon version, prompt size, runs-per-config, GPU +// - one table per device: Config | TTFT (ms) | TPS | ppTPS | Tokens +// - optional Δ columns when --compare-dir is provided (cross-run regression) +// - a closing "best config per device" summary +// +// Two input schemas are normalised: +// desktop sweep: { models:[{modelId, cases:[{quantization, runtimeConfig, +// metrics:{ttftMsMean,tpsMean,ppTpsMean,promptTokens, +// generatedTokens}, status, isBaseline}]}], repeats, ... 
} +// mobile report: { addon, device:{name}, results:[{test, metrics:{ttft_ms, +// tps, pp_tps, generated_tokens, prompt_tokens}}] } + +const fs = require('fs') +const path = require('path') + +function parseArgs (argv) { + const a = { + dir: null, + output: null, + desktopDevice: 'Desktop (linux-x64 GPU)', + addonVersion: null, + compareDir: null + } + for (let i = 2; i < argv.length; i++) { + const t = argv[i] + if (t === '--dir') a.dir = argv[++i] + else if (t === '--output') a.output = argv[++i] + else if (t === '--desktop-device') a.desktopDevice = argv[++i] + else if (t === '--addon-version') a.addonVersion = argv[++i] + else if (t === '--compare-dir') a.compareDir = argv[++i] + } + if (!a.dir) { + throw new Error( + 'usage: render-report.js --dir [--output ] ' + + '[--desktop-device ] [--addon-version ] [--compare-dir ]' + ) + } + return a +} + +function walkJson (dir) { + const out = [] + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + const p = path.join(dir, entry.name) + if (entry.isDirectory()) out.push(...walkJson(p)) + else if (entry.name.endsWith('.json')) out.push(p) + } + return out +} + +function num (v) { + return typeof v === 'number' && Number.isFinite(v) ? v : null +} + +function int (v) { + const n = num(v) + return n !== null ? Math.round(n) : null +} + +// Collect metadata and rows from all files in a directory. +// Returns { rows, meta } where meta = { addonVersion, repeats, promptTokens }. +function loadDir (dir, desktopDevice) { + const files = walkJson(dir) + const meta = { addonVersion: null, repeats: null, promptTokens: null } + let rows = [] + for (const f of files) { + const r = rowsFromFile(f, desktopDevice, meta) + rows.push(...r) + } + rows = dedupe(rows) + return { rows, meta } +} + +// Normalise any report file into rows: { device, config, ttft, tps, ppTps, tokens, crashed } +// Also fills in meta fields when found. 
+// Identity of a result row: one row per (device, config) pair. Shared by
+// dedupe(), buildBaselineMap() and render() so the three always agree.
+function rowKey (r) {
+  return `${r.device}@@${r.config}`
+}
+
+// Parses one benchmark result file into normalized report rows.
+//
+// Two schemas are recognized:
+//   - desktop sweep report: { repeats, models: [{ modelId, cases: [...] }] }
+//   - mobile perf report:   { addon, device: { name }, results: [...] }
+// Anything else (unreadable file, invalid JSON, or JSON that is not an object
+// in one of these shapes) yields an empty array, so scanning a directory of
+// mixed artifacts stays best-effort instead of aborting the whole report.
+//
+// `meta` is filled in as a side effect, first value wins: repeats and
+// promptTokens (desktop), addonVersion and promptTokens (mobile).
+// `num` / `int` are this script's numeric helpers, defined earlier in the
+// file; the checks below rely on them returning null for unusable values.
+function rowsFromFile (file, desktopDevice, meta) {
+  let doc
+  try { doc = JSON.parse(fs.readFileSync(file, 'utf8')) } catch { return [] }
+  // `null`, numbers and strings are valid JSON too; none of them is a report.
+  if (doc === null || typeof doc !== 'object') return []
+
+  const firstModel = Array.isArray(doc.models) ? doc.models[0] : null
+  if (firstModel && Array.isArray(firstModel.cases)) return desktopRows(doc, desktopDevice, meta)
+  if (doc.device && Array.isArray(doc.results)) return mobileRows(doc, meta)
+  return []
+}
+
+// Rows for the desktop sweep schema. Every case is attributed to
+// `desktopDevice`; baseline cases are left out of the report.
+function desktopRows (doc, desktopDevice, meta) {
+  const rows = []
+  if (num(doc.repeats) !== null && meta.repeats === null) meta.repeats = doc.repeats
+  for (const model of doc.models) {
+    // Only the first model's `cases` was checked by the caller; skip
+    // malformed entries instead of throwing on them.
+    if (!model || !Array.isArray(model.cases)) continue
+    for (const c of model.cases) {
+      if (!c || c.isBaseline) continue
+      const rc = c.runtimeConfig || {}
+      const config = configLabel({
+        model: `${model.modelId}-${c.quantization}`,
+        backend: rc.device,
+        rb: rc['reasoning-budget'],
+        ck: rc['cache-type-k'],
+        cv: rc['cache-type-v']
+      })
+      const m = c.metrics || {}
+      if (int(m.promptTokens) !== null && meta.promptTokens === null) {
+        meta.promptTokens = int(m.promptTokens)
+      }
+      // Any explicit status other than ok / partial-failure counts as a crash.
+      const crashed = c.status && c.status !== 'ok' && c.status !== 'partial-failure'
+      rows.push({
+        device: desktopDevice,
+        config,
+        ttft: num(m.ttftMsMean),
+        tps: num(m.tpsMean),
+        ppTps: num(m.ppTpsMean),
+        tokens: int(m.generatedTokens),
+        crashed: !!crashed
+      })
+    }
+  }
+  return rows
+}
+
+// Rows for the mobile perf-report schema. A result is treated as crashed when
+// its status says so, or when it carries none of TTFT / TPS / ppTPS.
+function mobileRows (doc, meta) {
+  const rows = []
+  if (doc.addon && meta.addonVersion === null) meta.addonVersion = doc.addon
+  // Tolerate a non-string or blank name rather than throwing or emitting an
+  // empty device heading.
+  const device = String(doc.device.name || '').trim() || 'unknown'
+  for (const r of doc.results) {
+    if (!r) continue
+    const m = r.metrics || {}
+    if (int(m.prompt_tokens) !== null && meta.promptTokens === null) {
+      meta.promptTokens = int(m.prompt_tokens)
+    }
+    const crashed = (r.status && String(r.status).toLowerCase() === 'crashed') ||
+      (num(m.ttft_ms) === null && num(m.tps) === null && num(m.pp_tps) === null)
+    rows.push({
+      device,
+      config: r.test || '(unknown)',
+      ttft: num(m.ttft_ms),
+      tps: num(m.tps),
+      ppTps: num(m.pp_tps),
+      tokens: int(m.generated_tokens),
+      crashed: !!crashed
+    })
+  }
+  return rows
+}
+
+// Builds the bracketed label for a desktop case, e.g.
+// `[model] [gpu] [rb=0] [kv=f16]`. Empty parts are omitted; when the K and V
+// cache types differ they are shown as `[kv=K/V]`, with `?` for a missing side.
+function configLabel ({ model, backend, rb, ck, cv }) {
+  const parts = [`[${model}]`]
+  if (backend) parts.push(`[${backend}]`)
+  if (rb !== undefined && rb !== null && rb !== '') parts.push(`[rb=${rb}]`)
+  if (ck || cv) parts.push(ck === cv ? `[kv=${ck}]` : `[kv=${ck || '?'}/${cv || '?'}]`)
+  return parts.join(' ')
+}
+
+// Formats a metric with a fixed number of decimals; a missing value is `-`.
+function fmt (v, decimals = 2) {
+  if (v === null || v === undefined) return '-'
+  const scale = Math.pow(10, decimals)
+  return (Math.round(v * scale) / scale).toFixed(decimals)
+}
+
+// Formats a delta like fmt(), with an explicit `+` on non-negative values.
+function fmtDelta (v) {
+  if (v === null) return '-'
+  const sign = v >= 0 ? '+' : ''
+  return `${sign}${fmt(v)}`
+}
+
+// Collapses rows that share a key to a single row. The first row seen wins,
+// except that a non-crashed row replaces a previously kept crashed one.
+function dedupe (rows) {
+  const byKey = new Map()
+  for (const r of rows) {
+    const k = rowKey(r)
+    const prev = byKey.get(k)
+    if (!prev || (prev.crashed && !r.crashed)) byKey.set(k, r)
+  }
+  return [...byKey.values()]
+}
+
+// Indexes baseline rows by key for the Δ columns. If a key repeats, the last
+// row wins.
+function buildBaselineMap (baseRows) {
+  const m = new Map()
+  for (const r of baseRows) m.set(rowKey(r), r)
+  return m
+}
+
+// Renders the Markdown report: a title and metadata header, one table per
+// device (the desktop device first, the rest alphabetically), then a
+// "best configuration" summary table.
+//
+//   rows            normalized rows, as produced by rowsFromFile()
+//   desktopDevice   device name that is sorted first
+//   meta            { addonVersion, promptTokens, repeats }, null when unknown
+//   addonVersionArg explicit addon version; takes precedence over meta
+//   baselineMap     Map from buildBaselineMap(), or null/undefined to omit
+//                   the Δ columns
+//
+// Returns the report as a single string ending in a newline.
+function render (rows, desktopDevice, meta, addonVersionArg, baselineMap) {
+  const byDevice = new Map()
+  for (const r of rows) {
+    if (!byDevice.has(r.device)) byDevice.set(r.device, [])
+    byDevice.get(r.device).push(r)
+  }
+  const devices = [...byDevice.keys()].sort((a, b) => {
+    if (a === desktopDevice) return -1
+    if (b === desktopDevice) return 1
+    return a.localeCompare(b)
+  })
+
+  const addonVersion = addonVersionArg || meta.addonVersion || null
+  // `!= null` so an omitted (undefined) baseline also means "not comparing".
+  const comparing = baselineMap != null
+  // The Tokens column is shown only if at least one row has a token count.
+  const hasTokens = rows.some(r => r.tokens !== null)
+
+  const metrics = [
+    { key: 'ttft', title: 'TTFT (ms)', delta: 'Δ TTFT' },
+    { key: 'tps', title: 'TPS', delta: 'Δ TPS' },
+    { key: 'ppTps', title: 'ppTPS', delta: 'Δ ppTPS' }
+  ]
+  const mdRow = cells => `| ${cells.join(' | ')} |`
+
+  // One column list drives the header, the separator and every data row, so
+  // the four table variants (with/without Δ, with/without Tokens) cannot drift.
+  const headerCells = ['Config']
+  for (const metric of metrics) {
+    headerCells.push(metric.title)
+    if (comparing) headerCells.push(metric.delta)
+  }
+  if (hasTokens) headerCells.push('Tokens')
+  const separatorCells = headerCells.map((_, i) => (i === 0 ? '---' : '---:'))
+
+  const rowCells = r => {
+    const base = comparing ? baselineMap.get(rowKey(r)) : undefined
+    const cells = [r.config]
+    for (const { key } of metrics) {
+      if (r.crashed) {
+        cells.push('Crashed')
+        if (comparing) cells.push('-')
+        continue
+      }
+      cells.push(fmt(r[key]))
+      if (comparing) {
+        // A delta needs a usable value on both sides.
+        const comparable = base && !base.crashed && r[key] !== null && base[key] !== null
+        cells.push(fmtDelta(comparable ? r[key] - base[key] : null))
+      }
+    }
+    if (hasTokens) cells.push(r.crashed || r.tokens === null ? '-' : r.tokens)
+    return cells
+  }
+
+  const lines = []
+  lines.push('# Qwen3.5 Benchmark Results')
+  lines.push('')
+
+  // Header metadata block
+  const headerParts = []
+  if (addonVersion) headerParts.push(`**Addon:** \`${addonVersion}\``)
+  if (meta.promptTokens !== null) headerParts.push(`**Prompt:** ${meta.promptTokens} tokens`)
+  if (meta.repeats !== null) headerParts.push(`**Runs per config:** ${meta.repeats}`)
+  if (headerParts.length) {
+    lines.push(headerParts.join(' · '))
+    lines.push('')
+  }
+
+  lines.push(
+    'Metrics are addon `runtimeStats`: ' +
+    'TTFT = time to first token (ms), TPS = decode tokens/sec, ' +
+    'ppTPS = prefill tokens/sec, Tokens = generated tokens.' +
+    (comparing ? ' Δ columns show current minus baseline.' : '') +
+    ' `Crashed` = configuration crashed or produced no output.'
+  )
+  lines.push('')
+
+  for (const device of devices) {
+    const items = byDevice.get(device).slice().sort((a, b) => a.config.localeCompare(b.config))
+    lines.push(`## ${device}`)
+    lines.push('')
+    lines.push(mdRow(headerCells))
+    lines.push(mdRow(separatorCells))
+    for (const r of items) lines.push(mdRow(rowCells(r)))
+    lines.push('')
+  }
+
+  lines.push('## Best configuration per device')
+  lines.push('')
+  lines.push('| Device | Highest TPS | Highest ppTPS |')
+  lines.push('| --- | --- | --- |')
+  for (const device of devices) {
+    const ok = byDevice.get(device).filter(r => !r.crashed)
+    const bestTps = ok.filter(r => r.tps !== null).sort((a, b) => b.tps - a.tps)[0]
+    const bestPp = ok.filter(r => r.ppTps !== null).sort((a, b) => b.ppTps - a.ppTps)[0]
+    const tpsCell = bestTps ? `${bestTps.config} — ${fmt(bestTps.tps)}` : '-'
+    const ppCell = bestPp ? `${bestPp.config} — ${fmt(bestPp.ppTps)}` : '-'
+    lines.push(`| ${device} | ${tpsCell} | ${ppCell} |`)
+  }
+  lines.push('')
+  return lines.join('\n') + '\n'
+}
+
+// CLI entry point: loads the results directory, optionally a baseline
+// directory to diff against, and writes the report to --output or stdout.
+function main () {
+  const args = parseArgs(process.argv)
+  const emit = text => {
+    if (args.output) fs.writeFileSync(args.output, text)
+    else process.stdout.write(text)
+  }
+
+  const { rows, meta } = loadDir(args.dir, args.desktopDevice)
+  // Nothing to render: say so before spending time on the baseline.
+  if (rows.length === 0) {
+    emit('No benchmark results found.\n')
+    return
+  }
+
+  let baselineMap = null
+  if (args.compareDir) {
+    const { rows: baseRows } = loadDir(args.compareDir, args.desktopDevice)
+    baselineMap = buildBaselineMap(baseRows)
+  }
+
+  emit(render(rows, args.desktopDevice, meta, args.addonVersion, baselineMap))
+}
+
+main()
diff --git a/packages/llm-llamacpp/benchmarks/performance/reporters.js b/packages/llm-llamacpp/benchmarks/performance/reporters.js index 9293b29e5c..d21b36d602 100644 --- a/packages/llm-llamacpp/benchmarks/performance/reporters.js +++ b/packages/llm-llamacpp/benchmarks/performance/reporters.js @@ -45,12 +45,12 @@ function toMarkdown (report) { lines.push('') for (const model of report.models) { lines.push(`## Model: ${model.modelId}`) - lines.push('| Quantization | Device | Ctx Size | Batch Size | Ubatch Size | Flash Attn | Threads | Cache K | Cache V | Prompt Case | Status | Load Mean | Load Std | Run Mean | Run Std | TTFT Mean | TTFT Std | TPS Mean | TPS Std | Unload Mean | Unload Std | Prompt Tokens | Generated Tokens | Quality Match | Error |') - lines.push('|---|---|---:|---:|---:|---|---:|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|') + lines.push('| Quantization | Reasoning Budget | Device | Ctx Size | Batch Size | Ubatch Size | Flash Attn | Threads | Cache K | Cache V | Prompt Case | Status | TTFT Mean | TTFT Std | TPS Mean | TPS Std | ppTPS Mean | ppTPS Std | Load Mean | Load Std | Run Mean | Run Std | Unload Mean 
| Unload Std | Prompt Tokens | Generated Tokens | Quality Match | Error |') + lines.push('|---|---|---|---:|---:|---:|---|---:|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|') for (const item of model.cases) { const runtimeConfig = item.runtimeConfig || {} - const quality = item.qualityMatch != null ? item.qualityMatch.toFixed(3) : '' const quantizationCell = item.isBaseline ? 'default' : (item.quantization ?? '') + const rbCell = item.isBaseline ? 'default' : (runtimeConfig['reasoning-budget'] != null ? String(runtimeConfig['reasoning-budget']) : '') const deviceCell = item.isBaseline ? 'default' : (runtimeConfig.device != null ? String(runtimeConfig.device) : '') const ctxSizeCell = item.isBaseline ? 'default' : (runtimeConfig['ctx-size'] != null ? String(runtimeConfig['ctx-size']) : '') const batchSizeCell = item.isBaseline ? 'default' : (runtimeConfig['batch-size'] != null ? String(runtimeConfig['batch-size']) : '') @@ -65,14 +65,15 @@ function toMarkdown (report) { ? truncateText(item.error.message, 120) : '' lines.push( - `| ${quantizationCell} | ${deviceCell} | ${ctxSizeCell} | ${batchSizeCell} | ${ubatchSizeCell} | ${flashAttnCell} | ${threadsCell} | ${cacheKCell} | ${cacheVCell} | ${item.promptCase ?? ''} | ${item.status ?? ''}` + - ` | ${item.metrics?.loadMsMean ?? ''} | ${item.metrics?.loadMsStd ?? ''}` + - ` | ${item.metrics?.runMsMean ?? ''} | ${item.metrics?.runMsStd ?? ''}` + + `| ${quantizationCell} | ${rbCell} | ${deviceCell} | ${ctxSizeCell} | ${batchSizeCell} | ${ubatchSizeCell} | ${flashAttnCell} | ${threadsCell} | ${cacheKCell} | ${cacheVCell} | ${item.promptCase ?? ''} | ${item.status ?? ''}` + ` | ${item.metrics?.ttftMsMean ?? ''} | ${item.metrics?.ttftMsStd ?? ''}` + ` | ${item.metrics?.tpsMean ?? ''} | ${item.metrics?.tpsStd ?? ''}` + + ` | ${item.metrics?.ppTpsMean ?? ''} | ${item.metrics?.ppTpsStd ?? ''}` + + ` | ${item.metrics?.loadMsMean ?? ''} | ${item.metrics?.loadMsStd ?? 
''}` + + ` | ${item.metrics?.runMsMean ?? ''} | ${item.metrics?.runMsStd ?? ''}` + ` | ${item.metrics?.unloadMsMean ?? ''} | ${item.metrics?.unloadMsStd ?? ''}` + ` | ${item.metrics?.promptTokens ?? ''} | ${item.metrics?.generatedTokens ?? ''}` + - ` | ${quality} | ${errorCell} |` + ` | ${item.qualityMatch != null ? item.qualityMatch.toFixed(3) : ''} | ${errorCell} |` ) } lines.push('') diff --git a/packages/llm-llamacpp/benchmarks/performance/test-prompts.json b/packages/llm-llamacpp/benchmarks/performance/test-prompts.json index d8bce45bbb..abeffddb47 100644 --- a/packages/llm-llamacpp/benchmarks/performance/test-prompts.json +++ b/packages/llm-llamacpp/benchmarks/performance/test-prompts.json @@ -8,7 +8,7 @@ }, { "role": "user", - "content": "You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. 
Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. 
Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. " + "content": "Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. 
Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience." 
} ] }, diff --git a/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js b/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js index 09fefedb00..5e87bec4bd 100644 --- a/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js +++ b/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js @@ -66,9 +66,15 @@ function validateGroups (functionNames) { } const groups = JSON.parse(fs.readFileSync(groupsFile, 'utf-8')) const nameSet = new Set(functionNames) + // Benchmark shards (benchmark-perf-*.test.js -> runBenchmarkPerf*) are + // scheduled only by the Benchmark Performance workflow via an explicit + // test_groups override, and are deliberately absent from test-groups.json + // so normal mobile integration runs never trigger the heavy benchmark. + // Exclude them from the group-coverage requirement. + const isOverrideOnly = (n) => n.startsWith('runBenchmarkPerf') for (const [platform, splits] of Object.entries(groups)) { const covered = new Set(Object.values(splits).flat()) - const missing = functionNames.filter(n => !covered.has(n)) + const missing = functionNames.filter(n => !covered.has(n) && !isOverrideOnly(n)) const extra = [...covered].filter(n => !nameSet.has(n)) if (missing.length) { throw new Error( diff --git a/packages/llm-llamacpp/test/integration/_benchmark-perf.js b/packages/llm-llamacpp/test/integration/_benchmark-perf.js new file mode 100644 index 0000000000..949e11fd69 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/_benchmark-perf.js @@ -0,0 +1,222 @@ +'use strict' + +// Shared runner for the mobile perf benchmark. Sharded into one test file per +// (model x KV-cache type) (benchmark-perf-{size}-{quant}-{kv}.test.js) +// so each Device Farm session finishes inside the fixed 20-minute iOS per-test +// ceiling; this module holds the logic they all share. Underscore prefix keeps +// it out of the mobile test generator (it is not a *.test.js file).
+//
+// Each shard sweeps its model across both devices (gpu, cpu) and both
+// reasoning-budget values (-1, 0), recording TTFT / TPS / ppTPS. The full
+// matrix (2 sizes x 5 quants x 3 KV-cache types x 2 devices x 2 budgets) is
+// split across the shard files; nothing here reduces it.
+
+const path = require('bare-path')
+const LlmLlamacpp = require('../../index.js')
+const { ensureModel, safeTest } = require('./utils')
+const { attachSpecLogger } = require('./spec-logger')
+const { recordPerformance, isMobile } = require('./_perf-helper.js')
+const os = require('bare-os')
+
+// Sweep dimensions covered inside every shard.
+const DEVICES = ['gpu', 'cpu']
+const REASONING_BUDGETS = ['-1', '0']
+
+// Addon config shared by every combination; `device` and the two cache-type
+// keys are added per load.
+const RUNTIME = {
+  gpu_layers: '999',
+  ctx_size: '2048',
+  n_predict: '512',
+  temp: '0.1',
+  seed: '42',
+  verbosity: '0'
+}
+
+// ~512-token prompt (verified against the Qwen3.5 tokenizer at 518 templated tokens).
+const PROMPT = [
+  { role: 'system', content: 'You are a helpful assistant.' },
+  {
+    role: 'user',
+    content: 'Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience.'
+  }
+]
+
+// Reads an integer setting from the environment: bare-os `getEnv` when it is
+// available, `process.env` otherwise. Returns `fallback` when the variable is
+// unset, not an integer, or below `min`. `min` defaults to 1, so by default
+// only positive values are accepted.
+function _envInt (key, fallback, min = 1) {
+  let raw = ''
+  if (typeof os.getEnv === 'function') raw = os.getEnv(key) || ''
+  if (!raw && typeof process !== 'undefined' && process.env) raw = process.env[key] || ''
+  const v = parseInt(raw, 10)
+  return Number.isFinite(v) && v >= min ? v : fallback
+}
+const PERF_RUNS = _envInt('QVAC_PERF_RUNS', 1)
+// 0 is accepted here so warmup can be switched off; the warmup loop below
+// simply does not execute in that case.
+const PERF_WARMUP_RUNS = _envInt('QVAC_PERF_WARMUP_RUNS', 1, 0)
+
+// Download spec (id, file name, URL) for a Qwen3.5 GGUF of the given size and
+// quantization.
+function modelSpec (size, quant) {
+  return {
+    id: `qwen3.5-${size.toLowerCase()}-${quant}`,
+    name: `Qwen3.5-${size}-${quant}.gguf`,
+    url: `https://huggingface.co/unsloth/Qwen3.5-${size}-GGUF/resolve/main/Qwen3.5-${size}-${quant}.gguf`
+  }
+}
+
+// Download spec (id, file name, URL) for a Qwen3-1.7B GGUF of the given
+// quantization.
+function modelSpec17b (quant) {
+  return {
+    id: `qwen3-1.7b-${quant}`,
+    name: `Qwen3-1.7B-${quant}.gguf`,
+    url: `https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-${quant}.gguf`
+  }
+}
+
+// Text used when logging a caught failure: the message if there is one,
+// otherwise the thrown value itself.
+function errText (err) {
+  return err && err.message ? err.message : err
+}
+
+// Best-effort unload. Cleanup must never mask the result being reported, so
+// every failure here (sync throw, rejected promise, missing method) is ignored.
+async function safeUnload (addon) {
+  if (!addon || typeof addon.unload !== 'function') return
+  try { await addon.unload() } catch (_) {}
+}
+
+// Registers the benchmark test for one Qwen3-1.7B (quant x kv-cache type).
+function benchmarkModel17b (quant, cacheType) {
+  registerBenchmark(modelSpec17b(quant), cacheType)
+}
+
+// Runs the prompt once with the given reasoning budget and collects the
+// streamed output. Returns { output, startTime, endTime, stats }; `stats` is
+// `response.stats` or null. Throws if the response reported an error.
+async function runInference (addon, prompt, reasoningBudget) {
+  const startTime = Date.now()
+  const response = await addon.run(prompt, {
+    generationParams: { reasoning_budget: parseInt(reasoningBudget, 10) }
+  })
+  const chunks = []
+  let error = null
+  response
+    .onUpdate(data => { chunks.push(data) })
+    .onError(err => { error = err })
+  await response.await()
+  if (error) throw new Error('inference failed: ' + error)
+  return { output: chunks.join('').trim(), startTime, endTime: Date.now(), stats: response.stats || null }
+}
+
+// Records a placeholder row with no metrics. The renderer shows any row
+// without TTFT/TPS/ppTPS as `Crashed`. We emit one up-front for every combo
+// BEFORE loading/running it, so a hard native crash that kills the Device
+// Farm session still leaves a `Crashed` row in the logs (the mobile reporter
+// flushes each record to console immediately). A successful run records the
+// real metrics afterwards, which supersedes the placeholder in the renderer.
+function recordCrashedPlaceholder (label, device, model) {
+  recordPerformance(label, 0, { stats: null, deviceId: device, scenario: 'benchmark-perf', model })
+}
+
+// Registers the benchmark test for one (model x quant x kv-cache type),
+// sweeping device x reasoning-budget. One Device Farm session per call.
+// kv-cache type is set as cache-type-k/v at load time; Adreno devices don't
+// support quantized KV cache, so those combos may crash — reported as Crashed.
+function benchmarkModel (size, quant, cacheType) {
+  registerBenchmark(modelSpec(size, quant), cacheType)
+}
+
+// Shared implementation behind benchmarkModel() and benchmarkModel17b(): the
+// two differ only in how the model spec is built.
+function registerBenchmark (spec, cacheType) {
+  const id = `${spec.id}-${cacheType}`
+  safeTest(`Mobile perf benchmark: ${id} (TTFT / TPS / ppTPS)`, {
+    timeout: 1_800_000,
+    skip: !isMobile
+  }, async t => {
+    const specLogger = attachSpecLogger({ forwardToConsole: true })
+    try {
+      const [modelName, dirPath] = await ensureModel({ modelName: spec.name, downloadUrl: spec.url })
+      const modelPath = path.join(dirPath, modelName)
+
+      for (const device of DEVICES) {
+        const labelFor = rb => `[${spec.id}] [${device}] [rb=${rb}] [kv=${cacheType}]`
+        const modelFor = rb => `${id}-${device}-rb${rb}`
+        // Up-front Crashed placeholders for every combo on this device.
+        for (const rb of REASONING_BUDGETS) recordCrashedPlaceholder(labelFor(rb), device, modelFor(rb))
+
+        let addon = null
+        try {
+          addon = new LlmLlamacpp({
+            files: { model: [modelPath] },
+            config: { ...RUNTIME, device, 'cache-type-k': cacheType, 'cache-type-v': cacheType },
+            logger: { error: () => {}, warn: () => {}, info: () => {}, debug: () => {} },
+            opts: { stats: true }
+          })
+          await addon.load()
+        } catch (loadErr) {
+          // Load failed (e.g. unsupported quantized KV cache) — placeholders
+          // remain Crashed for this device's combos. Move on.
+          t.comment(`[${id}] [${device}] load failed (reported as Crashed): ${errText(loadErr)}`)
+          await safeUnload(addon)
+          continue
+        }
+
+        try {
+          for (const rb of REASONING_BUDGETS) {
+            const label = labelFor(rb)
+            try {
+              for (let w = 1; w <= PERF_WARMUP_RUNS; w++) {
+                const { endTime, startTime } = await runInference(addon, PROMPT, rb)
+                t.comment(`${label} warmup ${w}/${PERF_WARMUP_RUNS} (${endTime - startTime}ms) - perf NOT recorded`)
+              }
+              for (let run = 1; run <= PERF_RUNS; run++) {
+                const { output, startTime, endTime, stats } = await runInference(addon, PROMPT, rb)
+                // Real metrics supersede the Crashed placeholder in the renderer.
+                t.comment(recordPerformance(label, endTime - startTime, {
+                  stats,
+                  deviceId: device,
+                  scenario: 'benchmark-perf',
+                  model: modelFor(rb)
+                }))
+                t.ok(output.length > 0, `${label} run ${run}/${PERF_RUNS} produced output`)
+              }
+            } catch (runErr) {
+              // Catchable run failure — placeholder stays Crashed for this combo.
+              t.comment(`${label} run failed (reported as Crashed): ${errText(runErr)}`)
+            }
+          }
+        } finally {
+          await safeUnload(addon)
+        }
+      }
+    } finally {
+      specLogger.release()
+    }
+  })
+}
+
+module.exports = { benchmarkModel, modelSpec, benchmarkModel17b, modelSpec17b }
diff --git a/packages/llm-llamacpp/test/integration/_perf-helper.js b/packages/llm-llamacpp/test/integration/_perf-helper.js index 86d2bc9681..ffa71ab0b8 100644 --- a/packages/llm-llamacpp/test/integration/_perf-helper.js +++ b/packages/llm-llamacpp/test/integration/_perf-helper.js @@ -277,6 +277,7 @@ function recordPerformance (label, totalTime, extra) { const ttftMs = stats ? _num(stats.TTFT) : null const tps = stats ? _num(stats.TPS) : null + const ppTps = stats ? _num(stats.ppTPS) : null const generatedTokens = stats ? 
_num(stats.generatedTokens) : null const promptTokens = stats ? _num(stats.promptTokens) : null @@ -309,7 +310,8 @@ function recordPerformance (label, totalTime, extra) { ttft_ms: ttftMs !== null ? Math.round(ttftMs) : null, generated_tokens: generatedTokens, prompt_tokens: promptTokens, - tps: tps !== null ? Number(tps.toFixed(2)) : null + tps: tps !== null ? Number(tps.toFixed(2)) : null, + pp_tps: ppTps !== null ? Number(ppTps.toFixed(2)) : null }, { scenario: (extra && extra.scenario) || 'default', model: (extra && extra.model) || null, @@ -342,6 +344,7 @@ function recordPerformance (label, totalTime, extra) { ` - Prefill / TTFT: ${ttftMs !== null ? Math.round(ttftMs) + 'ms' : 'n/a'}`, ` - Decode: ${decodeMs !== null ? decodeMs + 'ms' : 'n/a'}`, ` - TPS: ${tps !== null ? tps.toFixed(2) : 'n/a'}`, + ` - ppTPS: ${ppTps !== null ? ppTps.toFixed(2) : 'n/a'}`, ` - Tokens: ${generatedTokens !== null ? generatedTokens : 'n/a'} gen / ${promptTokens !== null ? promptTokens : 'n/a'} prompt` ] return lines.join('\n') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js new file mode 100644 index 0000000000..b2011f5e3f --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_0', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js new file mode 100644 index 0000000000..62cbd0db72 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_0', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js 
b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js new file mode 100644 index 0000000000..12d3897dba --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_0', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js new file mode 100644 index 0000000000..ec653390c3 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_1', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js new file mode 100644 index 0000000000..16e6559a6e --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_1', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js new file mode 100644 index 0000000000..e8ea6d15f4 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_1', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js new file mode 100644 index 0000000000..2fe48245cf --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js @@ 
-0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_K_M', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js new file mode 100644 index 0000000000..829734d9b1 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_K_M', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js new file mode 100644 index 0000000000..b8343a978b --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q4_K_M', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js new file mode 100644 index 0000000000..b47efb3d9e --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q6_K', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js new file mode 100644 index 0000000000..7debe08857 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q6_K', 'q4_0') diff --git 
a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js new file mode 100644 index 0000000000..087dace127 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q6_K', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js new file mode 100644 index 0000000000..df404731e8 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q8_0', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js new file mode 100644 index 0000000000..c06838216d --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q8_0', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js new file mode 100644 index 0000000000..2cf00cb768 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('0.8B', 'Q8_0', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js new file mode 100644 index 0000000000..0eaa6ae7e1 --- /dev/null +++ 
b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q4_0', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js new file mode 100644 index 0000000000..ebda1f97e9 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q4_0', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js new file mode 100644 index 0000000000..7889d715a9 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q4_0', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js new file mode 100644 index 0000000000..26e208ada4 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q4_K_M', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js new file mode 100644 index 0000000000..65eb74c8a2 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q4_K_M', 'q4_0') diff --git 
a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js new file mode 100644 index 0000000000..5c958e4768 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q4_K_M', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js new file mode 100644 index 0000000000..0c2c448f4f --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q8_0', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js new file mode 100644 index 0000000000..947d7b7534 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q8_0', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js new file mode 100644 index 0000000000..11531dac36 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel17b } = require('./_benchmark-perf.js') +benchmarkModel17b('Q8_0', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js new file mode 100644 index 0000000000..0d520054b5 --- /dev/null +++ 
b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_0', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js new file mode 100644 index 0000000000..c8bcd0cc2a --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_0', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js new file mode 100644 index 0000000000..d059dc5acd --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_0', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js new file mode 100644 index 0000000000..4c9b824d03 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_1', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js new file mode 100644 index 0000000000..fb6ecd7440 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_1', 'q4_0') diff --git 
a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js new file mode 100644 index 0000000000..982192cba9 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_1', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js new file mode 100644 index 0000000000..897b184957 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_K_M', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js new file mode 100644 index 0000000000..58f0a64f76 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_K_M', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js new file mode 100644 index 0000000000..5fab3f553c --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q4_K_M', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js new file mode 100644 index 0000000000..9f32e33a64 --- /dev/null +++ 
b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q6_K', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js new file mode 100644 index 0000000000..f0497f88ba --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q6_K', 'q4_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js new file mode 100644 index 0000000000..b86363d867 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q6_K', 'q8_0') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js new file mode 100644 index 0000000000..8f47551a85 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q8_0', 'f16') diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js new file mode 100644 index 0000000000..37a82b6ddb --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q8_0', 'q4_0') diff --git 
a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js new file mode 100644 index 0000000000..827ff27d24 --- /dev/null +++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js @@ -0,0 +1,3 @@ +'use strict' +const { benchmarkModel } = require('./_benchmark-perf.js') +benchmarkModel('2B', 'Q8_0', 'q8_0') diff --git a/packages/llm-llamacpp/test/mobile/integration.auto.cjs b/packages/llm-llamacpp/test/mobile/integration.auto.cjs index 6f6d1fd7ec..9eb42d5f8e 100644 --- a/packages/llm-llamacpp/test/mobile/integration.auto.cjs +++ b/packages/llm-llamacpp/test/mobile/integration.auto.cjs @@ -16,6 +16,201 @@ async function runApiBehaviorTest (options = {}) { // eslint-disable-line no-unu return runIntegrationModule('../integration/api-behavior.test.js', options) } +async function runBenchmarkPerf08bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && 
!__shouldRunTest('runBenchmarkPerf08bQ41F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ41Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ41Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KF16Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-08b-q6-k-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ6KQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ6KQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-q8-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-f16.test.js', options) +} + +async function runBenchmarkPerf08bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf08bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf17bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ40F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q4-0-f16.test.js', options) +} + +async function 
runBenchmarkPerf17bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ40Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q4-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf17bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ40Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q4-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf17bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ4KMF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q4-k-m-f16.test.js', options) +} + +async function runBenchmarkPerf17bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ4KMQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q4-k-m-q4-0.test.js', options) +} + +async function runBenchmarkPerf17bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ4KMQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q4-k-m-q8-0.test.js', options) +} + +async function runBenchmarkPerf17bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ80F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q8-0-f16.test.js', options) +} + +async function runBenchmarkPerf17bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if 
(typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ80Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q8-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf17bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ80Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-17b-q8-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ41Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Q40Test')) return __FILTERED + return 
runIntegrationModule('../integration/benchmark-perf-2b-q4-1-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ41Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ6KF16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KF16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ6KQ40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KQ40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-q4-0.test.js', options) +} + +async function 
runBenchmarkPerf2bQ6KQ80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KQ80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-q8-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80F16Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-f16.test.js', options) +} + +async function runBenchmarkPerf2bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Q40Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-q4-0.test.js', options) +} + +async function runBenchmarkPerf2bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars + if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Q80Test')) return __FILTERED + return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-q8-0.test.js', options) +} + async function runBitnetTest (options = {}) { // eslint-disable-line no-unused-vars if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBitnetTest')) return __FILTERED return runIntegrationModule('../integration/bitnet.test.js', options)