From da900582bec5d66a84b2c15d325a2dd448a13233 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Wed, 3 Jun 2026 16:09:53 +0100
Subject: [PATCH 01/10] =?UTF-8?q?feat:=20LLM=20benchmark=20perf=20suite=20?=
 =?UTF-8?q?(Qwen3.5)=20=E2=80=94=20desktop=20+=20mobile,=20KV-cache=20swee?=
 =?UTF-8?q?p,=20crash=20reporting,=20unified=20report?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../workflows/benchmark-perf-llm-llamacpp.yml | 129 ++++++++----
 .../benchmarks/performance/case-runner.js     |  21 +-
 .../performance/llm-parameter-sweep.config.js |  25 ++-
 .../performance/llm-parameter-sweep.js        |   7 +-
 .../performance/models.manifest.json          |  18 +-
 .../benchmarks/performance/prepare-prompts.js |  10 +-
 .../benchmarks/performance/render-report.js   | 194 ++++++++++++++++++
 .../benchmarks/performance/reporters.js       |  15 +-
 .../benchmarks/performance/test-prompts.json  |   2 +-
 .../generate-mobile-integration-tests.js      |   8 +-
 .../test/integration/_benchmark-perf.js       | 158 ++++++++++++++
 .../test/integration/_perf-helper.js          |   5 +-
 .../benchmark-perf-08b-q4-0-f16.test.js       |   3 +
 .../benchmark-perf-08b-q4-0-q4-0.test.js      |   3 +
 .../benchmark-perf-08b-q4-0-q8-0.test.js      |   3 +
 .../benchmark-perf-08b-q4-1-f16.test.js       |   3 +
 .../benchmark-perf-08b-q4-1-q4-0.test.js      |   3 +
 .../benchmark-perf-08b-q4-1-q8-0.test.js      |   3 +
 .../benchmark-perf-08b-q4-k-m-f16.test.js     |   3 +
 .../benchmark-perf-08b-q4-k-m-q4-0.test.js    |   3 +
 .../benchmark-perf-08b-q4-k-m-q8-0.test.js    |   3 +
 .../benchmark-perf-08b-q6-k-f16.test.js       |   3 +
 .../benchmark-perf-08b-q6-k-q4-0.test.js      |   3 +
 .../benchmark-perf-08b-q6-k-q8-0.test.js      |   3 +
 .../benchmark-perf-08b-q8-0-f16.test.js       |   3 +
 .../benchmark-perf-08b-q8-0-q4-0.test.js      |   3 +
 .../benchmark-perf-08b-q8-0-q8-0.test.js      |   3 +
 .../benchmark-perf-2b-q4-0-f16.test.js        |   3 +
 .../benchmark-perf-2b-q4-0-q4-0.test.js       |   3 +
 .../benchmark-perf-2b-q4-0-q8-0.test.js       |   3 +
 .../benchmark-perf-2b-q4-1-f16.test.js        |   3 +
 .../benchmark-perf-2b-q4-1-q4-0.test.js       |   3 +
 .../benchmark-perf-2b-q4-1-q8-0.test.js       |   3 +
 .../benchmark-perf-2b-q4-k-m-f16.test.js      |   3 +
 .../benchmark-perf-2b-q4-k-m-q4-0.test.js     |   3 +
 .../benchmark-perf-2b-q4-k-m-q8-0.test.js     |   3 +
 .../benchmark-perf-2b-q6-k-f16.test.js        |   3 +
 .../benchmark-perf-2b-q6-k-q4-0.test.js       |   3 +
 .../benchmark-perf-2b-q6-k-q8-0.test.js       |   3 +
 .../benchmark-perf-2b-q8-0-f16.test.js        |   3 +
 .../benchmark-perf-2b-q8-0-q4-0.test.js       |   3 +
 .../benchmark-perf-2b-q8-0-q8-0.test.js       |   3 +
 .../test/mobile/integration.auto.cjs          | 150 ++++++++++++++
 43 files changed, 754 insertions(+), 78 deletions(-)
 create mode 100644 packages/llm-llamacpp/benchmarks/performance/render-report.js
 create mode 100644 packages/llm-llamacpp/test/integration/_benchmark-perf.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index 3cb312e98b..4bfabec2c0 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -6,7 +6,8 @@ name: Benchmark Performance — LLM Parameter Sweep
 #
 # To change what runs, edit:
 #   desktop: models.manifest.json (models) + llm-parameter-sweep.config.js (sweep dims)
-#   mobile:  mobile.config.json
+#   mobile:  test/integration/_benchmark-perf.js (shared runner) + the
+#            benchmark-perf-*.test.js shards (one per model x KV-cache type)
 
 on:
   workflow_dispatch:
@@ -123,6 +124,12 @@ jobs:
       - name: Setup LLVM
         uses: tetherto/qvac/.github/actions/setup-llvm@98a6a6b6e8f3866dfdd75052a4071269ce85dc41
 
+      - name: Setup Vulkan SDK
+        uses: tetherto/qvac/.github/actions/setup-vulkan-sdk@0bbdca93da303a0b1634ba14a89cec085621078d
+        with:
+          platform: linux
+          arch: x64
+
       - name: Build addon from source
         working-directory: packages/llm-llamacpp
         run: |
@@ -157,21 +164,6 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils
 
-          echo "Installing Vulkan SDK (latest)..."
-          wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
-          mkdir -p "$HOME/vulkan" && cd "$HOME/vulkan"
-          tar xf /tmp/vulkansdk.tar.xz --strip-components=1
-          export VULKAN_SDK="$HOME/vulkan/x86_64"
-          export PATH="$VULKAN_SDK/bin:$PATH"
-          export LD_LIBRARY_PATH="$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
-          export VK_ADD_LAYER_PATH="$VULKAN_SDK/share/vulkan/explicit_layer.d"
-          export PKG_CONFIG_PATH="$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
-          echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV
-          echo "VK_ADD_LAYER_PATH=$VK_ADD_LAYER_PATH" >> $GITHUB_ENV
-          echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH" >> $GITHUB_ENV
-
           cd "$GITHUB_WORKSPACE/packages/llm-llamacpp"
           npm install
 
@@ -199,24 +191,8 @@ jobs:
         working-directory: packages/llm-llamacpp/benchmarks/performance
         run: bare ./llm-parameter-sweep.js --addon-source local
 
-      - name: Add job summary
-        if: always()
-        working-directory: packages/llm-llamacpp/benchmarks/performance
-        shell: bash
-        run: |
-          LATEST_MD=$(find results/parameter-sweep -name "*.md" -type f 2>/dev/null | sort | tail -1)
-          {
-            echo "## LLM Parameter Sweep — Desktop"
-            echo ""
-            echo "ref: \`${{ needs.context.outputs.ref }}\`"
-            echo ""
-            if [ -n "${LATEST_MD:-}" ]; then
-              cat "$LATEST_MD"
-            else
-              echo "No results file found."
-            fi
-          } >> "$GITHUB_STEP_SUMMARY"
-
+      # The run summary is rendered by the summarize job (unified desktop +
+      # mobile view); this job just uploads the raw sweep results.
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
@@ -242,7 +218,86 @@ jobs:
     with:
       repository: ${{ github.repository }}
       ref: ${{ needs.context.outputs.ref }}
-      # Schedule only the benchmark group. runBenchmarkPerfTest is deliberately
+      # One group per (model x KV-cache type) so each Device Farm session
+      # finishes inside the fixed 20-minute iOS per-test ceiling. These wrappers are deliberately
       # absent from the addon's test-groups.json, so this override is the only
-      # path that runs it — normal mobile integration runs never trigger it.
-      test_groups: '[{"name":"benchmarkPerf","grep":"runBenchmarkPerfTest"}]'
+      # path that runs them — normal mobile integration runs never trigger the
+      # benchmark.
+      test_groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+
+  # Aggregates the desktop sweep artifact and the mobile per-shard perf-report
+  # artifacts (both platforms) into a single consolidated report rendered into
+  # the run summary, so the full desktop + mobile matrix is visible from the
+  # run page without opening artifacts.
+  summarize:
+    needs:
+      - context
+      - label-gate
+      - desktop-benchmark
+      - mobile-benchmark
+    if: needs.label-gate.outputs.authorised == 'true' && always() && needs.context.result == 'success'
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout aggregator
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+        with:
+          ref: ${{ needs.context.outputs.ref }}
+          sparse-checkout: |
+            packages/llm-llamacpp/benchmarks/performance/render-report.js
+
+      - name: Setup Node.js
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
+        with:
+          node-version: lts/*
+
+      # Both desktop (llm-param-sweep-desktop-*) and mobile
+      # (perf-report-llamacpp-llm-*) artifacts feed the same renderer, so the
+      # run summary shows desktop and every device in one identical format.
+      - name: Download desktop sweep artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: llm-param-sweep-desktop-${{ github.run_number }}
+          path: combined-reports
+        continue-on-error: true
+
+      - name: Download mobile perf-report artifacts
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: perf-report-llamacpp-llm-*-${{ github.run_number }}
+          path: combined-reports
+        continue-on-error: true
+
+      - name: Render unified benchmark report
+        run: |
+          if ! find combined-reports -name "*.json" -type f 2>/dev/null | grep -q .; then
+            echo "No benchmark reports found."
+            exit 0
+          fi
+          mkdir -p benchmark-artifacts
+          node packages/llm-llamacpp/benchmarks/performance/render-report.js \
+            --dir combined-reports \
+            --output benchmark-artifacts/qwen35-benchmark-findings.md
+
+      - name: Add to run summary
+        if: always()
+        shell: bash
+        run: |
+          set +e
+          MD_FILE="benchmark-artifacts/qwen35-benchmark-findings.md"
+          if [ -f "$MD_FILE" ]; then
+            cat "$MD_FILE" >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "No consolidated benchmark report available." >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      - name: Upload consolidated report
+        if: always()
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
+        with:
+          name: qwen35-benchmark-findings-${{ github.run_number }}
+          path: benchmark-artifacts/
+          retention-days: 90
+          if-no-files-found: ignore
diff --git a/packages/llm-llamacpp/benchmarks/performance/case-runner.js b/packages/llm-llamacpp/benchmarks/performance/case-runner.js
index ca149705ac..462aa62700 100644
--- a/packages/llm-llamacpp/benchmarks/performance/case-runner.js
+++ b/packages/llm-llamacpp/benchmarks/performance/case-runner.js
@@ -5,7 +5,9 @@ const path = require('bare-path')
 const { round, average, stddev, cartesianProduct } = require('./math')
 const { stripSurroundingQuotes, normalizeArgValue } = require('./utils')
 
-const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill']
+// Focused WB run uses a single ~512-token prompt. Add 'ctx-filling' /
+// 'span-fill' back to also sweep context-fill and batch-spanning prompts.
+const PROMPT_CASES = ['long']
 const PROMPTS_PER_CASE = 1
 
 const SWEEP_OVERRIDE_KEYS = [
@@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [
   'ubatch-size',
   'flash-attn',
   'cache-type-k',
-  'cache-type-v'
+  'cache-type-v',
+  'reasoning-budget'
 ]
 
 function splitCsvArg (value, key) {
@@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) {
   const threadsValues = sweep.threads || []
   const cacheTypeKValues = sweep['cache-type-k'] || []
   const cacheTypeVValues = sweep['cache-type-v'] || []
+  const reasoningBudgetValues = sweep['reasoning-budget'] || []
 
   const cases = []
   for (const promptCase of PROMPT_CASES) {
@@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) {
   if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 &&
       flashAttnValues.length > 0 &&
       threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) {
+    const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null]
     const combos = cartesianProduct([
       supportedQuants,
       devices,
@@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) {
       flashAttnValues,
       threadsValues,
       cacheTypeKValues,
-      cacheTypeVValues
+      cacheTypeVValues,
+      rbValues
     ])
 
-    for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) {
+    for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) {
       if (Number(ubatchSize) > Number(batchSize)) {
         continue // Skip combinations where ubatchSize is greater than batchSize
       }
@@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) {
         'cache-type-k': cacheTypeK,
         'cache-type-v': cacheTypeV
       }
+      if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget
 
-      const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}`
+      const rbSuffix = reasoningBudget !== null ? `__rb=${reasoningBudget}` : ''
+      const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}`
 
       for (const promptCase of PROMPT_CASES) {
         cases.push({
@@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) {
   const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null)
   const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null)
   const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null)
+  const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null)
   const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null
   const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null
 
@@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) {
     ttftMsStd: round(stddev(ttftMsValues), 3),
     tpsMean: round(average(tpsValues), 3),
     tpsStd: round(stddev(tpsValues), 3),
+    ppTpsMean: round(average(ppTpsValues), 3),
+    ppTpsStd: round(stddev(ppTpsValues), 3),
     promptTokens: firstPromptTokens,
     generatedTokens: firstGeneratedTokens
   }
diff --git a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js
index a495ec4a9a..5cf59638d3 100644
--- a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js
+++ b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.config.js
@@ -3,10 +3,6 @@
 const fs = require('bare-fs')
 const path = require('bare-path')
 const os = require('bare-os')
-const {
-  DEFAULT_SWEEP_CTX_SIZES,
-  DEFAULT_SWEEP_BATCH_SIZES
-} = require('./utils')
 
 const DEFAULT_RESULTS_DIR = path.resolve(__dirname, 'results', 'parameter-sweep')
 const DEFAULT_MODELS_DIR = path.resolve(__dirname, 'models')
@@ -94,17 +90,20 @@ function loadModelsFromManifest () {
 
 const MODELS = loadModelsFromManifest()
 
-// Parameter sweep: full factorial (cartesian product)
+// Parameter sweep (cartesian product). Tuned to the focused WB run:
+// only quantization and reasoning-budget vary; every other dimension is
+// pinned to a single value. Edit these arrays to sweep more dimensions.
 const PARAMETER_SWEEP = {
-  quantization: ['Q4_0', 'Q4_K_M', 'Q8_0', 'F16'],
+  quantization: ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'],
   device: getDefaultSweepDevices(),
-  'ctx-size': DEFAULT_SWEEP_CTX_SIZES.map(String),
-  threads: ['2', '4', '8'],
-  'batch-size': DEFAULT_SWEEP_BATCH_SIZES.map(String), // max: 10k
-  'ubatch-size': ['128', '512'], // must be <= batch-size
-  'flash-attn': ['off', 'on'],
-  'cache-type-k': ['f16', 'q8_0', 'q4_0'],
-  'cache-type-v': ['f16', 'q8_0', 'q4_0']
+  'ctx-size': ['2048'],
+  threads: ['4'],
+  'batch-size': ['512'],
+  'ubatch-size': ['512'],
+  'flash-attn': ['off'],
+  'cache-type-k': ['f16'],
+  'cache-type-v': ['f16'],
+  'reasoning-budget': ['-1', '0']
   // verbosity: fixed at '0' (not swept)
 }
 
diff --git a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js
index c65f81c658..b82f8fbdb1 100644
--- a/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js
+++ b/packages/llm-llamacpp/benchmarks/performance/llm-parameter-sweep.js
@@ -331,7 +331,8 @@ async function main () {
         const caseMetricSamples = {
           runMs: [],
           ttftMs: [],
-          tps: []
+          tps: [],
+          ppTps: []
         }
         let firstPromptTokens = null
         let firstGeneratedTokens = null
@@ -366,6 +367,7 @@ async function main () {
                 unloadMs: null, // Will unload after all prompts
                 ttftMs: round(ttftMs, 3),
                 tps: round(stats.TPS != null ? stats.TPS : null, 3),
+                ppTps: round(stats.ppTPS != null ? stats.ppTPS : null, 3),
                 promptTokens: stats.promptTokens ?? null,
                 generatedTokens: stats.generatedTokens ?? null
               }
@@ -374,6 +376,7 @@ async function main () {
               caseMetricSamples.runMs.push(metrics.runMs)
               if (metrics.ttftMs != null) caseMetricSamples.ttftMs.push(metrics.ttftMs)
               if (metrics.tps != null) caseMetricSamples.tps.push(metrics.tps)
+              if (metrics.ppTps != null) caseMetricSamples.ppTps.push(metrics.ppTps)
               if (firstPromptTokens == null && metrics.promptTokens != null) firstPromptTokens = metrics.promptTokens
               if (firstGeneratedTokens == null && metrics.generatedTokens != null) firstGeneratedTokens = metrics.generatedTokens
               caseRepeatsAttempted += 1
@@ -537,6 +540,8 @@ async function main () {
               ttftMsStd: round(stddev(caseMetricSamples.ttftMs), 3),
               tpsMean: round(average(caseMetricSamples.tps), 3),
               tpsStd: round(stddev(caseMetricSamples.tps), 3),
+              ppTpsMean: round(average(caseMetricSamples.ppTps), 3),
+              ppTpsStd: round(stddev(caseMetricSamples.ppTps), 3),
               promptTokens: firstPromptTokens,
               generatedTokens: firstGeneratedTokens
             }
diff --git a/packages/llm-llamacpp/benchmarks/performance/models.manifest.json b/packages/llm-llamacpp/benchmarks/performance/models.manifest.json
index ceed7a7dd9..a3f521d212 100644
--- a/packages/llm-llamacpp/benchmarks/performance/models.manifest.json
+++ b/packages/llm-llamacpp/benchmarks/performance/models.manifest.json
@@ -14,15 +14,19 @@
       }
     },
     {
-      "id": "qwen3-4b",
+      "id": "qwen3.5-0.8b",
       "gguf": {
-        "repo": "unsloth/Qwen3-4B-GGUF",
+        "repo": "unsloth/Qwen3.5-0.8B-GGUF",
         "revision": "main",
-        "quantizations": ["Q4_0", "Q4_K_M", "Q8_0", "F16"]
-      },
-      "pytorch": {
-        "repo": "Qwen/Qwen3-4B",
-        "revision": "main"
+        "quantizations": ["Q4_0", "Q4_1", "Q4_K_M", "Q6_K", "Q8_0"]
+      }
+    },
+    {
+      "id": "qwen3.5-2b",
+      "gguf": {
+        "repo": "unsloth/Qwen3.5-2B-GGUF",
+        "revision": "main",
+        "quantizations": ["Q4_0", "Q4_1", "Q4_K_M", "Q6_K", "Q8_0"]
       }
     }
   ]
diff --git a/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js b/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js
index 0a70fe49d1..44fd58eb1f 100644
--- a/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js
+++ b/packages/llm-llamacpp/benchmarks/performance/prepare-prompts.js
@@ -153,6 +153,10 @@ async function tuneToBudget (model, templateMessages, budget) {
   }
 }
 
+// The 'long' prompt is the focused ~512-token benchmark prompt (verified
+// against the Qwen3.5 tokenizer). Kept in sync with the committed
+// test-prompts.json and benchmarks/performance/mobile.config.json so desktop
+// and mobile runs measure the same input.
 function basePrompts () {
   return [
     {
@@ -161,11 +165,7 @@ function basePrompts () {
         { role: 'system', content: 'You are a helpful assistant.' },
         {
           role: 'user',
-          content: (
-            'You are reviewing an incident report. Write a detailed narrative with sections for timeline, ' +
-            'root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, ' +
-            'include concrete checkpoints, and avoid bullet points unless needed for clarity. '
-          ).repeat(15)
+          content: 'Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience.'
         }
       ]
     }
diff --git a/packages/llm-llamacpp/benchmarks/performance/render-report.js b/packages/llm-llamacpp/benchmarks/performance/render-report.js
new file mode 100644
index 0000000000..0af2ea9495
--- /dev/null
+++ b/packages/llm-llamacpp/benchmarks/performance/render-report.js
@@ -0,0 +1,194 @@
+#!/usr/bin/env node
+'use strict'
+
+// Unified benchmark report renderer for the Qwen3.5 perf benchmark.
+//
+// Reads perf JSON from --dir (recursively) and renders ONE markdown report
+// with the same shape for desktop and mobile, per Gianfranco's request:
+//   - one table per device, columns: Config | TTFT (ms) | TPS | ppTPS
+//     (only the addon runtimeStats; no Total/Prefill/Decode/Platform/Mean)
+//   - a closing "best config per device" summary (highest TPS, highest ppTPS)
+//   - crashed combos render as `Crashed` instead of metrics
+//
+// Two input schemas are normalized:
+//   - desktop sweep:  { models: [ { modelId, cases: [ { quantization,
+//                       runtimeConfig, metrics:{ttftMsMean,tpsMean,ppTpsMean},
+//                       status, isBaseline } ] } ] }   (one logical device:
+//                       the desktop GPU runner)
+//   - mobile report:  { device:{name}, results:[ { test, status,
+//                       metrics:{ttft_ms,tps,pp_tps} } ] }
+
+const fs = require('fs')
+const path = require('path')
+
+function parseArgs (argv) {
+  // Flags are read from argv[2] onward; the token following a known flag is taken as its value.
+  const parsed = { dir: null, output: null, desktopDevice: 'Desktop (linux-x64 GPU)' }
+  const targets = new Map([['--dir', 'dir'], ['--output', 'output'], ['--desktop-device', 'desktopDevice']])
+  for (let idx = 2; idx < argv.length; idx++) {
+    const field = targets.get(argv[idx]) // unrecognized tokens are skipped
+    if (field !== undefined) parsed[field] = argv[++idx]
+  }
+  if (!parsed.dir) throw new Error('usage: render-report.js --dir <path> [--output <md>] [--desktop-device <name>]')
+  return parsed
+}
+
+// Lists every path under `dir` whose file name ends in ".json", descending into
+// sub-directories as they are encountered (results keep readdir order).
+function walkJson (dir) {
+  return fs.readdirSync(dir, { withFileTypes: true }).flatMap(item => {
+    const full = path.join(dir, item.name)
+    if (item.isDirectory()) return walkJson(full)
+    return item.name.endsWith('.json') ? [full] : []
+  })
+}
+
+function num (v) { // finite numbers pass through; every other value becomes null
+  return Number.isFinite(v) ? v : null
+}
+
+// Normalize any report file into rows: { device, config, ttft, tps, ppTps, crashed }.
+//   file          - path of a JSON file (desktop sweep or mobile perf report)
+//   desktopDevice - device name given to every desktop-sweep row
+// Returns [] for an unreadable file, invalid JSON, a non-object document or an
+// unknown schema. Null models, cases and results are skipped so that a single
+// malformed entry cannot abort the whole report.
+function rowsFromFile (file, desktopDevice) {
+  let doc
+  try { doc = JSON.parse(fs.readFileSync(file, 'utf8')) } catch { return [] }
+  // Valid JSON may still be null or a primitive; property access on null throws.
+  if (doc === null || typeof doc !== 'object') return []
+  const rows = []
+
+  // Desktop sweep schema
+  if (Array.isArray(doc.models) && doc.models[0] && Array.isArray(doc.models[0].cases)) {
+    for (const model of doc.models) {
+      // Detection only inspects the first model, so guard the others here.
+      if (!model || !Array.isArray(model.cases)) continue
+      for (const c of model.cases) {
+        if (!c || c.isBaseline) continue // baseline duplicates a swept combo
+        const rc = c.runtimeConfig || {}
+        const config = configLabel({
+          model: `${model.modelId}-${c.quantization}`,
+          backend: rc.device,
+          rb: rc['reasoning-budget'],
+          ck: rc['cache-type-k'],
+          cv: rc['cache-type-v']
+        })
+        const m = c.metrics || {}
+        // Any status other than ok / partial-failure is rendered as Crashed.
+        const crashed = c.status && c.status !== 'ok' && c.status !== 'partial-failure'
+        rows.push({ device: desktopDevice, config, ttft: num(m.ttftMsMean), tps: num(m.tpsMean), ppTps: num(m.ppTpsMean), crashed: !!crashed })
+      }
+    }
+    return rows
+  }
+
+  // Mobile perf-report schema
+  if (doc.device && Array.isArray(doc.results)) {
+    // String() keeps a non-string device name (e.g. a number) from breaking .trim().
+    const device = String(doc.device.name || 'unknown').trim()
+    for (const r of doc.results) {
+      if (!r) continue
+      const m = r.metrics || {}
+      // Crashed when flagged as such, or when no metric at all was recorded (placeholder row).
+      const crashed = (r.status && String(r.status).toLowerCase() === 'crashed') ||
+        (num(m.ttft_ms) === null && num(m.tps) === null && num(m.pp_tps) === null)
+      rows.push({ device, config: r.test || '(unknown)', ttft: num(m.ttft_ms), tps: num(m.tps), ppTps: num(m.pp_tps), crashed: !!crashed })
+    }
+    return rows
+  }
+
+  // Neither schema matched.
+  return rows
+}
+
+// Builds the bracketed label "[model] [backend] [rb=N] [kv=K/V]", leaving out segments that have no value.
+function configLabel ({ model, backend, rb, ck, cv }) {
+  let label = `[${model}]`
+  if (backend) label += ` [${backend}]`
+  if (rb != null && rb !== '') label += ` [rb=${rb}]` // 0 is a real budget, so no truthiness test
+  return ck || cv ? `${label} [kv=${ck || '?'}/${cv || '?'}]` : label
+}
+
+function fmt (v) { // null renders as '-'; numbers are rounded to two decimals
+  return v !== null ? Math.round(v * 100) / 100 : '-'
+}
+
+function dedupe (rows) {
+  // Keeps one row per (device, config): the first one seen, unless that row is
+  // crashed and a later one is not, in which case the later row replaces it.
+  const kept = new Map()
+  rows.forEach(row => {
+    const key = `${row.device}@@${row.config}`
+    if (!kept.has(key) || (kept.get(key).crashed && !row.crashed)) kept.set(key, row)
+  })
+  return Array.from(kept.values())
+}
+
+// Renders the full markdown report: one metrics table per device (the desktop
+// device first, the rest sorted by name) followed by a best-config summary table.
+function render (rows, desktopDevice) {
+  const grouped = new Map()
+  for (const row of rows) {
+    const bucket = grouped.get(row.device)
+    if (bucket) bucket.push(row)
+    else grouped.set(row.device, [row])
+  }
+  const deviceOrder = Array.from(grouped.keys()).sort((a, b) => {
+    if (a === desktopDevice) return -1
+    if (b === desktopDevice) return 1
+    return a.localeCompare(b)
+  })
+  // "config — value" for the non-crashed row with the highest `field`, or '-' if none has one.
+  const bestCell = (device, field) => {
+    const ranked = grouped.get(device)
+      .filter(r => !r.crashed && r[field] !== null)
+      .sort((a, b) => b[field] - a[field])
+    return ranked.length ? `${ranked[0].config} — ${fmt(ranked[0][field])}` : '-'
+  }
+
+  const out = [
+    '# Qwen3.5 Benchmark Results',
+    '',
+    'Metrics are the addon `runtimeStats`: TTFT (time to first token, ms), TPS (decode tokens/sec), ppTPS (prefill tokens/sec). `Crashed` means the configuration crashed or produced no output on that device.',
+    ''
+  ]
+  // Per-device tables, rows ordered by config label.
+  for (const device of deviceOrder) {
+    const sorted = [...grouped.get(device)].sort((a, b) => a.config.localeCompare(b.config))
+    out.push(`## ${device}`, '', '| Config | TTFT (ms) | TPS | ppTPS |', '| --- | ---: | ---: | ---: |')
+    for (const r of sorted) {
+      const cells = r.crashed ? ['Crashed', 'Crashed', 'Crashed'] : [fmt(r.ttft), fmt(r.tps), fmt(r.ppTps)]
+      out.push(`| ${r.config} | ${cells.join(' | ')} |`)
+    }
+    out.push('')
+  }
+
+  out.push('## Best configuration per device', '')
+  out.push('| Device | Highest TPS | Highest ppTPS |', '| --- | --- | --- |')
+  for (const device of deviceOrder) {
+    out.push(`| ${device} | ${bestCell(device, 'tps')} | ${bestCell(device, 'ppTps')} |`)
+  }
+  out.push('')
+  return out.join('\n') + '\n'
+}
+
+// CLI entry point: gathers rows from every JSON file under --dir, de-duplicates
+// them and emits the report, or a "no results" notice when nothing was found.
+function main () {
+  const { dir, output, desktopDevice } = parseArgs(process.argv)
+  // Writes to the --output file when one was given, otherwise to stdout.
+  const emit = text => {
+    if (output) fs.writeFileSync(output, text)
+    else process.stdout.write(text)
+  }
+  const rows = dedupe(walkJson(dir).flatMap(file => rowsFromFile(file, desktopDevice)))
+  if (rows.length === 0) {
+    emit('No benchmark results found.\n')
+    return
+  }
+  emit(render(rows, desktopDevice))
+}
+
+main()
diff --git a/packages/llm-llamacpp/benchmarks/performance/reporters.js b/packages/llm-llamacpp/benchmarks/performance/reporters.js
index 9293b29e5c..d21b36d602 100644
--- a/packages/llm-llamacpp/benchmarks/performance/reporters.js
+++ b/packages/llm-llamacpp/benchmarks/performance/reporters.js
@@ -45,12 +45,12 @@ function toMarkdown (report) {
   lines.push('')
   for (const model of report.models) {
     lines.push(`## Model: ${model.modelId}`)
-    lines.push('| Quantization | Device | Ctx Size | Batch Size | Ubatch Size | Flash Attn | Threads | Cache K | Cache V | Prompt Case | Status | Load Mean | Load Std | Run Mean | Run Std | TTFT Mean | TTFT Std | TPS Mean | TPS Std | Unload Mean | Unload Std | Prompt Tokens | Generated Tokens | Quality Match | Error |')
-    lines.push('|---|---|---:|---:|---:|---|---:|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|')
+    lines.push('| Quantization | Reasoning Budget | Device | Ctx Size | Batch Size | Ubatch Size | Flash Attn | Threads | Cache K | Cache V | Prompt Case | Status | TTFT Mean | TTFT Std | TPS Mean | TPS Std | ppTPS Mean | ppTPS Std | Load Mean | Load Std | Run Mean | Run Std | Unload Mean | Unload Std | Prompt Tokens | Generated Tokens | Quality Match | Error |')
+    lines.push('|---|---|---|---:|---:|---:|---|---:|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|')
     for (const item of model.cases) {
       const runtimeConfig = item.runtimeConfig || {}
-      const quality = item.qualityMatch != null ? item.qualityMatch.toFixed(3) : ''
       const quantizationCell = item.isBaseline ? 'default' : (item.quantization ?? '')
+      const rbCell = item.isBaseline ? 'default' : (runtimeConfig['reasoning-budget'] != null ? String(runtimeConfig['reasoning-budget']) : '')
       const deviceCell = item.isBaseline ? 'default' : (runtimeConfig.device != null ? String(runtimeConfig.device) : '')
       const ctxSizeCell = item.isBaseline ? 'default' : (runtimeConfig['ctx-size'] != null ? String(runtimeConfig['ctx-size']) : '')
       const batchSizeCell = item.isBaseline ? 'default' : (runtimeConfig['batch-size'] != null ? String(runtimeConfig['batch-size']) : '')
@@ -65,14 +65,15 @@ function toMarkdown (report) {
         ? truncateText(item.error.message, 120)
         : ''
       lines.push(
-        `| ${quantizationCell} | ${deviceCell} | ${ctxSizeCell} | ${batchSizeCell} | ${ubatchSizeCell} | ${flashAttnCell} | ${threadsCell} | ${cacheKCell} | ${cacheVCell} | ${item.promptCase ?? ''} | ${item.status ?? ''}` +
-        ` | ${item.metrics?.loadMsMean ?? ''} | ${item.metrics?.loadMsStd ?? ''}` +
-        ` | ${item.metrics?.runMsMean ?? ''} | ${item.metrics?.runMsStd ?? ''}` +
+        `| ${quantizationCell} | ${rbCell} | ${deviceCell} | ${ctxSizeCell} | ${batchSizeCell} | ${ubatchSizeCell} | ${flashAttnCell} | ${threadsCell} | ${cacheKCell} | ${cacheVCell} | ${item.promptCase ?? ''} | ${item.status ?? ''}` +
         ` | ${item.metrics?.ttftMsMean ?? ''} | ${item.metrics?.ttftMsStd ?? ''}` +
         ` | ${item.metrics?.tpsMean ?? ''} | ${item.metrics?.tpsStd ?? ''}` +
+        ` | ${item.metrics?.ppTpsMean ?? ''} | ${item.metrics?.ppTpsStd ?? ''}` +
+        ` | ${item.metrics?.loadMsMean ?? ''} | ${item.metrics?.loadMsStd ?? ''}` +
+        ` | ${item.metrics?.runMsMean ?? ''} | ${item.metrics?.runMsStd ?? ''}` +
         ` | ${item.metrics?.unloadMsMean ?? ''} | ${item.metrics?.unloadMsStd ?? ''}` +
         ` | ${item.metrics?.promptTokens ?? ''} | ${item.metrics?.generatedTokens ?? ''}` +
-        ` | ${quality} | ${errorCell} |`
+        ` | ${item.qualityMatch != null ? item.qualityMatch.toFixed(3) : ''} | ${errorCell} |`
       )
     }
     lines.push('')
diff --git a/packages/llm-llamacpp/benchmarks/performance/test-prompts.json b/packages/llm-llamacpp/benchmarks/performance/test-prompts.json
index d8bce45bbb..abeffddb47 100644
--- a/packages/llm-llamacpp/benchmarks/performance/test-prompts.json
+++ b/packages/llm-llamacpp/benchmarks/performance/test-prompts.json
@@ -8,7 +8,7 @@
       },
       {
         "role": "user",
-        "content": "You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. 
Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. You are reviewing an incident report. 
Write a detailed narrative with sections for timeline, root cause, impact, mitigations, and follow-up actions. Target a long answer close to 1000 tokens, include concrete checkpoints, and avoid bullet points unless needed for clarity. "
+        "content": "Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. 
Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience."
       }
     ]
   },
diff --git a/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js b/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js
index 09fefedb00..5e87bec4bd 100644
--- a/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js
+++ b/packages/llm-llamacpp/scripts/generate-mobile-integration-tests.js
@@ -66,9 +66,15 @@ function validateGroups (functionNames) {
   }
   const groups = JSON.parse(fs.readFileSync(groupsFile, 'utf-8'))
   const nameSet = new Set(functionNames)
+  // Benchmark shards (benchmark-perf-*.test.js -> runBenchmarkPerf*) are
+  // scheduled only by the Benchmark Performance workflow via an explicit
+  // test_groups override, and are deliberately absent from test-groups.json
+  // so normal mobile integration runs never trigger the heavy benchmark.
+  // Exclude them from the group-coverage requirement.
+  const isOverrideOnly = (n) => n.startsWith('runBenchmarkPerf')
   for (const [platform, splits] of Object.entries(groups)) {
     const covered = new Set(Object.values(splits).flat())
-    const missing = functionNames.filter(n => !covered.has(n))
+    const missing = functionNames.filter(n => !covered.has(n) && !isOverrideOnly(n))
     const extra = [...covered].filter(n => !nameSet.has(n))
     if (missing.length) {
       throw new Error(
diff --git a/packages/llm-llamacpp/test/integration/_benchmark-perf.js b/packages/llm-llamacpp/test/integration/_benchmark-perf.js
new file mode 100644
index 0000000000..6157532662
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/_benchmark-perf.js
@@ -0,0 +1,158 @@
+'use strict'
+
+// Shared runner for the mobile perf benchmark. Sharded into one test file per
+// (model x KV-cache type) (benchmark-perf-<size>-<quant>-<cachetype>.test.js)
+// so each Device Farm session finishes inside the fixed 20-minute iOS per-test
+// ceiling; this module holds the logic they all share. Underscore prefix keeps
+// it out of the mobile test generator (it is not a *.test.js file).
+//
+// Each shard sweeps its model across both devices (gpu, cpu) and both
+// reasoning-budget values (-1, 0), recording TTFT / TPS / ppTPS. The full
+// matrix (2 sizes x 5 quants x 3 KV-cache types x 2 devices x 2 budgets) is
+// split across the shard files; nothing here reduces it.
+
+const path = require('bare-path')
+const LlmLlamacpp = require('../../index.js')
+const { ensureModel, safeTest } = require('./utils')
+const { attachSpecLogger } = require('./spec-logger')
+const { recordPerformance, isMobile } = require('./_perf-helper.js')
+const os = require('bare-os')
+
+const DEVICES = ['gpu', 'cpu'] // benchmarkModel loads the model once per entry
+const REASONING_BUDGETS = ['-1', '0'] // kept as strings for labels; runInference parses them with parseInt
+
+const RUNTIME = { // spread into the addon config on every load, next to device and cache types
+  gpu_layers: '999',
+  ctx_size: '2048',
+  n_predict: '512',
+  temp: '0.1',
+  seed: '42',
+  verbosity: '0'
+}
+
+// ~512-token prompt (verified against the Qwen3.5 tokenizer at 518 templated tokens).
+const PROMPT = [
+  { role: 'system', content: 'You are a helpful assistant.' },
+  {
+    role: 'user',
+    content: 'Summarize the following passage and explain its key technical implications for on-device inference.\n\nModern large language models have transformed natural language processing. Unlike earlier systems that relied on handcrafted features and task-specific architectures, transformer-based models learn general-purpose representations that transfer across many tasks. This shift enabled strong performance in text generation, translation, question answering, and code synthesis, frequently matching expert humans on established benchmarks.\n\nThe scaling laws governing these models describe a consistent relationship between compute, training data, and model capacity. As researchers grow model size and dataset volume, capabilities tend to improve smoothly and predictably, with occasional emergent abilities appearing at particular scale thresholds. This predictability has guided the design of increasingly capable systems, while raising real questions about energy use and cost.\n\nInference efficiency is now a central challenge. Quantization reduces the memory footprint and increases throughput by storing weights at lower numerical precision, allowing deployment on edge devices that would otherwise lack the necessary memory bandwidth. Speculative decoding and continuous batching push throughput further by using available compute more fully during autoregressive generation. Together these techniques make it practical to run capable models locally on consumer hardware, cutting latency and preserving privacy because data never leaves the device.\n\nReasoning quality continues to improve through chain-of-thought prompting and reinforcement learning from human feedback. Models with an explicit reasoning budget can spend more computation on hard problems while staying efficient on simple queries by disabling the reasoning trace entirely. 
Balancing this budget against latency and battery on mobile hardware is an open and practical engineering problem that the field is only beginning to address in production systems.\n\nOn mobile devices the constraints are sharper than on servers. Memory is limited, thermal headroom is small, and sustained throughput drops as the device heats up under a long generation. Prefill throughput, measured as prompt tokens processed per second, often behaves very differently from decode throughput, because prefill is compute bound across the whole prompt while decode is memory bound on a single token at a time. Quantization format interacts with both phases in ways that are hard to predict from first principles, which is exactly why empirical benchmarks across formats and devices matter. A format that is fast to decode on a desktop GPU may be slower on a phone because of how its blocks map onto the available kernels and cache hierarchy. Measuring time to first token, decode tokens per second, and prefill tokens per second across each quantization and reasoning setting gives the clearest practical picture of what users will actually experience.'
+  }
+]
+
+// Integer env var `key` (bare-os getEnv first, then process.env); `fallback` when unset, non-numeric or below `min`.
+function _envInt (key, fallback, min = 1) {
+  let raw = typeof os.getEnv === 'function' ? (os.getEnv(key) || '') : ''
+  if (!raw && typeof process !== 'undefined' && process.env) raw = process.env[key] || ''
+  const v = parseInt(raw, 10)
+  return Number.isFinite(v) && v >= min ? v : fallback
+}
+const PERF_RUNS = _envInt('QVAC_PERF_RUNS', 1)
+const PERF_WARMUP_RUNS = _envInt('QVAC_PERF_WARMUP_RUNS', 1, 0) // min 0: QVAC_PERF_WARMUP_RUNS=0 skips warmup
+
+// Describes the unsloth Qwen3.5 GGUF for a size (e.g. '0.8B') and quant (e.g. 'Q4_0'): id, file name, download URL.
+function modelSpec (size, quant) {
+  const id = `qwen3.5-${size.toLowerCase()}-${quant}`
+  const name = `Qwen3.5-${size}-${quant}.gguf`
+  const url = `https://huggingface.co/unsloth/Qwen3.5-${size}-GGUF/resolve/main/${name}`
+  return { id, name, url }
+}
+
+// Runs one generation with the given reasoning budget (parsed as a base-10 int) and
+// returns { output, startTime, endTime, stats }; rejects if the response reports an error.
+async function runInference (addon, prompt, reasoningBudget) {
+  const startTime = Date.now()
+  const generationParams = { reasoning_budget: parseInt(reasoningBudget, 10) }
+  const response = await addon.run(prompt, { generationParams })
+  const pieces = []
+  let failure = null
+  response.onUpdate(piece => { pieces.push(piece) }).onError(err => { failure = err })
+  await response.await()
+  if (failure) throw new Error('inference failed: ' + failure)
+  const output = pieces.join('').trim()
+  return { output, startTime, endTime: Date.now(), stats: response.stats || null }
+}
+
+// Emits a metric-less row (total time 0, null stats) for one combo. The renderer
+// shows rows lacking TTFT/TPS/ppTPS as `Crashed`, and the mobile reporter flushes
+// each record to the console immediately, so calling this BEFORE loading/running a
+// combo leaves a `Crashed` row in the logs even if a hard native crash kills the
+// Device Farm session. A later successful run records real metrics that supersede it.
+function recordCrashedPlaceholder (label, device, model) {
+  const extra = { stats: null, deviceId: device, scenario: 'benchmark-perf', model }
+  recordPerformance(label, 0, extra)
+}
+
+// Registers the benchmark test for one (model x quant x kv-cache type),
+// sweeping device x reasoning-budget. One Device Farm session per call.
+// kv-cache type is set as cache-type-k/v at load time; Adreno devices don't
+// support quantized KV cache, so those combos may crash — reported as Crashed.
+function benchmarkModel (size, quant, cacheType) {
+  const spec = modelSpec(size, quant)
+  const id = `${spec.id}-${cacheType}` // shard id: model id plus KV-cache type
+  safeTest(`Mobile perf benchmark: ${id} (TTFT / TPS / ppTPS)`, {
+    timeout: 1_800_000, // 30 minutes, in ms
+    skip: !isMobile // skipped whenever isMobile is false
+  }, async t => {
+    const specLogger = attachSpecLogger({ forwardToConsole: true })
+    try {
+      const [modelName, dirPath] = await ensureModel({ modelName: spec.name, downloadUrl: spec.url })
+      const modelPath = path.join(dirPath, modelName)
+
+      for (const device of DEVICES) { // one model load per device; both budgets reuse it
+        const labelFor = rb => `[${spec.id}] [${device}] [rb=${rb}] [kv=${cacheType}]`
+        const modelFor = rb => `${id}-${device}-rb${rb}`
+        // Up-front Crashed placeholders for every combo on this device.
+        for (const rb of REASONING_BUDGETS) recordCrashedPlaceholder(labelFor(rb), device, modelFor(rb))
+
+        let addon = null
+        try {
+          addon = new LlmLlamacpp({
+            files: { model: [modelPath] },
+            config: { ...RUNTIME, device, 'cache-type-k': cacheType, 'cache-type-v': cacheType },
+            logger: { error: () => {}, warn: () => {}, info: () => {}, debug: () => {} }, // no-op logger
+            opts: { stats: true }
+          })
+          await addon.load()
+        } catch (loadErr) {
+          // Load failed (e.g. unsupported quantized KV cache) — placeholders
+          // remain Crashed for this device's combos. Move on.
+          t.comment(`[${id}] [${device}] load failed (reported as Crashed): ${loadErr && loadErr.message ? loadErr.message : loadErr}`)
+          await (addon && addon.unload && addon.unload().catch(() => {})) // best-effort cleanup; unload errors are ignored
+          continue
+        }
+
+        try {
+          for (const rb of REASONING_BUDGETS) {
+            const label = labelFor(rb)
+            try {
+              for (let w = 1; w <= PERF_WARMUP_RUNS; w++) { // warmup runs are timed for the log only
+                const { endTime, startTime } = await runInference(addon, PROMPT, rb)
+                t.comment(`${label} warmup ${w}/${PERF_WARMUP_RUNS} (${endTime - startTime}ms) - perf NOT recorded`)
+              }
+              for (let run = 1; run <= PERF_RUNS; run++) { // measured runs: each one is recorded
+                const { output, startTime, endTime, stats } = await runInference(addon, PROMPT, rb)
+                // Real metrics supersede the Crashed placeholder in the renderer.
+                t.comment(recordPerformance(label, endTime - startTime, {
+                  stats,
+                  deviceId: device,
+                  scenario: 'benchmark-perf',
+                  model: modelFor(rb)
+                }))
+                t.ok(output.length > 0, `${label} run ${run}/${PERF_RUNS} produced output`)
+              }
+            } catch (runErr) {
+              // Catchable run failure — placeholder stays Crashed for this combo.
+              t.comment(`${label} run failed (reported as Crashed): ${runErr && runErr.message ? runErr.message : runErr}`)
+            }
+          }
+        } finally {
+          await addon.unload().catch(() => {}) // always unload before the next device; errors ignored
+        }
+      }
+    } finally {
+      specLogger.release() // released even when ensureModel or the sweep throws
+    }
+  })
+}
+
+module.exports = { benchmarkModel, modelSpec }
diff --git a/packages/llm-llamacpp/test/integration/_perf-helper.js b/packages/llm-llamacpp/test/integration/_perf-helper.js
index 86d2bc9681..ffa71ab0b8 100644
--- a/packages/llm-llamacpp/test/integration/_perf-helper.js
+++ b/packages/llm-llamacpp/test/integration/_perf-helper.js
@@ -277,6 +277,7 @@ function recordPerformance (label, totalTime, extra) {
 
   const ttftMs = stats ? _num(stats.TTFT) : null
   const tps = stats ? _num(stats.TPS) : null
+  const ppTps = stats ? _num(stats.ppTPS) : null
   const generatedTokens = stats ? _num(stats.generatedTokens) : null
   const promptTokens = stats ? _num(stats.promptTokens) : null
 
@@ -309,7 +310,8 @@ function recordPerformance (label, totalTime, extra) {
     ttft_ms: ttftMs !== null ? Math.round(ttftMs) : null,
     generated_tokens: generatedTokens,
     prompt_tokens: promptTokens,
-    tps: tps !== null ? Number(tps.toFixed(2)) : null
+    tps: tps !== null ? Number(tps.toFixed(2)) : null,
+    pp_tps: ppTps !== null ? Number(ppTps.toFixed(2)) : null
   }, {
     scenario: (extra && extra.scenario) || 'default',
     model: (extra && extra.model) || null,
@@ -342,6 +344,7 @@ function recordPerformance (label, totalTime, extra) {
     `    - Prefill / TTFT: ${ttftMs !== null ? Math.round(ttftMs) + 'ms' : 'n/a'}`,
     `    - Decode: ${decodeMs !== null ? decodeMs + 'ms' : 'n/a'}`,
     `    - TPS: ${tps !== null ? tps.toFixed(2) : 'n/a'}`,
+    `    - ppTPS: ${ppTps !== null ? ppTps.toFixed(2) : 'n/a'}`,
     `    - Tokens: ${generatedTokens !== null ? generatedTokens : 'n/a'} gen / ${promptTokens !== null ? promptTokens : 'n/a'} prompt`
   ]
   return lines.join('\n')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js
new file mode 100644
index 0000000000..b2011f5e3f
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_0', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js
new file mode 100644
index 0000000000..62cbd0db72
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_0', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js
new file mode 100644
index 0000000000..12d3897dba
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-0-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_0', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js
new file mode 100644
index 0000000000..ec653390c3
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_1', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js
new file mode 100644
index 0000000000..16e6559a6e
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_1', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js
new file mode 100644
index 0000000000..e8ea6d15f4
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-1-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_1', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js
new file mode 100644
index 0000000000..2fe48245cf
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_K_M', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js
new file mode 100644
index 0000000000..829734d9b1
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_K_M', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js
new file mode 100644
index 0000000000..b8343a978b
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q4-k-m-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q4_K_M', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js
new file mode 100644
index 0000000000..b47efb3d9e
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q6_K', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js
new file mode 100644
index 0000000000..7debe08857
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q6_K', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js
new file mode 100644
index 0000000000..087dace127
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q6-k-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q6_K', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js
new file mode 100644
index 0000000000..df404731e8
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q8_0', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js
new file mode 100644
index 0000000000..c06838216d
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q8_0', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js
new file mode 100644
index 0000000000..2cf00cb768
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-08b-q8-0-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('0.8B', 'Q8_0', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js
new file mode 100644
index 0000000000..0d520054b5
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_0', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js
new file mode 100644
index 0000000000..c8bcd0cc2a
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_0', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js
new file mode 100644
index 0000000000..d059dc5acd
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-0-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_0', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js
new file mode 100644
index 0000000000..4c9b824d03
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_1', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js
new file mode 100644
index 0000000000..fb6ecd7440
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_1', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js
new file mode 100644
index 0000000000..982192cba9
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-1-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_1', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js
new file mode 100644
index 0000000000..897b184957
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_K_M', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js
new file mode 100644
index 0000000000..58f0a64f76
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_K_M', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js
new file mode 100644
index 0000000000..5fab3f553c
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q4-k-m-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q4_K_M', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js
new file mode 100644
index 0000000000..9f32e33a64
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q6_K', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js
new file mode 100644
index 0000000000..f0497f88ba
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q6_K', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js
new file mode 100644
index 0000000000..b86363d867
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q6-k-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q6_K', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js
new file mode 100644
index 0000000000..8f47551a85
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q8_0', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js
new file mode 100644
index 0000000000..37a82b6ddb
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q8_0', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js
new file mode 100644
index 0000000000..827ff27d24
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-2b-q8-0-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel } = require('./_benchmark-perf.js')
+benchmarkModel('2B', 'Q8_0', 'q8_0')
diff --git a/packages/llm-llamacpp/test/mobile/integration.auto.cjs b/packages/llm-llamacpp/test/mobile/integration.auto.cjs
index 6f6d1fd7ec..d5b6439be9 100644
--- a/packages/llm-llamacpp/test/mobile/integration.auto.cjs
+++ b/packages/llm-llamacpp/test/mobile/integration.auto.cjs
@@ -16,6 +16,156 @@ async function runApiBehaviorTest (options = {}) { // eslint-disable-line no-unu
   return runIntegrationModule('../integration/api-behavior.test.js', options)
 }
 
+async function runBenchmarkPerf08bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-f16.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ40Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-0-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ41F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-f16.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ41Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ41Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ41Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-1-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMF16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-f16.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMQ40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ4KMQ80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q4-k-m-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ6KF16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KF16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-f16.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ6KQ40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KQ40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ6KQ80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ6KQ80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q6-k-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-f16.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf08bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf08bQ80Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-f16.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ41F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-f16.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ41Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ41Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ41Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-1-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMF16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-f16.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMQ40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ4KMQ80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q4-k-m-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ6KF16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KF16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-f16.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ6KQ40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KQ40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ6KQ80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ6KQ80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q6-k-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-f16.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf2bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ80Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-2b-q8-0-q8-0.test.js', options)
+}
+
 async function runBitnetTest (options = {}) { // eslint-disable-line no-unused-vars
   if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBitnetTest')) return __FILTERED
   return runIntegrationModule('../integration/bitnet.test.js', options)

From c76cb8babae142541958926c2cc294f65180b312 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Wed, 3 Jun 2026 19:05:07 +0100
Subject: [PATCH 02/10] fix: split mobile benchmark into 3 sequential KV-cache
 batches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

30 mobile shards in one reused-workflow call exceed that workflow's 120-min
job timeout (Android was already at 119 min with 10 shards). Split the groups
into three batches by KV-cache type (10 each — the proven in-budget load) via
a max-parallel:1 matrix so each batch runs in isolation with no Device Farm
pool contention. Add an optional artifact_suffix input to the mobile workflow
(default empty, so other addons' artifact names are unchanged) to keep the
three batches' perf-report artifacts from colliding; summarize aggregates all
three into the unified report.
---
 .../workflows/benchmark-perf-llm-llamacpp.yml | 29 +++++++++++++++----
 .../integration-mobile-test-llm-llamacpp.yml  |  7 ++++-
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index 4bfabec2c0..e36e0e919a 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -202,6 +202,16 @@ jobs:
           retention-days: 90
           if-no-files-found: ignore
 
+  # Mobile is sharded one group per (model x KV-cache type) = 30 groups so each
+  # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling.
+  # 30 groups in one reused-workflow call exceed that workflow's 120-minute job
+  # timeout, so we split them into three batches by KV-cache type (10 groups
+  # each — the proven, in-budget load) and run the batches sequentially
+  # (max-parallel: 1) to avoid Device Farm pool starvation. Each batch gets a
+  # distinct artifact_suffix so its perf-report artifact doesn't collide with
+  # the others; summarize aggregates all three. These wrappers are deliberately
+  # absent from the addon's test-groups.json, so this override is the only path
+  # that runs them — normal mobile integration runs never trigger the benchmark.
   mobile-benchmark:
     needs:
       - context
@@ -213,17 +223,24 @@ jobs:
       packages: read
       pull-requests: write
       id-token: write
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - cache: f16
+            groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
+          - cache: q8_0
+            groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+          - cache: q4_0
+            groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
     uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
     secrets: inherit
     with:
       repository: ${{ github.repository }}
       ref: ${{ needs.context.outputs.ref }}
-      # One group per (model x KV-cache type) so each Device Farm session
-      # finishes inside the fixed 20-minute iOS per-test ceiling. These wrappers are deliberately
-      # absent from the addon's test-groups.json, so this override is the only
-      # path that runs them — normal mobile integration runs never trigger the
-      # benchmark.
-      test_groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkP
erf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+      test_groups: ${{ matrix.groups }}
+      artifact_suffix: ${{ matrix.cache }}-
 
   # Aggregates the mobile per-model perf-report artifacts (one per shard,
   # both platforms) into a single consolidated table rendered into the run
diff --git a/.github/workflows/integration-mobile-test-llm-llamacpp.yml b/.github/workflows/integration-mobile-test-llm-llamacpp.yml
index a7df080202..9a02b84d69 100644
--- a/.github/workflows/integration-mobile-test-llm-llamacpp.yml
+++ b/.github/workflows/integration-mobile-test-llm-llamacpp.yml
@@ -41,6 +41,11 @@ on:
         type: string
         required: false
         default: ""
+      artifact_suffix:
+        description: "Optional prefix inserted into the perf-report artifact name so multiple invocations in one run don't collide. Default empty keeps the existing name."
+        type: string
+        required: false
+        default: ""
   workflow_dispatch:
     inputs:
       ref:
@@ -217,7 +222,7 @@ jobs:
           platform: ${{ matrix.platform }}
           merge: 'true'
           unzip-customer-artifacts: 'true'
-          artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
+          artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}
 
       - name: Comment results on PR
         if: always() && !cancelled()

From 8d101dc332a280eb292caa1a32565631f36112a7 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Wed, 3 Jun 2026 19:17:21 +0100
Subject: [PATCH 03/10] fix: run full mobile benchmark in one job via raised
 timeout

Replace the 3-batch KV-cache split with a single 30-group mobile call and add
an optional job_timeout_minutes input to the mobile workflow (default 120, so
every other caller is unchanged). The benchmark passes 240; the observed mobile
wall-clock time for the full matrix is ~140-160 min, comfortably inside it.
This needs one app build instead of three and a single run of ~2.5-3h.
---
 .../workflows/benchmark-perf-llm-llamacpp.yml | 24 ++++---------------
 .../integration-mobile-test-llm-llamacpp.yml  | 12 +++++-----
 2 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index e36e0e919a..917da94c3f 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -204,12 +204,9 @@ jobs:
 
   # Mobile is sharded one group per (model x KV-cache type) = 30 groups so each
   # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling.
-  # 30 groups in one reused-workflow call exceed that workflow's 120-minute job
-  # timeout, so we split them into three batches by KV-cache type (10 groups
-  # each — the proven, in-budget load) and run the batches sequentially
-  # (max-parallel: 1) to avoid Device Farm pool starvation. Each batch gets a
-  # distinct artifact_suffix so its perf-report artifact doesn't collide with
-  # the others; summarize aggregates all three. These wrappers are deliberately
+  # 30 groups exceed the mobile workflow's default 120-minute job timeout, so we
+  # raise it to 240 via job_timeout_minutes (observed mobile wall ~140-160 min;
+  # default stays 120 for every other caller). These wrappers are deliberately
   # absent from the addon's test-groups.json, so this override is the only path
   # that runs them — normal mobile integration runs never trigger the benchmark.
   mobile-benchmark:
@@ -223,24 +220,13 @@ jobs:
       packages: read
       pull-requests: write
       id-token: write
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        include:
-          - cache: f16
-            groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
-          - cache: q8_0
-            groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
-          - cache: q4_0
-            groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
     uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
     secrets: inherit
     with:
       repository: ${{ github.repository }}
       ref: ${{ needs.context.outputs.ref }}
-      test_groups: ${{ matrix.groups }}
-      artifact_suffix: ${{ matrix.cache }}-
+      job_timeout_minutes: 240
+      test_groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
 
   # Aggregates the mobile per-model perf-report artifacts (one per shard,
   # both platforms) into a single consolidated table rendered into the run
diff --git a/.github/workflows/integration-mobile-test-llm-llamacpp.yml b/.github/workflows/integration-mobile-test-llm-llamacpp.yml
index 9a02b84d69..974f86240e 100644
--- a/.github/workflows/integration-mobile-test-llm-llamacpp.yml
+++ b/.github/workflows/integration-mobile-test-llm-llamacpp.yml
@@ -41,11 +41,11 @@ on:
         type: string
         required: false
         default: ""
-      artifact_suffix:
-        description: "Optional prefix inserted into the perf-report artifact name so multiple invocations in one run don't collide. Default empty keeps the existing name."
-        type: string
+      job_timeout_minutes:
+        description: "Override the build-and-test job timeout (minutes). Default 120. Raised by Benchmark Performance (LLM) where the full sharded matrix needs longer."
+        type: number
         required: false
-        default: ""
+        default: 120
   workflow_dispatch:
     inputs:
       ref:
@@ -79,7 +79,7 @@ jobs:
     name: Build ${{ matrix.platform }} and Run E2E Tests
     runs-on: ${{ matrix.runner }}
     environment: release
-    timeout-minutes: 120
+    timeout-minutes: ${{ inputs.job_timeout_minutes || 120 }}
     continue-on-error: true
     permissions:
       contents: read
@@ -222,7 +222,7 @@ jobs:
           platform: ${{ matrix.platform }}
           merge: 'true'
           unzip-customer-artifacts: 'true'
-          artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}
+          artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
 
       - name: Comment results on PR
         if: always() && !cancelled()

From bfb3f278422f250c697d051f209090937902c35d Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Wed, 3 Jun 2026 23:39:50 +0100
Subject: [PATCH 04/10] fix: run mobile benchmark as 3 sequential KV-cache
 batches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The single 30-shard mobile job fails on both platforms: Android serializes 30
runs against its Device Farm pool (>240 min) and the macOS runner fills its
disk collecting 30 runs' logs (no space left on device). Split into three
KV-cache batches (10 groups each — the proven ~119 min load) run sequentially
(max-parallel: 1) to avoid pool contention. Each batch passes job_timeout_minutes
180 for headroom and a distinct artifact_suffix so the three perf-reports don't
collide; summarize aggregates all three. Both new mobile-workflow inputs are
optional and default to current behaviour, so other callers are unchanged.
---
 .../workflows/benchmark-perf-llm-llamacpp.yml | 31 ++++++++++++++-----
 .../integration-mobile-test-llm-llamacpp.yml  |  9 ++++--
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index 917da94c3f..8ce2fa8e53 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -204,11 +204,16 @@ jobs:
 
   # Mobile is sharded one group per (model x KV-cache type) = 30 groups so each
   # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling.
-  # 30 groups exceed the mobile workflow's default 120-minute job timeout, so we
-  # raise it to 240 via job_timeout_minutes (observed mobile wall ~140-160 min;
-  # default stays 120 for every other caller). These wrappers are deliberately
-  # absent from the addon's test-groups.json, so this override is the only path
-  # that runs them — normal mobile integration runs never trigger the benchmark.
+  # All 30 in one reused-workflow call do NOT fit: Android serializes the runs
+  # against its device pool (>240 min) and the macOS runner fills its disk
+  # collecting 30 runs' logs. So we split the groups into three batches by
+  # KV-cache type (10 each — the proven in-budget load) and run them
+  # sequentially (max-parallel: 1) to avoid Device Farm pool contention. Each
+  # batch raises the job timeout to 180 for headroom (proven 10-shard wall ~119
+  # min) and gets a distinct artifact_suffix so its perf-report doesn't collide;
+  # summarize aggregates all three. These wrappers are deliberately absent from
+  # the addon's test-groups.json, so this override is the only path that runs
+  # them — normal mobile integration runs never trigger the benchmark.
   mobile-benchmark:
     needs:
       - context
@@ -220,13 +225,25 @@ jobs:
       packages: read
       pull-requests: write
       id-token: write
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - cache: f16
+            groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
+          - cache: q8_0
+            groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+          - cache: q4_0
+            groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
     uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
     secrets: inherit
     with:
       repository: ${{ github.repository }}
       ref: ${{ needs.context.outputs.ref }}
-      job_timeout_minutes: 240
-      test_groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+      job_timeout_minutes: 180
+      test_groups: ${{ matrix.groups }}
+      artifact_suffix: ${{ matrix.cache }}-
 
   # Aggregates the mobile per-model perf-report artifacts (one per shard,
   # both platforms) into a single consolidated table rendered into the run
diff --git a/.github/workflows/integration-mobile-test-llm-llamacpp.yml b/.github/workflows/integration-mobile-test-llm-llamacpp.yml
index 974f86240e..3e53e42c8b 100644
--- a/.github/workflows/integration-mobile-test-llm-llamacpp.yml
+++ b/.github/workflows/integration-mobile-test-llm-llamacpp.yml
@@ -42,10 +42,15 @@ on:
         required: false
         default: ""
       job_timeout_minutes:
-        description: "Override the build-and-test job timeout (minutes). Default 120. Raised by Benchmark Performance (LLM) where the full sharded matrix needs longer."
+        description: "Override the build-and-test job timeout (minutes). Default 120. Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom."
         type: number
         required: false
         default: 120
+      artifact_suffix:
+        description: "Optional prefix inserted into the perf-report artifact name so multiple invocations in one run (e.g. Benchmark batches) don't collide. Default empty keeps the existing name."
+        type: string
+        required: false
+        default: ""
   workflow_dispatch:
     inputs:
       ref:
@@ -222,7 +227,7 @@ jobs:
           platform: ${{ matrix.platform }}
           merge: 'true'
           unzip-customer-artifacts: 'true'
-          artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
+          artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}
 
       - name: Comment results on PR
         if: always() && !cancelled()

From 2d4efd3a290fd23f8f17a93e937254aedaed20d6 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Thu, 4 Jun 2026 06:03:00 +0100
Subject: [PATCH 05/10] fix(render-report): deduplicate symmetric kv labels;
 add lowest TTFT to best-config

When cache-type-k == cache-type-v the config label now renders [kv=f16] not
[kv=f16/f16], making desktop consistent with mobile. Also add a Lowest TTFT
column to the best-config-per-device summary.
---
 .../benchmarks/performance/render-report.js            | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/packages/llm-llamacpp/benchmarks/performance/render-report.js b/packages/llm-llamacpp/benchmarks/performance/render-report.js
index 0af2ea9495..2c0ba34733 100644
--- a/packages/llm-llamacpp/benchmarks/performance/render-report.js
+++ b/packages/llm-llamacpp/benchmarks/performance/render-report.js
@@ -107,7 +107,7 @@ function configLabel ({ model, backend, rb, ck, cv }) {
   const parts = [`[${model}]`]
   if (backend) parts.push(`[${backend}]`)
   if (rb !== undefined && rb !== null && rb !== '') parts.push(`[rb=${rb}]`)
-  if (ck || cv) parts.push(`[kv=${ck || '?'}/${cv || '?'}]`)
+  if (ck || cv) parts.push(ck === cv ? `[kv=${ck}]` : `[kv=${ck || '?'}/${cv || '?'}]`)
   return parts.join(' ')
 }
 
@@ -160,15 +160,17 @@ function render (rows, desktopDevice) {
 
   lines.push('## Best configuration per device')
   lines.push('')
-  lines.push('| Device | Highest TPS | Highest ppTPS |')
-  lines.push('| --- | --- | --- |')
+  lines.push('| Device | Lowest TTFT (ms) | Highest TPS | Highest ppTPS |')
+  lines.push('| --- | --- | --- | --- |')
   for (const device of devices) {
     const ok = byDevice.get(device).filter(r => !r.crashed)
+    const bestTtft = ok.filter(r => r.ttft !== null).sort((a, b) => a.ttft - b.ttft)[0]
     const bestTps = ok.filter(r => r.tps !== null).sort((a, b) => b.tps - a.tps)[0]
     const bestPp = ok.filter(r => r.ppTps !== null).sort((a, b) => b.ppTps - a.ppTps)[0]
+    const ttftCell = bestTtft ? `${bestTtft.config} — ${fmt(bestTtft.ttft)}` : '-'
     const tpsCell = bestTps ? `${bestTps.config} — ${fmt(bestTps.tps)}` : '-'
     const ppCell = bestPp ? `${bestPp.config} — ${fmt(bestPp.ppTps)}` : '-'
-    lines.push(`| ${device} | ${tpsCell} | ${ppCell} |`)
+    lines.push(`| ${device} | ${ttftCell} | ${tpsCell} | ${ppCell} |`)
   }
   lines.push('')
   return lines.join('\n') + '\n'

From 74e4c76e86b8e08ae42844d8b555db08ea6eea61 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Thu, 4 Jun 2026 06:26:10 +0100
Subject: [PATCH 06/10] fix(render-report): revert Lowest TTFT from best-config
 summary

Gianfranco's spec was highest TPS and highest ppTPS only. Lowest TTFT was not
requested and is removed.
---
 .../llm-llamacpp/benchmarks/performance/render-report.js  | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/packages/llm-llamacpp/benchmarks/performance/render-report.js b/packages/llm-llamacpp/benchmarks/performance/render-report.js
index 2c0ba34733..35da100341 100644
--- a/packages/llm-llamacpp/benchmarks/performance/render-report.js
+++ b/packages/llm-llamacpp/benchmarks/performance/render-report.js
@@ -160,17 +160,15 @@ function render (rows, desktopDevice) {
 
   lines.push('## Best configuration per device')
   lines.push('')
-  lines.push('| Device | Lowest TTFT (ms) | Highest TPS | Highest ppTPS |')
-  lines.push('| --- | --- | --- | --- |')
+  lines.push('| Device | Highest TPS | Highest ppTPS |')
+  lines.push('| --- | --- | --- |')
   for (const device of devices) {
     const ok = byDevice.get(device).filter(r => !r.crashed)
-    const bestTtft = ok.filter(r => r.ttft !== null).sort((a, b) => a.ttft - b.ttft)[0]
     const bestTps = ok.filter(r => r.tps !== null).sort((a, b) => b.tps - a.tps)[0]
     const bestPp = ok.filter(r => r.ppTps !== null).sort((a, b) => b.ppTps - a.ppTps)[0]
-    const ttftCell = bestTtft ? `${bestTtft.config} — ${fmt(bestTtft.ttft)}` : '-'
     const tpsCell = bestTps ? `${bestTps.config} — ${fmt(bestTps.tps)}` : '-'
     const ppCell = bestPp ? `${bestPp.config} — ${fmt(bestPp.ppTps)}` : '-'
-    lines.push(`| ${device} | ${ttftCell} | ${tpsCell} | ${ppCell} |`)
+    lines.push(`| ${device} | ${tpsCell} | ${ppCell} |`)
   }
   lines.push('')
   return lines.join('\n') + '\n'

From 71e2676624b9d978810a1c8b6ff7c076e5a75b9c Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Thu, 4 Jun 2026 06:42:45 +0100
Subject: [PATCH 07/10] feat: add Qwen3-1.7B to mobile benchmark (comparison
 baseline)

---
 .../test/integration/_benchmark-perf.js       | 66 ++++++++++++++++++-
 .../benchmark-perf-17b-q4-0-f16.test.js       |  3 +
 .../benchmark-perf-17b-q4-0-q4-0.test.js      |  3 +
 .../benchmark-perf-17b-q4-0-q8-0.test.js      |  3 +
 .../benchmark-perf-17b-q4-k-m-f16.test.js     |  3 +
 .../benchmark-perf-17b-q4-k-m-q4-0.test.js    |  3 +
 .../benchmark-perf-17b-q4-k-m-q8-0.test.js    |  3 +
 .../benchmark-perf-17b-q8-0-f16.test.js       |  3 +
 .../benchmark-perf-17b-q8-0-q4-0.test.js      |  3 +
 .../benchmark-perf-17b-q8-0-q8-0.test.js      |  3 +
 .../test/mobile/integration.auto.cjs          | 45 +++++++++++++
 11 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js
 create mode 100644 packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js

diff --git a/packages/llm-llamacpp/test/integration/_benchmark-perf.js b/packages/llm-llamacpp/test/integration/_benchmark-perf.js
index 6157532662..949e11fd69 100644
--- a/packages/llm-llamacpp/test/integration/_benchmark-perf.js
+++ b/packages/llm-llamacpp/test/integration/_benchmark-perf.js
@@ -57,6 +57,70 @@ function modelSpec (size, quant) {
   }
 }
 
+function modelSpec17b (quant) {
+  return {
+    id: `qwen3-1.7b-${quant}`,
+    name: `Qwen3-1.7B-${quant}.gguf`,
+    url: `https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-${quant}.gguf`
+  }
+}
+
+function benchmarkModel17b (quant, cacheType) {
+  const spec = modelSpec17b(quant)
+  const id = `${spec.id}-${cacheType}`
+  safeTest(`Mobile perf benchmark: ${id} (TTFT / TPS / ppTPS)`, {
+    timeout: 1_800_000,
+    skip: !isMobile
+  }, async t => {
+    const specLogger = attachSpecLogger({ forwardToConsole: true })
+    try {
+      const [modelName, dirPath] = await ensureModel({ modelName: spec.name, downloadUrl: spec.url })
+      const modelPath = path.join(dirPath, modelName)
+      for (const device of DEVICES) {
+        const labelFor = rb => `[${spec.id}] [${device}] [rb=${rb}] [kv=${cacheType}]`
+        const modelFor = rb => `${id}-${device}-rb${rb}`
+        for (const rb of REASONING_BUDGETS) recordCrashedPlaceholder(labelFor(rb), device, modelFor(rb))
+        let addon = null
+        try {
+          addon = new LlmLlamacpp({
+            files: { model: [modelPath] },
+            config: { ...RUNTIME, device, 'cache-type-k': cacheType, 'cache-type-v': cacheType },
+            logger: { error: () => {}, warn: () => {}, info: () => {}, debug: () => {} },
+            opts: { stats: true }
+          })
+          await addon.load()
+        } catch (loadErr) {
+          t.comment(`[${id}] [${device}] load failed (reported as Crashed): ${loadErr && loadErr.message ? loadErr.message : loadErr}`)
+          await (addon && addon.unload && addon.unload().catch(() => {}))
+          continue
+        }
+        try {
+          for (const rb of REASONING_BUDGETS) {
+            const label = labelFor(rb)
+            try {
+              for (let w = 1; w <= PERF_WARMUP_RUNS; w++) {
+                const { endTime, startTime } = await runInference(addon, PROMPT, rb)
+                t.comment(`${label} warmup ${w}/${PERF_WARMUP_RUNS} (${endTime - startTime}ms) - perf NOT recorded`)
+              }
+              for (let run = 1; run <= PERF_RUNS; run++) {
+                const { output, startTime, endTime, stats } = await runInference(addon, PROMPT, rb)
+                t.comment(recordPerformance(label, endTime - startTime, { stats, deviceId: device, scenario: 'benchmark-perf', model: modelFor(rb) }))
+                t.ok(output.length > 0, `${label} run ${run}/${PERF_RUNS} produced output`)
+              }
+            } catch (runErr) {
+              t.comment(`${label} run failed (reported as Crashed): ${runErr && runErr.message ? runErr.message : runErr}`)
+            }
+          }
+        } finally {
+          await addon.unload().catch(() => {})
+        }
+      }
+    } finally {
+      specLogger.release()
+    }
+  })
+}
+
 async function runInference (addon, prompt, reasoningBudget) {
   const startTime = Date.now()
   const response = await addon.run(prompt, {
@@ -155,4 +219,4 @@ function benchmarkModel (size, quant, cacheType) {
   })
 }
 
-module.exports = { benchmarkModel, modelSpec }
+module.exports = { benchmarkModel, modelSpec, benchmarkModel17b, modelSpec17b }
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js
new file mode 100644
index 0000000000..0eaa6ae7e1
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q4_0', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js
new file mode 100644
index 0000000000..ebda1f97e9
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q4_0', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js
new file mode 100644
index 0000000000..7889d715a9
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-0-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q4_0', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js
new file mode 100644
index 0000000000..26e208ada4
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q4_K_M', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js
new file mode 100644
index 0000000000..65eb74c8a2
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q4_K_M', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js
new file mode 100644
index 0000000000..5c958e4768
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q4-k-m-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q4_K_M', 'q8_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js
new file mode 100644
index 0000000000..0c2c448f4f
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-f16.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q8_0', 'f16')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js
new file mode 100644
index 0000000000..947d7b7534
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q4-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q8_0', 'q4_0')
diff --git a/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js
new file mode 100644
index 0000000000..11531dac36
--- /dev/null
+++ b/packages/llm-llamacpp/test/integration/benchmark-perf-17b-q8-0-q8-0.test.js
@@ -0,0 +1,3 @@
+'use strict'
+const { benchmarkModel17b } = require('./_benchmark-perf.js')
+benchmarkModel17b('Q8_0', 'q8_0')
diff --git a/packages/llm-llamacpp/test/mobile/integration.auto.cjs b/packages/llm-llamacpp/test/mobile/integration.auto.cjs
index d5b6439be9..9eb42d5f8e 100644
--- a/packages/llm-llamacpp/test/mobile/integration.auto.cjs
+++ b/packages/llm-llamacpp/test/mobile/integration.auto.cjs
@@ -91,6 +91,51 @@ async function runBenchmarkPerf08bQ80Q80Test (options = {}) { // eslint-disable-
   return runIntegrationModule('../integration/benchmark-perf-08b-q8-0-q8-0.test.js', options)
 }
 
+async function runBenchmarkPerf17bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ40F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q4-0-f16.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ40Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ40Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q4-0-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ40Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ40Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q4-0-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ4KMF16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ4KMF16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q4-k-m-f16.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ4KMQ40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ4KMQ40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q4-k-m-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ4KMQ80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ4KMQ80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q4-k-m-q8-0.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ80F16Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ80F16Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q8-0-f16.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ80Q40Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ80Q40Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q8-0-q4-0.test.js', options)
+}
+
+async function runBenchmarkPerf17bQ80Q80Test (options = {}) { // eslint-disable-line no-unused-vars
+  if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf17bQ80Q80Test')) return __FILTERED
+  return runIntegrationModule('../integration/benchmark-perf-17b-q8-0-q8-0.test.js', options)
+}
+
 async function runBenchmarkPerf2bQ40F16Test (options = {}) { // eslint-disable-line no-unused-vars
   if (typeof __shouldRunTest === 'function' && !__shouldRunTest('runBenchmarkPerf2bQ40F16Test')) return __FILTERED
   return runIntegrationModule('../integration/benchmark-perf-2b-q4-0-f16.test.js', options)

From f1c4f77d504ad7bd47fa54aeec9388184fd57b94 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Thu, 4 Jun 2026 06:43:29 +0100
Subject: [PATCH 08/10] tmp: 1.7B-only mobile dispatch (3 batches x 3 groups)

---
 .github/workflows/benchmark-perf-llm-llamacpp.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index 8ce2fa8e53..bb1bb15518 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -231,11 +231,11 @@ jobs:
       matrix:
         include:
           - cache: f16
-            groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
+            groups: '[{"name":"BenchmarkPerf17bQ40F16","grep":"runBenchmarkPerf17bQ40F16Test"},{"name":"BenchmarkPerf17bQ4KMF16","grep":"runBenchmarkPerf17bQ4KMF16Test"},{"name":"BenchmarkPerf17bQ80F16","grep":"runBenchmarkPerf17bQ80F16Test"}]'
           - cache: q8_0
-            groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+            groups: '[{"name":"BenchmarkPerf17bQ40Q80","grep":"runBenchmarkPerf17bQ40Q80Test"},{"name":"BenchmarkPerf17bQ4KMQ80","grep":"runBenchmarkPerf17bQ4KMQ80Test"},{"name":"BenchmarkPerf17bQ80Q80","grep":"runBenchmarkPerf17bQ80Q80Test"}]'
           - cache: q4_0
-            groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
+            groups: '[{"name":"BenchmarkPerf17bQ40Q40","grep":"runBenchmarkPerf17bQ40Q40Test"},{"name":"BenchmarkPerf17bQ4KMQ40","grep":"runBenchmarkPerf17bQ4KMQ40Test"},{"name":"BenchmarkPerf17bQ80Q40","grep":"runBenchmarkPerf17bQ80Q40Test"}]'
     uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
     secrets: inherit
     with:

From 511f50aa18654d161ca54d2e9b79a532918a6fc3 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Thu, 4 Jun 2026 06:44:15 +0100
Subject: [PATCH 09/10] chore: restore full Qwen3.5 matrix after 1.7B dispatch

---
 .github/workflows/benchmark-perf-llm-llamacpp.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index bb1bb15518..8ce2fa8e53 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -231,11 +231,11 @@ jobs:
       matrix:
         include:
           - cache: f16
-            groups: '[{"name":"BenchmarkPerf17bQ40F16","grep":"runBenchmarkPerf17bQ40F16Test"},{"name":"BenchmarkPerf17bQ4KMF16","grep":"runBenchmarkPerf17bQ4KMF16Test"},{"name":"BenchmarkPerf17bQ80F16","grep":"runBenchmarkPerf17bQ80F16Test"}]'
+            groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
           - cache: q8_0
-            groups: '[{"name":"BenchmarkPerf17bQ40Q80","grep":"runBenchmarkPerf17bQ40Q80Test"},{"name":"BenchmarkPerf17bQ4KMQ80","grep":"runBenchmarkPerf17bQ4KMQ80Test"},{"name":"BenchmarkPerf17bQ80Q80","grep":"runBenchmarkPerf17bQ80Q80Test"}]'
+            groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
           - cache: q4_0
-            groups: '[{"name":"BenchmarkPerf17bQ40Q40","grep":"runBenchmarkPerf17bQ40Q40Test"},{"name":"BenchmarkPerf17bQ4KMQ40","grep":"runBenchmarkPerf17bQ4KMQ40Test"},{"name":"BenchmarkPerf17bQ80Q40","grep":"runBenchmarkPerf17bQ80Q40Test"}]'
+            groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
     uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
     secrets: inherit
     with:

From 6f5fd88c780b708e9270ca79018c248e73d6b608 Mon Sep 17 00:00:00 2001
From: Ridwan Taiwo <donriddo@gmail.com>
Date: Thu, 4 Jun 2026 11:20:01 +0100
Subject: [PATCH 10/10] =?UTF-8?q?feat[notask]:=20enhance=20benchmark=20rep?=
 =?UTF-8?q?ort=20=E2=80=94=20tokens=20column,=20addon=20version,=20prompt?=
 =?UTF-8?q?=20size,=20run=20count,=20GPU,=20cross-run=20diff?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

render-report.js:
- Add Tokens column (generated tokens per config) to all device tables
- Add report header line: addon version, prompt size (tokens), runs per config
- Extract addon version from mobile JSON's `addon` field; accept --addon-version CLI override
- Extract promptTokens and generatedTokens from desktop sweep and mobile schemas
- Add --compare-dir flag: when provided, renders Δ TTFT / Δ TPS / Δ ppTPS columns
  against a baseline directory (cross-run regression view)

benchmark-perf-llm-llamacpp.yml:
- Add nvidia-smi step to detect GPU name; pass it as --desktop-device to render-report
- Extract addon version from package.json and pass as --addon-version
- Add summarize_only + artifact_run_number inputs: re-render report from a previous
  run's artifacts without re-running the 6-hour benchmark
- Add compare_run_number input: download baseline artifacts from a prior run and
  pass --compare-dir to render-report for the Δ regression view
- Guard desktop/mobile jobs with !inputs.summarize_only
---
 .../workflows/benchmark-perf-llm-llamacpp.yml |  80 ++++++--
 .../benchmarks/performance/render-report.js   | 190 ++++++++++++++----
 2 files changed, 224 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/benchmark-perf-llm-llamacpp.yml b/.github/workflows/benchmark-perf-llm-llamacpp.yml
index 8ce2fa8e53..b357e05fbd 100644
--- a/.github/workflows/benchmark-perf-llm-llamacpp.yml
+++ b/.github/workflows/benchmark-perf-llm-llamacpp.yml
@@ -26,6 +26,19 @@ on:
         required: false
         default: true
         type: boolean
+      summarize_only:
+        description: "Re-render report from a previous run's artifacts (skips benchmarks)"
+        required: false
+        default: false
+        type: boolean
+      artifact_run_number:
+        description: "Run number to pull artifacts from when summarize_only=true (e.g. 9)"
+        required: false
+        type: string
+      compare_run_number:
+        description: "Run number of a baseline run to diff against (shows Δ TPS / Δ TTFT columns)"
+        required: false
+        type: string
 
 permissions:
   contents: read
@@ -91,7 +104,7 @@ jobs:
     needs:
       - context
       - label-gate
-    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop
+    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop && !inputs.summarize_only
     name: Desktop Parameter Sweep
     runs-on: ai-run-linux-gpu
     timeout-minutes: 360
@@ -178,6 +191,13 @@ jobs:
 
           echo "=== Build complete ==="
 
+      - name: Detect GPU
+        id: gpu
+        shell: bash
+        run: |
+          gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | tr -d '\r' || echo "GPU")
+          echo "name=$gpu_name" >> "$GITHUB_OUTPUT"
+
       - name: Install benchmark dependencies
         working-directory: packages/llm-llamacpp/benchmarks/performance
         run: npm install
@@ -201,6 +221,8 @@ jobs:
           path: packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep/
           retention-days: 90
           if-no-files-found: ignore
+        env:
+          DESKTOP_GPU: ${{ steps.gpu.outputs.name }}
 
   # Mobile is sharded one group per (model x KV-cache type) = 30 groups so each
   # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling.
@@ -219,7 +241,7 @@ jobs:
       - context
       - prebuild
       - label-gate
-    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile
+    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile && !inputs.summarize_only
     permissions:
       contents: read
       packages: read
@@ -245,10 +267,10 @@ jobs:
       test_groups: ${{ matrix.groups }}
       artifact_suffix: ${{ matrix.cache }}-
 
-  # Aggregates the mobile per-model perf-report artifacts (one per shard,
-  # both platforms) into a single consolidated table rendered into the run
-  # summary, so the full mobile matrix is visible from the run page without
-  # opening artifacts. Desktop writes its own table inline in its job.
+  # Aggregates desktop + mobile artifacts into one unified markdown report.
+  # Runs after the benchmarks, or standalone when summarize_only=true, in which case
+  # artifact_run_number selects the source run; compare_run_number adds Δ TPS / Δ TTFT columns.
+  # NOTE(review): download-artifact only reads another run's artifacts given run-id + github-token — confirm.
   summarize:
     needs:
       - context
@@ -260,6 +282,9 @@ jobs:
     timeout-minutes: 10
     permissions:
       contents: read
+    env:
+      # Use artifact_run_number when summarize_only, otherwise current run
+      ARTIFACT_RUN: ${{ inputs.artifact_run_number || github.run_number }}
     steps:
       - name: Checkout aggregator
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
@@ -273,33 +298,64 @@ jobs:
         with:
           node-version: lts/*
 
-      # Both desktop (llm-param-sweep-desktop-*) and mobile
-      # (perf-report-llamacpp-llm-*) artifacts feed the same renderer, so the
-      # run summary shows desktop and every device in one identical format.
+      - name: Get addon version
+        id: addon_ver
+        shell: bash
+        run: |
+          ver=$(node -e "process.stdout.write(require('./packages/llm-llamacpp/package.json').version)" 2>/dev/null || true)
+          echo "version=${ver:+@qvac/llm-llamacpp@$ver}" >> "$GITHUB_OUTPUT"
+
       - name: Download desktop sweep artifact
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
         with:
-          pattern: llm-param-sweep-desktop-${{ github.run_number }}
+          pattern: llm-param-sweep-desktop-${{ env.ARTIFACT_RUN }}
           path: combined-reports
         continue-on-error: true
 
       - name: Download mobile perf-report artifacts
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
         with:
-          pattern: perf-report-llamacpp-llm-*-${{ github.run_number }}
+          pattern: perf-report-llamacpp-llm-*-${{ env.ARTIFACT_RUN }}
           path: combined-reports
         continue-on-error: true
 
+      - name: Download baseline artifacts for comparison
+        if: inputs.compare_run_number != ''
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: llm-param-sweep-desktop-${{ inputs.compare_run_number }}
+          path: baseline-reports
+        continue-on-error: true
+
+      - name: Download baseline mobile artifacts for comparison
+        if: inputs.compare_run_number != ''
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: perf-report-llamacpp-llm-*-${{ inputs.compare_run_number }}
+          path: baseline-reports
+        continue-on-error: true
+
       - name: Render unified benchmark report
+        shell: bash
+        env:
+          ADDON_VERSION: ${{ steps.addon_ver.outputs.version }}
         run: |
           if ! find combined-reports -name "*.json" -type f 2>/dev/null | grep -q .; then
             echo "No benchmark reports found."
             exit 0
           fi
           mkdir -p benchmark-artifacts
+          # Optional flags are collected in an array; the addon version arrives via
+          # env (not inline expression expansion) so it can never be parsed as shell.
+          extra_args=()
+          if [ -n "$ADDON_VERSION" ]; then extra_args+=(--addon-version "$ADDON_VERSION"); fi
+          if [ -d baseline-reports ] && find baseline-reports -name "*.json" -type f 2>/dev/null | grep -q .; then
+            extra_args+=(--compare-dir baseline-reports)
+          fi
           node packages/llm-llamacpp/benchmarks/performance/render-report.js \
             --dir combined-reports \
-            --output benchmark-artifacts/qwen35-benchmark-findings.md
+            --output benchmark-artifacts/qwen35-benchmark-findings.md \
+            "${extra_args[@]}"
 
       - name: Add to run summary
         if: always()
diff --git a/packages/llm-llamacpp/benchmarks/performance/render-report.js b/packages/llm-llamacpp/benchmarks/performance/render-report.js
index 35da100341..13a4343d79 100644
--- a/packages/llm-llamacpp/benchmarks/performance/render-report.js
+++ b/packages/llm-llamacpp/benchmarks/performance/render-report.js
@@ -3,33 +3,44 @@
 
 // Unified benchmark report renderer for the Qwen3.5 perf benchmark.
 //
-// Reads perf JSON from --dir (recursively) and renders ONE markdown report
-// with the same shape for desktop and mobile, per Gianfranco's request:
-//   - one table per device, columns: Config | TTFT (ms) | TPS | ppTPS
-//     (only the addon runtimeStats; no Total/Prefill/Decode/Platform/Mean)
-//   - a closing "best config per device" summary (highest TPS, highest ppTPS)
-//   - crashed combos render as `Crashed` instead of metrics
+// Reads perf JSON from --dir (recursively) and renders ONE markdown report:
+//   - header with addon version, prompt size, runs-per-config (the GPU name appears only as the desktop table heading, via --desktop-device)
+//   - one table per device: Config | TTFT (ms) | TPS | ppTPS | Tokens
+//   - optional Δ columns when --compare-dir is provided (cross-run regression)
+//   - a closing "best config per device" summary
 //
-// Two input schemas are normalized:
-//   - desktop sweep:  { models: [ { modelId, cases: [ { quantization,
-//                       runtimeConfig, metrics:{ttftMsMean,tpsMean,ppTpsMean},
-//                       status, isBaseline } ] } ]   (one logical device: the
-//                       desktop GPU runner)
-//   - mobile report:  { device:{name}, results:[ { test, status,
-//                       metrics:{ttft_ms,tps,pp_tps} } ] }
+// Two input schemas are normalised:
+//   desktop sweep:  { models:[{modelId, cases:[{quantization, runtimeConfig,
+//                    metrics:{ttftMsMean,tpsMean,ppTpsMean,promptTokens,
+//                    generatedTokens}, status, isBaseline}]}], repeats, ... }
+//   mobile report:  { addon, device:{name}, results:[{test, metrics:{ttft_ms,
+//                    tps, pp_tps, generated_tokens, prompt_tokens}}] }
 
 const fs = require('fs')
 const path = require('path')
 
 function parseArgs (argv) {
-  const a = { dir: null, output: null, desktopDevice: 'Desktop (linux-x64 GPU)' }
+  const a = {
+    dir: null,
+    output: null,
+    desktopDevice: 'Desktop (linux-x64 GPU)',
+    addonVersion: null,
+    compareDir: null
+  }
   for (let i = 2; i < argv.length; i++) {
     const t = argv[i]
     if (t === '--dir') a.dir = argv[++i]
     else if (t === '--output') a.output = argv[++i]
     else if (t === '--desktop-device') a.desktopDevice = argv[++i]
+    else if (t === '--addon-version') a.addonVersion = argv[++i]
+    else if (t === '--compare-dir') a.compareDir = argv[++i]
+  }
+  if (!a.dir) {
+    throw new Error(
+      'usage: render-report.js --dir <path> [--output <md>] ' +
+      '[--desktop-device <name>] [--addon-version <ver>] [--compare-dir <path>]'
+    )
   }
-  if (!a.dir) throw new Error('usage: render-report.js --dir <path> [--output <md>] [--desktop-device <name>]')
   return a
 }
 
@@ -47,17 +58,38 @@ function num (v) {
   return typeof v === 'number' && Number.isFinite(v) ? v : null
 }
 
-// Normalize any report file into rows: { device, config, ttft, tps, ppTps, crashed }
-function rowsFromFile (file, desktopDevice) {
+function int (v) {
+  const n = num(v)
+  return n !== null ? Math.round(n) : null
+}
+
+// Collect metadata and rows from all files in a directory.
+// Returns { rows, meta } where meta = { addonVersion, repeats, promptTokens }.
+function loadDir (dir, desktopDevice) {
+  const files = walkJson(dir)
+  const meta = { addonVersion: null, repeats: null, promptTokens: null }
+  let rows = []
+  for (const f of files) {
+    const r = rowsFromFile(f, desktopDevice, meta)
+    rows.push(...r)
+  }
+  rows = dedupe(rows)
+  return { rows, meta }
+}
+
+// Normalise any report file into rows: { device, config, ttft, tps, ppTps, tokens, crashed }
+// Also fills in meta fields when found.
+function rowsFromFile (file, desktopDevice, meta) {
   let doc
   try { doc = JSON.parse(fs.readFileSync(file, 'utf8')) } catch { return [] }
   const rows = []
 
   // Desktop sweep schema
   if (Array.isArray(doc.models) && doc.models.length && Array.isArray(doc.models[0].cases)) {
+    if (num(doc.repeats) !== null && meta.repeats === null) meta.repeats = doc.repeats
     for (const model of doc.models) {
       for (const c of model.cases) {
-        if (c.isBaseline) continue // baseline duplicates a swept combo
+        if (c.isBaseline) continue
         const rc = c.runtimeConfig || {}
         const config = configLabel({
           model: `${model.modelId}-${c.quantization}`,
@@ -67,6 +99,9 @@ function rowsFromFile (file, desktopDevice) {
           cv: rc['cache-type-v']
         })
         const m = c.metrics || {}
+        if (int(m.promptTokens) !== null && meta.promptTokens === null) {
+          meta.promptTokens = int(m.promptTokens)
+        }
         const crashed = c.status && c.status !== 'ok' && c.status !== 'partial-failure'
         rows.push({
           device: desktopDevice,
@@ -74,6 +109,7 @@ function rowsFromFile (file, desktopDevice) {
           ttft: num(m.ttftMsMean),
           tps: num(m.tpsMean),
           ppTps: num(m.ppTpsMean),
+          tokens: int(m.generatedTokens),
           crashed: !!crashed
         })
       }
@@ -83,9 +119,13 @@ function rowsFromFile (file, desktopDevice) {
 
   // Mobile perf-report schema
   if (doc.device && Array.isArray(doc.results)) {
+    if (doc.addon && meta.addonVersion === null) meta.addonVersion = doc.addon
     const device = (doc.device.name || 'unknown').trim()
     for (const r of doc.results) {
       const m = r.metrics || {}
+      if (int(m.prompt_tokens) !== null && meta.promptTokens === null) {
+        meta.promptTokens = int(m.prompt_tokens)
+      }
       const crashed = (r.status && String(r.status).toLowerCase() === 'crashed') ||
         (num(m.ttft_ms) === null && num(m.tps) === null && num(m.pp_tps) === null)
       rows.push({
@@ -94,6 +134,7 @@ function rowsFromFile (file, desktopDevice) {
         ttft: num(m.ttft_ms),
         tps: num(m.tps),
         ppTps: num(m.pp_tps),
+        tokens: int(m.generated_tokens),
         crashed: !!crashed
       })
     }
@@ -111,12 +152,18 @@ function configLabel ({ model, backend, rb, ck, cv }) {
   return parts.join(' ')
 }
 
-function fmt (v) {
-  return v === null ? '-' : (Math.round(v * 100) / 100)
+function fmt (v, decimals = 2) {
+  if (v === null) return '-'
+  return (Math.round(v * Math.pow(10, decimals)) / Math.pow(10, decimals)).toFixed(decimals)
+}
+
+// Signed delta text ('-' for null); sign is read from the rounded string so -0.004 prints '+0.00' like 0.
+function fmtDelta (v) {
+  const text = fmt(v)
+  return (v === null || text.startsWith('-')) ? text : `+${text}`
 }
 
 function dedupe (rows) {
-  // last write wins per (device, config); prefer a non-crashed row with metrics
   const byKey = new Map()
   for (const r of rows) {
     const k = `${r.device}@@${r.config}`
@@ -126,34 +173,103 @@ function dedupe (rows) {
   return [...byKey.values()]
 }
 
-function render (rows, desktopDevice) {
+function buildBaselineMap (baseRows) {
+  const m = new Map()
+  for (const r of baseRows) m.set(`${r.device}@@${r.config}`, r)
+  return m
+}
+
+function render (rows, desktopDevice, meta, addonVersionArg, baselineMap) {
   const byDevice = new Map()
   for (const r of rows) {
     if (!byDevice.has(r.device)) byDevice.set(r.device, [])
     byDevice.get(r.device).push(r)
   }
-  // desktop device first, then the rest alphabetically
   const devices = [...byDevice.keys()].sort((a, b) => {
     if (a === desktopDevice) return -1
     if (b === desktopDevice) return 1
     return a.localeCompare(b)
   })
 
+  const addonVersion = addonVersionArg || meta.addonVersion || null
+  const comparing = baselineMap !== null
+
   const lines = []
   lines.push('# Qwen3.5 Benchmark Results')
   lines.push('')
-  lines.push('Metrics are the addon `runtimeStats`: TTFT (time to first token, ms), TPS (decode tokens/sec), ppTPS (prefill tokens/sec). `Crashed` means the configuration crashed or produced no output on that device.')
+
+  // Header metadata block
+  const headerParts = []
+  if (addonVersion) headerParts.push(`**Addon:** \`${addonVersion}\``)
+  if (meta.promptTokens !== null) headerParts.push(`**Prompt:** ${meta.promptTokens} tokens`)
+  if (meta.repeats !== null) headerParts.push(`**Runs per config:** ${meta.repeats}`)
+  if (headerParts.length) {
+    lines.push(headerParts.join(' · '))
+    lines.push('')
+  }
+
+  lines.push(
+    'Metrics are addon `runtimeStats`: ' +
+    'TTFT = time to first token (ms), TPS = decode tokens/sec, ' +
+    'ppTPS = prefill tokens/sec, Tokens = generated tokens.' +
+    (comparing ? ' Δ columns show current minus baseline.' : '') +
+    ' `Crashed` = configuration crashed or produced no output.'
+  )
   lines.push('')
 
+  const hasTokens = rows.some(r => r.tokens !== null)
+
   for (const device of devices) {
     const items = byDevice.get(device).slice().sort((a, b) => a.config.localeCompare(b.config))
     lines.push(`## ${device}`)
     lines.push('')
-    lines.push('| Config | TTFT (ms) | TPS | ppTPS |')
-    lines.push('| --- | ---: | ---: | ---: |')
-    for (const r of items) {
-      if (r.crashed) lines.push(`| ${r.config} | Crashed | Crashed | Crashed |`)
-      else lines.push(`| ${r.config} | ${fmt(r.ttft)} | ${fmt(r.tps)} | ${fmt(r.ppTps)} |`)
+
+    if (comparing) {
+      const hdr = hasTokens
+        ? '| Config | TTFT (ms) | Δ TTFT | TPS | Δ TPS | ppTPS | Δ ppTPS | Tokens |'
+        : '| Config | TTFT (ms) | Δ TTFT | TPS | Δ TPS | ppTPS | Δ ppTPS |'
+      const sep = hasTokens
+        ? '| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |'
+        : '| --- | ---: | ---: | ---: | ---: | ---: | ---: |'
+      lines.push(hdr)
+      lines.push(sep)
+      for (const r of items) {
+        const b = baselineMap.get(`${r.device}@@${r.config}`)
+        if (r.crashed) {
+          const crash = hasTokens
+            ? `| ${r.config} | Crashed | - | Crashed | - | Crashed | - | - |`
+            : `| ${r.config} | Crashed | - | Crashed | - | Crashed | - |`
+          lines.push(crash)
+        } else {
+          const dTtft = (b && !b.crashed && r.ttft !== null && b.ttft !== null) ? r.ttft - b.ttft : null
+          const dTps = (b && !b.crashed && r.tps !== null && b.tps !== null) ? r.tps - b.tps : null
+          const dPp = (b && !b.crashed && r.ppTps !== null && b.ppTps !== null) ? r.ppTps - b.ppTps : null
+          const row = hasTokens
+            ? `| ${r.config} | ${fmt(r.ttft)} | ${fmtDelta(dTtft)} | ${fmt(r.tps)} | ${fmtDelta(dTps)} | ${fmt(r.ppTps)} | ${fmtDelta(dPp)} | ${r.tokens !== null ? r.tokens : '-'} |`
+            : `| ${r.config} | ${fmt(r.ttft)} | ${fmtDelta(dTtft)} | ${fmt(r.tps)} | ${fmtDelta(dTps)} | ${fmt(r.ppTps)} | ${fmtDelta(dPp)} |`
+          lines.push(row)
+        }
+      }
+    } else {
+      const hdr = hasTokens
+        ? '| Config | TTFT (ms) | TPS | ppTPS | Tokens |'
+        : '| Config | TTFT (ms) | TPS | ppTPS |'
+      const sep = hasTokens
+        ? '| --- | ---: | ---: | ---: | ---: |'
+        : '| --- | ---: | ---: | ---: |'
+      lines.push(hdr)
+      lines.push(sep)
+      for (const r of items) {
+        if (r.crashed) {
+          lines.push(hasTokens
+            ? `| ${r.config} | Crashed | Crashed | Crashed | - |`
+            : `| ${r.config} | Crashed | Crashed | Crashed |`)
+        } else {
+          lines.push(hasTokens
+            ? `| ${r.config} | ${fmt(r.ttft)} | ${fmt(r.tps)} | ${fmt(r.ppTps)} | ${r.tokens !== null ? r.tokens : '-'} |`
+            : `| ${r.config} | ${fmt(r.ttft)} | ${fmt(r.tps)} | ${fmt(r.ppTps)} |`)
+        }
+      }
     }
     lines.push('')
   }
@@ -176,17 +292,23 @@ function render (rows, desktopDevice) {
 
 function main () {
   const args = parseArgs(process.argv)
-  const files = walkJson(args.dir)
-  let rows = []
-  for (const f of files) rows.push(...rowsFromFile(f, args.desktopDevice))
-  rows = dedupe(rows)
+
+  const { rows, meta } = loadDir(args.dir, args.desktopDevice)
+
+  let baselineMap = null
+  if (args.compareDir) {
+    const { rows: baseRows } = loadDir(args.compareDir, args.desktopDevice)
+    baselineMap = buildBaselineMap(baseRows)
+  }
+
   if (rows.length === 0) {
     const msg = 'No benchmark results found.\n'
     if (args.output) fs.writeFileSync(args.output, msg)
     else process.stdout.write(msg)
     return
   }
-  const md = render(rows, args.desktopDevice)
+
+  const md = render(rows, args.desktopDevice, meta, args.addonVersion, baselineMap)
   if (args.output) fs.writeFileSync(args.output, md)
   else process.stdout.write(md)
 }