tetherto · donriddo · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
@@ -6,7 +6,8 @@ name: Benchmark Performance — LLM Parameter Sweep
 #
 # To change what runs, edit:
 #   desktop: models.manifest.json (models) + llm-parameter-sweep.config.js (sweep dims)
-#   mobile:  mobile.config.json
+#   mobile:  test/integration/_benchmark-perf.js (shared runner) + the
+#            benchmark-perf-*.test.js shards (one per model x KV-cache type)
 
 on:
   workflow_dispatch:
@@ -25,6 +26,19 @@ on:
         required: false
         default: true
         type: boolean
+      summarize_only:
+        description: "Re-render report from a previous run's artifacts (skips benchmarks)"
+        required: false
+        default: false
+        type: boolean
+      artifact_run_number:
+        description: "Run number to pull artifacts from when summarize_only=true (e.g. 9)"
+        required: false
+        type: string
+      compare_run_number:
+        description: "Run number of a baseline run to diff against (shows Δ TPS / Δ TTFT columns)"
+        required: false
+        type: string
 
 permissions:
   contents: read
@@ -90,7 +104,7 @@ jobs:
     needs:
       - context
       - label-gate
-    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop
+    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop && !inputs.summarize_only
     name: Desktop Parameter Sweep
     runs-on: ai-run-linux-gpu
     timeout-minutes: 360
@@ -123,6 +137,12 @@ jobs:
       - name: Setup LLVM
         uses: tetherto/qvac/.github/actions/setup-llvm@98a6a6b6e8f3866dfdd75052a4071269ce85dc41
 
+      - name: Setup Vulkan SDK
+        uses: tetherto/qvac/.github/actions/setup-vulkan-sdk@0bbdca93da303a0b1634ba14a89cec085621078d
+        with:
+          platform: linux
+          arch: x64
+
       - name: Build addon from source
         working-directory: packages/llm-llamacpp
         run: |
@@ -157,21 +177,6 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils
 
-          echo "Installing Vulkan SDK (latest)..."
-          wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
-          mkdir -p "$HOME/vulkan" && cd "$HOME/vulkan"
-          tar xf /tmp/vulkansdk.tar.xz --strip-components=1
-          export VULKAN_SDK="$HOME/vulkan/x86_64"
-          export PATH="$VULKAN_SDK/bin:$PATH"
-          export LD_LIBRARY_PATH="$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
-          export VK_ADD_LAYER_PATH="$VULKAN_SDK/share/vulkan/explicit_layer.d"
-          export PKG_CONFIG_PATH="$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
-          echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV
-          echo "VK_ADD_LAYER_PATH=$VK_ADD_LAYER_PATH" >> $GITHUB_ENV
-          echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH" >> $GITHUB_ENV
-
           cd "$GITHUB_WORKSPACE/packages/llm-llamacpp"
           npm install
 
@@ -186,6 +191,13 @@ jobs:
 
           echo "=== Build complete ==="
 
+      - name: Detect GPU
+        id: gpu
+        shell: bash
+        run: |
+          gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | tr -d '\r') || gpu_name=""  # on failure discard any partial output so the value stays single-line
+          echo "name=${gpu_name:-GPU}" >> "$GITHUB_OUTPUT"
+
       - name: Install benchmark dependencies
         working-directory: packages/llm-llamacpp/benchmarks/performance
         run: npm install
@@ -199,24 +211,8 @@ jobs:
         working-directory: packages/llm-llamacpp/benchmarks/performance
         run: bare ./llm-parameter-sweep.js --addon-source local
 
-      - name: Add job summary
-        if: always()
-        working-directory: packages/llm-llamacpp/benchmarks/performance
-        shell: bash
-        run: |
-          LATEST_MD=$(find results/parameter-sweep -name "*.md" -type f 2>/dev/null | sort | tail -1)
-          {
-            echo "## LLM Parameter Sweep — Desktop"
-            echo ""
-            echo "ref: \`${{ needs.context.outputs.ref }}\`"
-            echo ""
-            if [ -n "${LATEST_MD:-}" ]; then
-              cat "$LATEST_MD"
-            else
-              echo "No results file found."
-            fi
-          } >> "$GITHUB_STEP_SUMMARY"
-
+      # The run summary is rendered by the summarize job (unified desktop +
+      # mobile view); this job just uploads the raw sweep results.
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
@@ -225,24 +221,226 @@ jobs:
           path: packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep/
           retention-days: 90
           if-no-files-found: ignore
+        # NOTE(review): step-level env is only visible to this upload step, which has no
+        # documented use for DESKTOP_GPU — confirm which step/job should receive the GPU name.
+        env:
+          DESKTOP_GPU: ${{ steps.gpu.outputs.name }}
 
+  # Mobile is sharded one group per (model x KV-cache type) = 30 groups so each
+  # Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling.
+  # All 30 in one reused-workflow call do NOT fit: Android serializes the runs
+  # against its device pool (>240 min) and the macOS runner fills its disk
+  # collecting 30 runs' logs. So we split the groups into three batches by
+  # KV-cache type (10 each — the proven in-budget load) and run them
+  # sequentially (max-parallel: 1) to avoid Device Farm pool contention. Each
+  # batch raises the job timeout to 180 for headroom (proven 10-shard wall ~119
+  # min) and gets a distinct artifact_suffix so its perf-report doesn't collide;
+  # summarize aggregates all three. These wrappers are deliberately absent from
+  # the addon's test-groups.json, so this override is the only path that runs
+  # them — normal mobile integration runs never trigger the benchmark.
   mobile-benchmark:
     needs:
       - context
       - prebuild
       - label-gate
-    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile
+    if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile && !inputs.summarize_only
     permissions:
       contents: read
       packages: read
       pull-requests: write
       id-token: write
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - cache: f16
+            groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
+          - cache: q8_0
+            groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
+          - cache: q4_0
+            groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
     uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
     secrets: inherit
     with:
       repository: ${{ github.repository }}
       ref: ${{ needs.context.outputs.ref }}
-      # Schedule only the benchmark group. runBenchmarkPerfTest is deliberately
-      # absent from the addon's test-groups.json, so this override is the only
-      # path that runs it — normal mobile integration runs never trigger it.
-      test_groups: '[{"name":"benchmarkPerf","grep":"runBenchmarkPerfTest"}]'
+      job_timeout_minutes: 180
+      test_groups: ${{ matrix.groups }}
+      artifact_suffix: ${{ matrix.cache }}-
+
+  # Aggregates desktop + mobile artifacts into one unified markdown report.
+  # Runs after benchmarks finish, or standalone when summarize_only=true
+  # (pass artifact_run_number to pull results from a previous run).
+  # Pass compare_run_number to show Δ TPS / Δ TTFT regression columns.
+  summarize:
+    needs:
+      - context
+      - label-gate
+      - desktop-benchmark
+      - mobile-benchmark
+    if: needs.label-gate.outputs.authorised == 'true' && always() && needs.context.result == 'success'
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      # Needed to list workflow runs and download artifacts from a different run.
+      actions: read
+    env:
+      # Run NUMBER whose artifacts are summarized: artifact_run_number only
+      # when summarize_only is set, otherwise the current run.
+      ARTIFACT_RUN: ${{ inputs.summarize_only && inputs.artifact_run_number || github.run_number }}
+    steps:
+      # download-artifact only looks in the current run unless it is given a
+      # run-id (a run ID, not a run number) plus a github-token, so translate
+      # the run-number inputs into run IDs first.
+      - name: Resolve artifact source runs
+        id: runs
+        shell: bash
+        env:
+          GH_TOKEN: ${{ github.token }}
+          WORKFLOW_REF: ${{ github.workflow_ref }}
+          CURRENT_RUN_ID: ${{ github.run_id }}
+          CURRENT_RUN_NUMBER: ${{ github.run_number }}
+          COMPARE_RUN: ${{ inputs.compare_run_number }}
+        run: |
+          # owner/repo/.github/workflows/<file>@<ref> -> <file>
+          workflow_file="${WORKFLOW_REF%%@*}"
+          workflow_file="${workflow_file##*/}"
+
+          # Prints the run ID for run number $1 (nothing if invalid or not found).
+          resolve_run_id() {
+            [[ "$1" =~ ^[0-9]+$ ]] || return 0
+            gh api --paginate \
+              "repos/${GITHUB_REPOSITORY}/actions/workflows/${workflow_file}/runs?per_page=100" \
+              --jq ".workflow_runs[] | select(.run_number == $1) | .id"
+          }
+
+          artifact_run_id="$CURRENT_RUN_ID"
+          if [ "$ARTIFACT_RUN" != "$CURRENT_RUN_NUMBER" ]; then
+            artifact_run_id=$(resolve_run_id "$ARTIFACT_RUN")
+            if [ -z "$artifact_run_id" ]; then
+              echo "::error::No run #$ARTIFACT_RUN of $workflow_file found to pull artifacts from."
+              exit 1
+            fi
+          fi
+
+          compare_run_id=""
+          if [ -n "$COMPARE_RUN" ]; then
+            compare_run_id=$(resolve_run_id "$COMPARE_RUN")
+            if [ -z "$compare_run_id" ]; then
+              echo "::warning::No run #$COMPARE_RUN of $workflow_file found; skipping baseline comparison."
+            fi
+          fi
+
+          {
+            echo "artifact_run_id=$artifact_run_id"
+            echo "compare_run_id=$compare_run_id"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Checkout aggregator
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+        with:
+          ref: ${{ needs.context.outputs.ref }}
+          sparse-checkout: |
+            packages/llm-llamacpp/benchmarks/performance/render-report.js
+
+      - name: Setup Node.js
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
+        with:
+          node-version: lts/*
+
+      - name: Get addon version
+        id: addon_ver
+        shell: bash
+        run: |
+          ver=$(node -e "process.stdout.write(require('./packages/llm-llamacpp/package.json').version)" 2>/dev/null || true)
+          echo "version=${ver:+@qvac/llm-llamacpp@$ver}" >> "$GITHUB_OUTPUT"
+
+      - name: Download desktop sweep artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: llm-param-sweep-desktop-${{ env.ARTIFACT_RUN }}
+          path: combined-reports
+          run-id: ${{ steps.runs.outputs.artifact_run_id }}
+          github-token: ${{ github.token }}
+        continue-on-error: true
+
+      - name: Download mobile perf-report artifacts
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: perf-report-llamacpp-llm-*-${{ env.ARTIFACT_RUN }}
+          path: combined-reports
+          run-id: ${{ steps.runs.outputs.artifact_run_id }}
+          github-token: ${{ github.token }}
+        continue-on-error: true
+
+      - name: Download baseline artifacts for comparison
+        if: steps.runs.outputs.compare_run_id != ''
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: llm-param-sweep-desktop-${{ inputs.compare_run_number }}
+          path: baseline-reports
+          run-id: ${{ steps.runs.outputs.compare_run_id }}
+          github-token: ${{ github.token }}
+        continue-on-error: true
+
+      - name: Download baseline mobile artifacts for comparison
+        if: steps.runs.outputs.compare_run_id != ''
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: perf-report-llamacpp-llm-*-${{ inputs.compare_run_number }}
+          path: baseline-reports
+          run-id: ${{ steps.runs.outputs.compare_run_id }}
+          github-token: ${{ github.token }}
+        continue-on-error: true
+
+      - name: Render unified benchmark report
+        shell: bash
+        env:
+          # Passed via env rather than inlined so the value is never parsed as shell.
+          ADDON_VERSION: ${{ steps.addon_ver.outputs.version }}
+        run: |
+          # -quit stops at the first hit, so there is no early-closed pipe for
+          # pipefail to misreport as "no reports" when many files are present.
+          has_json() { [ -n "$(find "$1" -name '*.json' -type f -print -quit 2>/dev/null)" ]; }
+
+          if ! has_json combined-reports; then
+            echo "No benchmark reports found."
+            exit 0
+          fi
+          mkdir -p benchmark-artifacts
+
+          extra_args=()
+          if [ -n "$ADDON_VERSION" ]; then
+            extra_args+=(--addon-version "$ADDON_VERSION")
+          fi
+          if has_json baseline-reports; then
+            extra_args+=(--compare-dir baseline-reports)
+          fi
+
+          node packages/llm-llamacpp/benchmarks/performance/render-report.js \
+            --dir combined-reports \
+            --output benchmark-artifacts/qwen35-benchmark-findings.md \
+            "${extra_args[@]}"
+
+      - name: Add to run summary
+        if: always()
+        shell: bash
+        run: |
+          set +e
+          MD_FILE="benchmark-artifacts/qwen35-benchmark-findings.md"
+          if [ -f "$MD_FILE" ]; then
+            cat "$MD_FILE" >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "No consolidated benchmark report available." >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      - name: Upload consolidated report
+        if: always()
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
+        with:
+          name: qwen35-benchmark-findings-${{ github.run_number }}
+          path: benchmark-artifacts/
+          retention-days: 90
+          if-no-files-found: ignore
@@ -41,6 +41,16 @@ on:
         type: string
         required: false
         default: ""
+      job_timeout_minutes:
+        description: "Override the build-and-test job timeout (minutes). Default 120. Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom."
+        type: number
+        required: false
+        default: 120
+      artifact_suffix:
+        description: "Optional prefix inserted into the perf-report artifact name so multiple invocations in one run (e.g. Benchmark batches) don't collide. Default empty keeps the existing name."
+        type: string
+        required: false
+        default: ""
   workflow_dispatch:
     inputs:
       ref:
@@ -74,7 +84,7 @@ jobs:
     name: Build ${{ matrix.platform }} and Run E2E Tests
     runs-on: ${{ matrix.runner }}
     environment: release
-    timeout-minutes: 120
+    timeout-minutes: ${{ inputs.job_timeout_minutes || 120 }}
     continue-on-error: true
     permissions:
       contents: read
@@ -217,7 +227,7 @@ jobs:
           platform: ${{ matrix.platform }}
           merge: 'true'
           unzip-customer-artifacts: 'true'
-          artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
+          artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}
 
       - name: Comment results on PR
         if: always() && !cancelled()

@@ -5,7 +5,9 @@ const path = require('bare-path')
 const { round, average, stddev, cartesianProduct } = require('./math')
 const { stripSurroundingQuotes, normalizeArgValue } = require('./utils')
 
-const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill']
+// Focused WB run uses a single ~512-token prompt. Add 'ctx-filling' /
+// 'span-fill' back to also sweep context-fill and batch-spanning prompts.
+const PROMPT_CASES = ['long']
 const PROMPTS_PER_CASE = 1
 
 const SWEEP_OVERRIDE_KEYS = [
@@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [
   'ubatch-size',
   'flash-attn',
   'cache-type-k',
-  'cache-type-v'
+  'cache-type-v',
+  'reasoning-budget'
 ]
 
 function splitCsvArg (value, key) {
@@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) {
   const threadsValues = sweep.threads || []
   const cacheTypeKValues = sweep['cache-type-k'] || []
   const cacheTypeVValues = sweep['cache-type-v'] || []
+  const reasoningBudgetValues = sweep['reasoning-budget'] || []
 
   const cases = []
   for (const promptCase of PROMPT_CASES) {
@@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) {
   if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 &&
       flashAttnValues.length > 0 &&
       threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) {
+    const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null]
     const combos = cartesianProduct([
       supportedQuants,
       devices,
@@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) {
       flashAttnValues,
       threadsValues,
       cacheTypeKValues,
-      cacheTypeVValues
+      cacheTypeVValues,
+      rbValues
     ])
 
-    for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) {
+    for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) {
       if (Number(ubatchSize) > Number(batchSize)) {
         continue // Skip combinations where ubatchSize is greater than batchSize
       }
@@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) {
         'cache-type-k': cacheTypeK,
         'cache-type-v': cacheTypeV
       }
+      if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget
 
-      const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}`
+      const rbSuffix = reasoningBudget !== null ? `__rb=${reasoningBudget}` : ''
+      const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}`
 
       for (const promptCase of PROMPT_CASES) {
         cases.push({
@@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) {
   const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null)
   const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null)
   const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null)
+  const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null)
   const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null
   const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null
 
@@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) {
     ttftMsStd: round(stddev(ttftMsValues), 3),
     tpsMean: round(average(tpsValues), 3),
     tpsStd: round(stddev(tpsValues), 3),
+    ppTpsMean: round(average(ppTpsValues), 3),
+    ppTpsStd: round(stddev(ppTpsValues), 3),
     promptTokens: firstPromptTokens,
     generatedTokens: firstGeneratedTokens
   }