tetherto · donriddo · Jun 8, 2026 · Jun 8, 2026 · Jun 9, 2026 · Jun 9, 2026
@@ -41,6 +41,16 @@ on:
         type: string
         required: false
         default: ""
+      job_timeout_minutes:
+        description: "Override the build-and-test job timeout (minutes). Default 150. Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom."
+        type: number
+        required: false
+        default: 150
+      artifact_suffix:
+        description: "Optional segment inserted into the perf-report artifact name immediately before the platform segment, so multiple invocations in one run (e.g. Benchmark batches) don't collide. It is concatenated verbatim with no separator added, so include a trailing '-' (e.g. 'batch1-'). Default empty keeps the existing name."
+        type: string
+        required: false
+        default: ""
       pre_build_script:
         description: "Optional node script (path under packages/llm-llamacpp) run before the mobile build to bootstrap a benchmark into the framework (stage files + regenerate the test list). Default '' = no-op."
         type: string
@@ -89,7 +99,7 @@ jobs:
     name: Build ${{ matrix.platform }} and Run E2E Tests
     runs-on: ${{ matrix.runner }}
     environment: release
-    timeout-minutes: 150
+    timeout-minutes: ${{ inputs.job_timeout_minutes || 150 }}
     continue-on-error: true
     permissions:
       contents: read
@@ -141,6 +151,20 @@ jobs:
           prebuild-artifact-prefix: 'llama-cpp-'
           pat-token: ${{ secrets.PAT_TOKEN }}
 
+      # The mobile perf benchmark shards (benchmark-perf-*.test.js) are not
+      # committed — they are generated from test/integration/_benchmark-matrix.js.
+      # Regenerate them after setup (which provisions Node) but before the
+      # bundle is built, so the Device Farm app contains them, then hard-fail if
+      # any are still missing. This makes it impossible to build the bundle
+      # without the shards present. (Runs for every LLM mobile run; for
+      # non-benchmark runs the generated shards are simply skipped, like any
+      # other mobile-only test.)
+      - name: Generate benchmark shards
+        working-directory: addon/packages/llm-llamacpp
+        run: |
+          npm run generate:benchmark-shards
+          node scripts/generate-benchmark-shards.js --assert-shards
+
       # ── Benchmark bootstrap (additive, opt-in via pre_build_script) ──────────
       # When a caller passes pre_build_script (only the VLM benchmark does), run the
       # bootstrap: optionally pull fixture assets from the caller-supplied object-store
@@ -266,7 +290,7 @@ jobs:
           platform: ${{ matrix.platform }}
           merge: 'true'
           unzip-customer-artifacts: 'true'
-          artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
+          artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}
 
       - name: Comment results on PR
         if: always() && !cancelled()

@@ -8,6 +8,9 @@ prebuilds/
 
 test/unit/all.js
 test/integration/all.js
+# Mobile perf benchmark shards — generated from test/integration/_benchmark-matrix.js
+# by scripts/generate-benchmark-shards.js. Never commit them.
+test/integration/benchmark-perf-*.test.js
 test/model/
 test/results/
 .npmrc

@@ -7,6 +7,7 @@ Full-factorial parameter sweep for `@qvac/llm-llamacpp`, measuring TTFT, TPS, an
 - [Addon Source](#addon-source)
 - [Setup](#setup)
 - [Quick Start](#quick-start)
+- [CI Workflow (GitHub Actions)](#ci-workflow-github-actions)
 - [Sweep Flags](#sweep-flags)
 - [Prompt Cases](#prompt-cases)
 - [Judge Pass](#judge-pass)
@@ -62,22 +63,106 @@ npm run run:param-sweep -- \
 npm run run:judge
 ```
 
+## CI Workflow (GitHub Actions)
+
+Everything above runs locally. To run the benchmark on CI runners + AWS Device
+Farm (desktop **and** mobile), use the **Benchmark Performance — LLM Parameter
+Sweep** workflow (`.github/workflows/benchmark-perf-llm-llamacpp.yml`).
+
+Trigger it from the GitHub UI: **Actions → Benchmark Performance — LLM Parameter
+Sweep → Run workflow**. There is nothing to configure for a normal run — the
+matrix (models, quantizations, reasoning-budget, KV-cache types, repeats) is
+fixed in the scripts; edit those to change what runs.
+
+The **mobile** sweep runs one Device Farm session per
+`(size, quant, KV-cache)` combination. Those combinations live in a single
+source of truth, `test/integration/_benchmark-matrix.js`. The per-combination
+test files (`test/integration/benchmark-perf-*.test.js`) and the workflow's
+mobile `test_groups` are both derived from it. The shard files are **not
+committed** — regenerate them with `npm run generate:benchmark-shards` (the CI
+mobile job does this automatically before the Device Farm bundle is built, and
+fails hard if any shard is missing). To change the mobile grid, edit
+`_benchmark-matrix.js`, run `npm run generate:benchmark-shards` and
+`npm run test:mobile:generate`, then update the workflow groups from
+`node scripts/generate-benchmark-shards.js --groups` and commit
+`integration.auto.cjs`. `npm run verify:benchmark-shards` checks they are all in
+sync.
+
+### Inputs
+
+| Input | Default | Purpose |
+|-------|---------|---------|
+| `ref` | launch branch | Branch/tag/SHA of the benchmark code + addon to build and run |
+| `run_desktop` | `true` | Run the desktop sweep (Linux GPU runner) |
+| `run_mobile` | `true` | Run the mobile sweep (Android + iOS via Device Farm) |
+| `summarize_only` | `false` | Re-render a previous run's report in ~1 min, skipping the ~6 h benchmarks. Needs `artifact_run_id` |
+| `artifact_run_id` | — | Previous run ID to re-render (the number in that run's URL). Only with `summarize_only` |
+| `compare_run_id` | — | Baseline run ID to diff against — adds Δ TTFT / TPS / ppTPS columns |
+
+Run IDs are the number in a run's URL (`.../actions/runs/<run_id>`). A fresh
+run needs no `artifact_run_id` — leave it blank. `compare_run_id` is optional
+on any run and only adds the baseline diff.
+
+### Recipes
+
+| Goal | Inputs |
+|------|--------|
+| Fresh full benchmark (desktop + mobile) | *(leave all inputs at their defaults)* |
+| Desktop only | `run_mobile = false` |
+| Mobile only | `run_desktop = false` |
+| Benchmark a specific code version | `ref = <branch/tag/SHA>` |
+| Re-render a finished run's report | `summarize_only = true`, `artifact_run_id = <run>` |
+| Compare two runs (regression check) | `summarize_only = true`, `artifact_run_id = <new run>`, `compare_run_id = <baseline run>` |
+| Fresh run that also diffs vs a baseline | `compare_run_id = <baseline run>` |
+
+The comparison downloads both runs' artifacts and prints a `Δ` for every
+metric, e.g. `122.37 ± 0.62 | -0.52` (current value ± stddev, then the delta vs
+baseline). It works against **any** two runs.
+
+### What the report contains
+
+Rendered into the run summary of the `summarize` job and uploaded as the
+`qwen35-benchmark-findings-<n>` artifact. One table per device, identical shape
+for desktop and mobile:
+
+- **Header** — addon version, prompt size, repeats per config (e.g.
+  `desktop=5, mobile=3`). The version is recorded into the run's artifacts at
+  benchmark time, so it is always the version that actually ran and a
+  comparison auto-reads each run's own version (nothing to type, nothing to get
+  wrong).
+- **Columns** — `TTFT (ms) | TPS | ppTPS | Tokens`, each as `mean ± stddev`
+  across the repeats (plus `Δ` columns when comparing).
+- **Desktop device** — shows the detected GPU (e.g. `Desktop (NVIDIA RTX …)`),
+  preserved on re-renders.
+- **`Crashed`** — a configuration that crashed or produced no output on that
+  device (e.g. quantized KV cache on Adreno GPUs).
+- **Best configuration per device** — highest TPS and highest ppTPS.
+
+> Note: the table shape is identical across desktop and mobile, but the
+> generation length differs — desktop caps at `n-predict` 1024 tokens, mobile
+> at 512. The rate metrics (TPS, ppTPS) stay comparable; the `Tokens` column
+> and absolute TTFT reflect those different caps.
+
 ## Sweep Flags
 
 All sweep dimensions accept comma-separated values for a full-factorial grid.
 
+Defaults below are the focused set currently pinned in
+`llm-parameter-sweep.config.js` (`PARAMETER_SWEEP`). Pass a flag with
+comma-separated values to widen any dimension into the full grid.
+
 | Flag | Type | Default | Description |
 |------|------|---------|-------------|
 | `--models` | `str` | All in manifest | Comma-separated model IDs |
-| `--quantization` | `str` | `Q4_0,Q4_K_M,Q8_0,F16` | Quantization levels |
-| `--device` | `str` | `gpu` | `gpu`, `cpu` |
+| `--quantization` | `str` | `Q4_0,Q4_1,Q4_K_M,Q6_K,Q8_0` | Quantization levels |
+| `--reasoning-budget` | `str` | `-1,0` | Reasoning budget values |
+| `--device` | `str` | `gpu` (desktop) | `gpu`, `cpu` |
 | `--ctx-size` | `str` | `2048` | Context sizes |
-| `--batch-size` | `str` | `512,2048` | Batch sizes |
-| `--ubatch-size` | `str` | `128,512` | Micro-batch sizes (must be <= batch-size) |
-| `--threads` | `str` | `2,4,8` | Thread counts |
-| `--flash-attn` | `str` | `off,on` | Flash attention |
-| `--cache-type-k` | `str` | `f16,q8_0,q4_0` | KV cache key type |
-| `--cache-type-v` | `str` | `f16,q8_0,q4_0` | KV cache value type |
+| `--batch-size` | `str` | `512` | Batch sizes |
+| `--ubatch-size` | `str` | `512` | Micro-batch sizes (must be <= batch-size) |
+| `--threads` | `str` | `4` | Thread counts |
+| `--flash-attn` | `str` | `off` | Flash attention |
+| `--cache-type-k` | `str` | `f16` | KV cache key type |
+| `--cache-type-v` | `str` | `f16` | KV cache value type |
 | `--repeats` | `int` | `5` | Repeats per case |
 | `--results-dir` | `str` | `results/parameter-sweep/` | Output directory |
 | `--prompts-file` | `str` | `test-prompts.json` | Prompts file path |
@@ -86,11 +171,14 @@ All sweep dimensions accept comma-separated values for full-factorial grid.
 
 ## Prompt Cases
 
-Each parameter combination runs three prompt cases:
+The sweep currently runs a single prompt case, `long` (the focused ~512-token
+benchmark prompt) — `PROMPT_CASES = ['long']` in `case-runner.js`. The
+`ctx-filling` / `span-fill` fixtures below still exist in `test-prompts.json`
+and can be re-enabled by extending `PROMPT_CASES`.
 
 | Case | Description | Prompt Selection |
 |------|-------------|-----------------|
-| `long` | Long-output generation | Static `long` prompt |
+| `long` | Long-output generation (active) | Static `long` prompt |
 | `ctx-filling` | Maximizes context fill | `ctx-filling__ctx={ctx-size}` |
 | `span-fill` | Spans multiple prefill batches | `batch-spanning__ctx={ctx-size}__bs={batch-size}` |
 

@@ -5,7 +5,9 @@ const path = require('bare-path')
 const { round, average, stddev, cartesianProduct } = require('./math')
 const { stripSurroundingQuotes, normalizeArgValue } = require('./utils')
 
-const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill']
+// The focused sweep uses a single ~512-token prompt. Add 'ctx-filling' /
+// 'span-fill' back to also sweep context-fill and batch-spanning prompts.
+const PROMPT_CASES = ['long']
 const PROMPTS_PER_CASE = 1
 
 const SWEEP_OVERRIDE_KEYS = [
@@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [
   'ubatch-size',
   'flash-attn',
   'cache-type-k',
-  'cache-type-v'
+  'cache-type-v',
+  'reasoning-budget'
 ]
 
 function splitCsvArg (value, key) {
@@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) {
   const threadsValues = sweep.threads || []
   const cacheTypeKValues = sweep['cache-type-k'] || []
   const cacheTypeVValues = sweep['cache-type-v'] || []
+  const reasoningBudgetValues = sweep['reasoning-budget'] || []
 
   const cases = []
   for (const promptCase of PROMPT_CASES) {
@@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) {
   if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 &&
       flashAttnValues.length > 0 &&
       threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) {
+    const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null]
     const combos = cartesianProduct([
       supportedQuants,
       devices,
@@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) {
       flashAttnValues,
       threadsValues,
       cacheTypeKValues,
-      cacheTypeVValues
+      cacheTypeVValues,
+      rbValues
     ])
 
-    for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) {
+    for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) {
       if (Number(ubatchSize) > Number(batchSize)) {
         continue // Skip combinations where ubatchSize is greater than batchSize
       }
@@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) {
         'cache-type-k': cacheTypeK,
         'cache-type-v': cacheTypeV
       }
+      if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget
 
-      const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}`
+      const rbSuffix = reasoningBudget !== null ? `__rb=${reasoningBudget}` : ''
+      const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}`
 
       for (const promptCase of PROMPT_CASES) {
         cases.push({
@@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) {
   const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null)
   const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null)
   const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null)
+  const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null)
   const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null
   const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null
 
@@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) {
     ttftMsStd: round(stddev(ttftMsValues), 3),
     tpsMean: round(average(tpsValues), 3),
     tpsStd: round(stddev(tpsValues), 3),
+    ppTpsMean: round(average(ppTpsValues), 3),
+    ppTpsStd: round(stddev(ppTpsValues), 3),
     promptTokens: firstPromptTokens,
     generatedTokens: firstGeneratedTokens
   }

@@ -3,10 +3,6 @@
 const fs = require('bare-fs')
 const path = require('bare-path')
 const os = require('bare-os')
-const {
-  DEFAULT_SWEEP_CTX_SIZES,
-  DEFAULT_SWEEP_BATCH_SIZES
-} = require('./utils')
 
 const DEFAULT_RESULTS_DIR = path.resolve(__dirname, 'results', 'parameter-sweep')
 const DEFAULT_MODELS_DIR = path.resolve(__dirname, 'models')
@@ -94,17 +90,20 @@ function loadModelsFromManifest () {
 
 const MODELS = loadModelsFromManifest()
 
-// Parameter sweep: full factorial (cartesian product)
+// Parameter sweep (cartesian product). Tuned to the focused sweep:
+// only quantization and reasoning-budget vary; every other dimension is
+// pinned to a single value. Edit these arrays to sweep more dimensions.
 const PARAMETER_SWEEP = {
-  quantization: ['Q4_0', 'Q4_K_M', 'Q8_0', 'F16'],
+  quantization: ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'],
   device: getDefaultSweepDevices(),
-  'ctx-size': DEFAULT_SWEEP_CTX_SIZES.map(String),
-  threads: ['2', '4', '8'],
-  'batch-size': DEFAULT_SWEEP_BATCH_SIZES.map(String), // max: 10k
-  'ubatch-size': ['128', '512'], // must be <= batch-size
-  'flash-attn': ['off', 'on'],
-  'cache-type-k': ['f16', 'q8_0', 'q4_0'],
-  'cache-type-v': ['f16', 'q8_0', 'q4_0']
+  'ctx-size': ['2048'],
+  threads: ['4'],
+  'batch-size': ['512'],
+  'ubatch-size': ['512'],
+  'flash-attn': ['off'],
+  'cache-type-k': ['f16'],
+  'cache-type-v': ['f16'],
+  'reasoning-budget': ['-1', '0']
   // verbosity: fixed at '0' (not swept)
 }