Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
70e7db6
chore[skiplog]: Qwen3.5 LLM performance benchmark suite (desktop + mo…
donriddo Jun 8, 2026
df10434
fix: add warmup run to desktop sweep so cold-start doesn't skew TTFT/…
donriddo Jun 8, 2026
79004c4
fix: retry transient model-download failures in the benchmark
donriddo Jun 9, 2026
957a817
refactor: align benchmark download retry with the addon's existing he…
donriddo Jun 9, 2026
6181ebd
chore: simplify benchmark report and progress helpers
donriddo Jun 9, 2026
1bedc7d
doc: fix artifact_suffix input description wording
donriddo Jun 9, 2026
c77408b
test[skiplog]: generate mobile perf benchmark shards from a matrix
donriddo Jun 9, 2026
8218111
fix: generate benchmark shards after Node is provisioned
donriddo Jun 9, 2026
bf633af
chore: inline single-use shard filename set
donriddo Jun 10, 2026
85936d0
fix: self-heal benchmark shards before mobile test generation
donriddo Jun 10, 2026
c6b0be0
chore: generalize benchmark sweep comments
donriddo Jun 10, 2026
00ce18e
fix: fail benchmark summary when no report data exists
donriddo Jun 10, 2026
ff6ff01
feat: report mobile shard coverage in the benchmark report
donriddo Jun 10, 2026
1ff9adc
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 10, 2026
323cfcd
fix: fail the benchmark report when there are no data rows
donriddo Jun 10, 2026
78026d7
fix: check out package.json in the summarize job
donriddo Jun 10, 2026
e6805d7
feat: add config legend and missing-data warnings to the report
donriddo Jun 10, 2026
b7db817
fix: record crash placeholders for both devices before the run loop
donriddo Jun 10, 2026
a57bd8c
fix: guard empty baseline comparison and harden workflow input handling
donriddo Jun 10, 2026
c963353
feat: add TurboQuant and PolarQuant KV-cache types to the benchmark
donriddo Jun 10, 2026
e6e6911
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 10, 2026
d9e1509
fix: fail when a requested baseline comparison has no data
donriddo Jun 10, 2026
7463be1
docs: correct the matrix dimensions in the generator comment
donriddo Jun 10, 2026
36c3555
feat: add benchmark charts (inline mermaid summary + HTML artifact)
donriddo Jun 11, 2026
2f1801f
perf: warm up once per backend instead of per reasoning budget
donriddo Jun 11, 2026
8b28a7a
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 11, 2026
054f56e
infra: pass detected GPU name through env in desktop-stamp step
donriddo Jun 11, 2026
50a0bd2
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 11, 2026
80646a1
fix: chart one measured config per bar in the benchmark report
donriddo Jun 12, 2026
718d9a4
fix: score benchmark coverage against each run's own stamped matrix
donriddo Jun 12, 2026
7f30b14
feat: link the chart download straight from the report's chart note
donriddo Jun 12, 2026
5fd6400
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 12, 2026
f69eb8b
infra: run the desktop benchmark on the self-hosted GPU runner via pr…
donriddo Jun 12, 2026
72a6091
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 13, 2026
1b7f6b3
fix: download benchmark report artifacts via the gh CLI in summarize
donriddo Jun 14, 2026
1bedeb5
fix: grant summarize actions:read so gh run download can fetch artifacts
donriddo Jun 14, 2026
61688a5
Revert "fix: grant summarize actions:read so gh run download can fetc…
donriddo Jun 14, 2026
516baec
Revert "fix: download benchmark report artifacts via the gh CLI in su…
donriddo Jun 14, 2026
c71b080
Merge remote-tracking branch 'upstream/main' into feat/benchmark-perf…
donriddo Jun 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
437 changes: 341 additions & 96 deletions .github/workflows/benchmark-perf-llm-llamacpp.yml

Large diffs are not rendered by default.

28 changes: 26 additions & 2 deletions .github/workflows/integration-mobile-test-llm-llamacpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ on:
type: string
required: false
default: ""
job_timeout_minutes:
description: "Override the build-and-test job timeout (minutes). Default 150. Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom."
type: number
required: false
default: 150
artifact_suffix:
description: "Optional suffix appended to the perf-report artifact-name stem (before the platform segment) so multiple invocations in one run (e.g. Benchmark batches) don't collide. Default empty keeps the existing name."
type: string
required: false
default: ""
pre_build_script:
description: "Optional node script (path under packages/llm-llamacpp) run before the mobile build to bootstrap a benchmark into the framework (stage files + regenerate the test list). Default '' = no-op."
type: string
Expand Down Expand Up @@ -89,7 +99,7 @@ jobs:
name: Build ${{ matrix.platform }} and Run E2E Tests
runs-on: ${{ matrix.runner }}
environment: release
timeout-minutes: 150
timeout-minutes: ${{ inputs.job_timeout_minutes || 150 }}
continue-on-error: true
permissions:
contents: read
Expand Down Expand Up @@ -141,6 +151,20 @@ jobs:
prebuild-artifact-prefix: 'llama-cpp-'
pat-token: ${{ secrets.PAT_TOKEN }}

# The mobile perf benchmark shards (benchmark-perf-*.test.js) are not
# committed — they are generated from test/integration/_benchmark-matrix.js.
# Regenerate them after setup (which provisions Node) but before the
# bundle is built, so the Device Farm app contains them, then hard-fail if
# any are still missing. This makes it impossible to build the bundle
# without the shards present. (Runs for every LLM mobile run; for
# non-benchmark runs the generated shards are simply skipped, like any
# other mobile-only test.)
- name: Generate benchmark shards
working-directory: addon/packages/llm-llamacpp
run: |
npm run generate:benchmark-shards
node scripts/generate-benchmark-shards.js --assert-shards

# ── Benchmark bootstrap (additive, opt-in via pre_build_script) ──────────
# When a caller passes pre_build_script (only the VLM benchmark does), run the
# bootstrap: optionally pull fixture assets from the caller-supplied object-store
Expand Down Expand Up @@ -266,7 +290,7 @@ jobs:
platform: ${{ matrix.platform }}
merge: 'true'
unzip-customer-artifacts: 'true'
artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}

- name: Comment results on PR
if: always() && !cancelled()
Expand Down
3 changes: 3 additions & 0 deletions packages/llm-llamacpp/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ prebuilds/

test/unit/all.js
test/integration/all.js
# Mobile perf benchmark shards — generated from test/integration/_benchmark-matrix.js
# by scripts/generate-benchmark-shards.js. Never commit them.
test/integration/benchmark-perf-*.test.js
test/model/
test/results/
.npmrc
Expand Down
108 changes: 98 additions & 10 deletions packages/llm-llamacpp/benchmarks/performance/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Full-factorial parameter sweep for `@qvac/llm-llamacpp`, measuring TTFT, TPS, an
- [Addon Source](#addon-source)
- [Setup](#setup)
- [Quick Start](#quick-start)
- [CI Workflow (GitHub Actions)](#ci-workflow-github-actions)
- [Sweep Flags](#sweep-flags)
- [Prompt Cases](#prompt-cases)
- [Judge Pass](#judge-pass)
Expand Down Expand Up @@ -62,22 +63,106 @@ npm run run:param-sweep -- \
npm run run:judge
```

## CI Workflow (GitHub Actions)

Everything above runs locally. To run the benchmark on CI runners + AWS Device
Farm (desktop **and** mobile), use the **Benchmark Performance — LLM Parameter
Sweep** workflow (`.github/workflows/benchmark-perf-llm-llamacpp.yml`).

Trigger it from the GitHub UI: **Actions → Benchmark Performance — LLM Parameter
Sweep → Run workflow**. There is nothing to configure for a normal run — the
matrix (models, quantizations, reasoning-budget, KV-cache types, repeats) is
fixed in the scripts; edit those to change what runs.

The **mobile** sweep runs one Device Farm session per
`(size, quant, KV-cache)` combination. Those combinations live in a single
source of truth, `test/integration/_benchmark-matrix.js`. The per-combination
test files (`test/integration/benchmark-perf-*.test.js`) and the workflow's
mobile `test_groups` are derived from it and the shard files are **not
committed** — regenerate them with `npm run generate:benchmark-shards` (the CI
mobile job does this automatically before the Device Farm bundle is built, and
fails hard if any shard is missing). To change the mobile grid, edit
`_benchmark-matrix.js`, run `npm run generate:benchmark-shards` and
`npm run test:mobile:generate`, then update the workflow groups from
`node scripts/generate-benchmark-shards.js --groups` and commit
`integration.auto.cjs`. `npm run verify:benchmark-shards` checks they are all in
sync.

### Inputs

| Input | Default | Purpose |
|-------|---------|---------|
| `ref` | launch branch | Branch/tag/SHA of the benchmark code + addon to build and run |
| `run_desktop` | `true` | Run the desktop sweep (Linux GPU runner) |
| `run_mobile` | `true` | Run the mobile sweep (Android + iOS via Device Farm) |
| `summarize_only` | `false` | Re-render a previous run's report in ~1 min, skipping the ~6 h benchmarks. Needs `artifact_run_id` |
| `artifact_run_id` | — | Previous run ID to re-render (the number in that run's URL). Only with `summarize_only` |
| `compare_run_id` | — | Baseline run ID to diff against — adds Δ TTFT / TPS / ppTPS columns |

Run IDs are the number in a run's URL (`.../actions/runs/<run_id>`). You never
supply a run ID for a fresh run β€” leave them blank.

### Recipes

| Goal | Inputs |
|------|--------|
| Fresh full benchmark (desktop + mobile) | *(all blank)* |
| Desktop only | `run_mobile = false` |
| Mobile only | `run_desktop = false` |
| Benchmark a specific code version | `ref = <branch/tag/SHA>` |
| Re-render a finished run's report | `summarize_only = true`, `artifact_run_id = <run>` |
| Compare two runs (regression check) | `summarize_only = true`, `artifact_run_id = <new run>`, `compare_run_id = <baseline run>` |
| Fresh run that also diffs vs a baseline | `compare_run_id = <baseline run>` |

The comparison downloads both runs' artifacts and prints a `Δ` for every
metric, e.g. `122.37 ± 0.62 | -0.52` (current value ± stddev, then the delta vs
baseline). It works against **any** two runs.

### What the report contains

Rendered into the run summary of the `summarize` job and uploaded as the
`qwen35-benchmark-findings-<n>` artifact. One table per device, identical shape
for desktop and mobile:

- **Header** — addon version, prompt size, repeats per config (e.g.
`desktop=5, mobile=3`). The version is recorded into the run's artifacts at
benchmark time, so it is always the version that actually ran and a
comparison auto-reads each run's own version (nothing to type, nothing to get
wrong).
- **Columns** — `TTFT (ms) | TPS | ppTPS | Tokens`, each as `mean ± stddev`
across the repeats (plus `Δ` columns when comparing).
- **Desktop device** — shows the detected GPU (e.g. `Desktop (NVIDIA RTX …)`),
preserved on re-renders.
- **`Crashed`** — a configuration that crashed or produced no output on that
device (e.g. quantized KV cache on Adreno GPUs).
- **Best configuration per device** — highest TPS and highest ppTPS.

> Note: the table shape is identical across desktop and mobile, but the
> generation length differs — desktop caps at `n-predict` 1024 tokens, mobile
> at 512. The rate metrics (TPS, ppTPS) stay comparable; the `Tokens` column
> and absolute TTFT reflect those different caps.

## Sweep Flags

All sweep dimensions accept comma-separated values for full-factorial grid.

Defaults below are the focused set currently pinned in
`llm-parameter-sweep.config.js` (`PARAMETER_SWEEP`). Pass a flag with
comma-separated values to widen any dimension into the full grid.

| Flag | Type | Default | Description |
|------|------|---------|-------------|
| `--models` | `str` | All in manifest | Comma-separated model IDs |
| `--quantization` | `str` | `Q4_0,Q4_K_M,Q8_0,F16` | Quantization levels |
| `--device` | `str` | `gpu` | `gpu`, `cpu` |
| `--quantization` | `str` | `Q4_0,Q4_1,Q4_K_M,Q6_K,Q8_0` | Quantization levels |
| `--reasoning-budget` | `str` | `-1,0` | Reasoning budget values |
| `--device` | `str` | `gpu` (desktop) | `gpu`, `cpu` |
| `--ctx-size` | `str` | `2048` | Context sizes |
| `--batch-size` | `str` | `512,2048` | Batch sizes |
| `--ubatch-size` | `str` | `128,512` | Micro-batch sizes (must be <= batch-size) |
| `--threads` | `str` | `2,4,8` | Thread counts |
| `--flash-attn` | `str` | `off,on` | Flash attention |
| `--cache-type-k` | `str` | `f16,q8_0,q4_0` | KV cache key type |
| `--cache-type-v` | `str` | `f16,q8_0,q4_0` | KV cache value type |
| `--batch-size` | `str` | `512` | Batch sizes |
| `--ubatch-size` | `str` | `512` | Micro-batch sizes (must be <= batch-size) |
| `--threads` | `str` | `4` | Thread counts |
| `--flash-attn` | `str` | `off` | Flash attention |
| `--cache-type-k` | `str` | `f16` | KV cache key type |
| `--cache-type-v` | `str` | `f16` | KV cache value type |
| `--repeats` | `int` | `5` | Repeats per case |
| `--results-dir` | `str` | `results/parameter-sweep/` | Output directory |
| `--prompts-file` | `str` | `test-prompts.json` | Prompts file path |
Expand All @@ -86,11 +171,14 @@ All sweep dimensions accept comma-separated values for full-factorial grid.

## Prompt Cases

Each parameter combination runs three prompt cases:
The sweep currently runs a single prompt case, `long` (the focused ~512-token
benchmark prompt) — `PROMPT_CASES = ['long']` in `case-runner.js`. The
`ctx-filling` / `span-fill` fixtures below still exist in `test-prompts.json`
and can be re-enabled by extending `PROMPT_CASES`.

| Case | Description | Prompt Selection |
|------|-------------|-----------------|
| `long` | Long-output generation | Static `long` prompt |
| `long` | Long-output generation (active) | Static `long` prompt |
| `ctx-filling` | Maximizes context fill | `ctx-filling__ctx={ctx-size}` |
| `span-fill` | Spans multiple prefill batches | `batch-spanning__ctx={ctx-size}__bs={batch-size}` |

Expand Down
21 changes: 16 additions & 5 deletions packages/llm-llamacpp/benchmarks/performance/case-runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ const path = require('bare-path')
const { round, average, stddev, cartesianProduct } = require('./math')
const { stripSurroundingQuotes, normalizeArgValue } = require('./utils')

const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill']
// The focused sweep uses a single ~512-token prompt. Add 'ctx-filling' /
// 'span-fill' back to also sweep context-fill and batch-spanning prompts.
const PROMPT_CASES = ['long']
const PROMPTS_PER_CASE = 1

const SWEEP_OVERRIDE_KEYS = [
Expand All @@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [
'ubatch-size',
'flash-attn',
'cache-type-k',
'cache-type-v'
'cache-type-v',
'reasoning-budget'
]

function splitCsvArg (value, key) {
Expand Down Expand Up @@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) {
const threadsValues = sweep.threads || []
const cacheTypeKValues = sweep['cache-type-k'] || []
const cacheTypeVValues = sweep['cache-type-v'] || []
const reasoningBudgetValues = sweep['reasoning-budget'] || []

const cases = []
for (const promptCase of PROMPT_CASES) {
Expand All @@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) {
if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 &&
flashAttnValues.length > 0 &&
threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) {
const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null]
const combos = cartesianProduct([
supportedQuants,
devices,
Expand All @@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) {
flashAttnValues,
threadsValues,
cacheTypeKValues,
cacheTypeVValues
cacheTypeVValues,
rbValues
])

for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) {
for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) {
if (Number(ubatchSize) > Number(batchSize)) {
continue // Skip combinations where ubatchSize is greater than batchSize
}
Expand All @@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) {
'cache-type-k': cacheTypeK,
'cache-type-v': cacheTypeV
}
if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget

const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}`
const rbSuffix = reasoningBudget !== null ? `__rb=${reasoningBudget}` : ''
const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}`

for (const promptCase of PROMPT_CASES) {
cases.push({
Expand Down Expand Up @@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) {
const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null)
const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null)
const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null)
const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null)
const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null
const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null

Expand All @@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) {
ttftMsStd: round(stddev(ttftMsValues), 3),
tpsMean: round(average(tpsValues), 3),
tpsStd: round(stddev(tpsValues), 3),
ppTpsMean: round(average(ppTpsValues), 3),
ppTpsStd: round(stddev(ppTpsValues), 3),
promptTokens: firstPromptTokens,
generatedTokens: firstGeneratedTokens
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@
const fs = require('bare-fs')
const path = require('bare-path')
const os = require('bare-os')
const {
DEFAULT_SWEEP_CTX_SIZES,
DEFAULT_SWEEP_BATCH_SIZES
} = require('./utils')

const DEFAULT_RESULTS_DIR = path.resolve(__dirname, 'results', 'parameter-sweep')
const DEFAULT_MODELS_DIR = path.resolve(__dirname, 'models')
Expand Down Expand Up @@ -94,17 +90,20 @@ function loadModelsFromManifest () {

const MODELS = loadModelsFromManifest()

// Parameter sweep: full factorial (cartesian product)
// Parameter sweep (cartesian product). Tuned to the focused sweep:
// only quantization and reasoning-budget vary; every other dimension is
// pinned to a single value. Edit these arrays to sweep more dimensions.
const PARAMETER_SWEEP = {
quantization: ['Q4_0', 'Q4_K_M', 'Q8_0', 'F16'],
quantization: ['Q4_0', 'Q4_1', 'Q4_K_M', 'Q6_K', 'Q8_0'],
device: getDefaultSweepDevices(),
'ctx-size': DEFAULT_SWEEP_CTX_SIZES.map(String),
threads: ['2', '4', '8'],
'batch-size': DEFAULT_SWEEP_BATCH_SIZES.map(String), // max: 10k
'ubatch-size': ['128', '512'], // must be <= batch-size
'flash-attn': ['off', 'on'],
'cache-type-k': ['f16', 'q8_0', 'q4_0'],
'cache-type-v': ['f16', 'q8_0', 'q4_0']
'ctx-size': ['2048'],
threads: ['4'],
'batch-size': ['512'],
'ubatch-size': ['512'],
'flash-attn': ['off'],
'cache-type-k': ['f16'],
'cache-type-v': ['f16'],
'reasoning-budget': ['-1', '0']
// verbosity: fixed at '0' (not swept)
}

Expand Down
Loading
Loading