Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 171 additions & 40 deletions .github/workflows/benchmark-perf-llm-llamacpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ name: Benchmark Performance — LLM Parameter Sweep
#
# To change what runs, edit:
# desktop: models.manifest.json (models) + llm-parameter-sweep.config.js (sweep dims)
# mobile: mobile.config.json
# mobile: test/integration/_benchmark-perf.js (shared runner) + the
# benchmark-perf-*.test.js shards (one per model x KV-cache type)

on:
workflow_dispatch:
Expand All @@ -25,6 +26,19 @@ on:
required: false
default: true
type: boolean
# When true, the desktop and mobile benchmark jobs are skipped (their `if:`
# conditions include `!inputs.summarize_only`).
summarize_only:
description: "Re-render report from a previous run's artifacts (skips benchmarks)"
required: false
default: false
type: boolean
# Run number the summarize job uses to build the artifact names it downloads;
# when empty, the current run number is used instead.
artifact_run_number:
description: "Run number to pull artifacts from when summarize_only=true (e.g. 9)"
required: false
type: string
# A non-empty value enables the baseline download steps in the summarize job.
compare_run_number:
description: "Run number of a baseline run to diff against (shows Δ TPS / Δ TTFT columns)"
required: false
type: string

permissions:
contents: read
Expand Down Expand Up @@ -90,7 +104,7 @@ jobs:
needs:
- context
- label-gate
if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop
if: needs.label-gate.outputs.authorised == 'true' && inputs.run_desktop && !inputs.summarize_only
name: Desktop Parameter Sweep
runs-on: ai-run-linux-gpu
timeout-minutes: 360
Expand Down Expand Up @@ -123,6 +137,12 @@ jobs:
- name: Setup LLVM
uses: tetherto/qvac/.github/actions/setup-llvm@98a6a6b6e8f3866dfdd75052a4071269ce85dc41

# Installs the Vulkan SDK through the tetherto/qvac setup-vulkan-sdk action
# (pinned to a commit SHA) for linux/x64, ahead of the addon build step.
- name: Setup Vulkan SDK
uses: tetherto/qvac/.github/actions/setup-vulkan-sdk@0bbdca93da303a0b1634ba14a89cec085621078d
with:
platform: linux
arch: x64

- name: Build addon from source
working-directory: packages/llm-llamacpp
run: |
Expand Down Expand Up @@ -157,21 +177,6 @@ jobs:
sudo apt-get update
sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils

echo "Installing Vulkan SDK (latest)..."
wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
mkdir -p "$HOME/vulkan" && cd "$HOME/vulkan"
tar xf /tmp/vulkansdk.tar.xz --strip-components=1
export VULKAN_SDK="$HOME/vulkan/x86_64"
export PATH="$VULKAN_SDK/bin:$PATH"
export LD_LIBRARY_PATH="$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export VK_ADD_LAYER_PATH="$VULKAN_SDK/share/vulkan/explicit_layer.d"
export PKG_CONFIG_PATH="$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV
echo "PATH=$PATH" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV
echo "VK_ADD_LAYER_PATH=$VK_ADD_LAYER_PATH" >> $GITHUB_ENV
echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH" >> $GITHUB_ENV

cd "$GITHUB_WORKSPACE/packages/llm-llamacpp"
npm install

Expand All @@ -186,6 +191,13 @@ jobs:

echo "=== Build complete ==="

# Records the first GPU name reported by nvidia-smi as the step output `name`
# (falls back to the literal "GPU" when nothing can be detected).
- name: Detect GPU
  id: gpu
  shell: bash
  run: |
    # `shell: bash` runs with `-eo pipefail`, so a failing pipeline must not
    # append fallback text to output that was already captured: a two-line
    # value would corrupt $GITHUB_OUTPUT. Swallow the failure here and apply
    # the fallback on the (possibly empty) captured value instead.
    gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '\r' || true)
    echo "name=${gpu_name:-GPU}" >> "$GITHUB_OUTPUT"

- name: Install benchmark dependencies
working-directory: packages/llm-llamacpp/benchmarks/performance
run: npm install
Expand All @@ -199,24 +211,8 @@ jobs:
working-directory: packages/llm-llamacpp/benchmarks/performance
run: bare ./llm-parameter-sweep.js --addon-source local

- name: Add job summary
if: always()
working-directory: packages/llm-llamacpp/benchmarks/performance
shell: bash
run: |
LATEST_MD=$(find results/parameter-sweep -name "*.md" -type f 2>/dev/null | sort | tail -1)
{
echo "## LLM Parameter Sweep — Desktop"
echo ""
echo "ref: \`${{ needs.context.outputs.ref }}\`"
echo ""
if [ -n "${LATEST_MD:-}" ]; then
cat "$LATEST_MD"
else
echo "No results file found."
fi
} >> "$GITHUB_STEP_SUMMARY"

# The run summary is rendered by the summarize job (unified desktop +
# mobile view); this job just uploads the raw sweep results.
- name: Upload results
if: always()
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
Expand All @@ -225,24 +221,159 @@ jobs:
path: packages/llm-llamacpp/benchmarks/performance/results/parameter-sweep/
retention-days: 90
if-no-files-found: ignore
env:
DESKTOP_GPU: ${{ steps.gpu.outputs.name }}

# Mobile is sharded one group per (model x KV-cache type) = 30 groups so each
# Device Farm session finishes inside the fixed 20-minute iOS per-test ceiling.
# All 30 in one reused-workflow call do NOT fit: Android serializes the runs
# against its device pool (>240 min) and the macOS runner fills its disk
# collecting 30 runs' logs. So we split the groups into three batches by
# KV-cache type (10 each — the proven in-budget load) and run them
# sequentially (max-parallel: 1) to avoid Device Farm pool contention. Each
# batch raises the job timeout to 180 for headroom (proven 10-shard wall ~119
# min) and gets a distinct artifact_suffix so its perf-report doesn't collide;
# summarize aggregates all three. These wrappers are deliberately absent from
# the addon's test-groups.json, so this override is the only path that runs
# them — normal mobile integration runs never trigger the benchmark.
mobile-benchmark:
needs:
- context
- prebuild
- label-gate
if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile
if: needs.label-gate.outputs.authorised == 'true' && inputs.run_mobile && !inputs.summarize_only
permissions:
contents: read
packages: read
pull-requests: write
id-token: write
strategy:
fail-fast: false
max-parallel: 1
matrix:
include:
- cache: f16
groups: '[{"name":"BenchmarkPerf08bQ40F16","grep":"runBenchmarkPerf08bQ40F16Test"},{"name":"BenchmarkPerf08bQ41F16","grep":"runBenchmarkPerf08bQ41F16Test"},{"name":"BenchmarkPerf08bQ4KMF16","grep":"runBenchmarkPerf08bQ4KMF16Test"},{"name":"BenchmarkPerf08bQ6KF16","grep":"runBenchmarkPerf08bQ6KF16Test"},{"name":"BenchmarkPerf08bQ80F16","grep":"runBenchmarkPerf08bQ80F16Test"},{"name":"BenchmarkPerf2bQ40F16","grep":"runBenchmarkPerf2bQ40F16Test"},{"name":"BenchmarkPerf2bQ41F16","grep":"runBenchmarkPerf2bQ41F16Test"},{"name":"BenchmarkPerf2bQ4KMF16","grep":"runBenchmarkPerf2bQ4KMF16Test"},{"name":"BenchmarkPerf2bQ6KF16","grep":"runBenchmarkPerf2bQ6KF16Test"},{"name":"BenchmarkPerf2bQ80F16","grep":"runBenchmarkPerf2bQ80F16Test"}]'
- cache: q8_0
groups: '[{"name":"BenchmarkPerf08bQ40Q80","grep":"runBenchmarkPerf08bQ40Q80Test"},{"name":"BenchmarkPerf08bQ41Q80","grep":"runBenchmarkPerf08bQ41Q80Test"},{"name":"BenchmarkPerf08bQ4KMQ80","grep":"runBenchmarkPerf08bQ4KMQ80Test"},{"name":"BenchmarkPerf08bQ6KQ80","grep":"runBenchmarkPerf08bQ6KQ80Test"},{"name":"BenchmarkPerf08bQ80Q80","grep":"runBenchmarkPerf08bQ80Q80Test"},{"name":"BenchmarkPerf2bQ40Q80","grep":"runBenchmarkPerf2bQ40Q80Test"},{"name":"BenchmarkPerf2bQ41Q80","grep":"runBenchmarkPerf2bQ41Q80Test"},{"name":"BenchmarkPerf2bQ4KMQ80","grep":"runBenchmarkPerf2bQ4KMQ80Test"},{"name":"BenchmarkPerf2bQ6KQ80","grep":"runBenchmarkPerf2bQ6KQ80Test"},{"name":"BenchmarkPerf2bQ80Q80","grep":"runBenchmarkPerf2bQ80Q80Test"}]'
- cache: q4_0
groups: '[{"name":"BenchmarkPerf08bQ40Q40","grep":"runBenchmarkPerf08bQ40Q40Test"},{"name":"BenchmarkPerf08bQ41Q40","grep":"runBenchmarkPerf08bQ41Q40Test"},{"name":"BenchmarkPerf08bQ4KMQ40","grep":"runBenchmarkPerf08bQ4KMQ40Test"},{"name":"BenchmarkPerf08bQ6KQ40","grep":"runBenchmarkPerf08bQ6KQ40Test"},{"name":"BenchmarkPerf08bQ80Q40","grep":"runBenchmarkPerf08bQ80Q40Test"},{"name":"BenchmarkPerf2bQ40Q40","grep":"runBenchmarkPerf2bQ40Q40Test"},{"name":"BenchmarkPerf2bQ41Q40","grep":"runBenchmarkPerf2bQ41Q40Test"},{"name":"BenchmarkPerf2bQ4KMQ40","grep":"runBenchmarkPerf2bQ4KMQ40Test"},{"name":"BenchmarkPerf2bQ6KQ40","grep":"runBenchmarkPerf2bQ6KQ40Test"},{"name":"BenchmarkPerf2bQ80Q40","grep":"runBenchmarkPerf2bQ80Q40Test"}]'
uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
secrets: inherit
with:
repository: ${{ github.repository }}
ref: ${{ needs.context.outputs.ref }}
# Schedule only the benchmark group. runBenchmarkPerfTest is deliberately
# absent from the addon's test-groups.json, so this override is the only
# path that runs it — normal mobile integration runs never trigger it.
test_groups: '[{"name":"benchmarkPerf","grep":"runBenchmarkPerfTest"}]'
job_timeout_minutes: 180
test_groups: ${{ matrix.groups }}
artifact_suffix: ${{ matrix.cache }}-

# Aggregates desktop + mobile artifacts into one unified markdown report.
# Runs after benchmarks finish, or standalone when summarize_only=true
# (pass artifact_run_number to pull results from a previous run).
# Pass compare_run_number to show Δ TPS / Δ TTFT regression columns.
summarize:
  # Renders one markdown report from the desktop sweep artifact and the mobile
  # perf-report artifacts, appends it to the run summary and uploads it.
  # Artifacts come from the current run, or from the run named by
  # artifact_run_number when summarize_only=true. A baseline run
  # (compare_run_number) is downloaded into baseline-reports for comparison.
  needs:
    - context
    - label-gate
    - desktop-benchmark
    - mobile-benchmark
  if: needs.label-gate.outputs.authorised == 'true' && always() && needs.context.result == 'success'
  runs-on: ubuntu-latest
  timeout-minutes: 10
  permissions:
    contents: read
    # Needed to look up other workflow runs and to download their artifacts.
    actions: read
  env:
    # Run number embedded in the artifact names to match. artifact_run_number
    # only applies when summarize_only=true; otherwise use the current run.
    ARTIFACT_RUN: ${{ inputs.summarize_only && inputs.artifact_run_number || github.run_number }}
  steps:
    # download-artifact only reads the current run unless it is given a run id
    # and a token, and the inputs carry run *numbers*, so map them to run ids.
    - name: Resolve artifact source runs
      id: runs
      shell: bash
      env:
        GH_TOKEN: ${{ github.token }}
        SOURCE_RUN_NUMBER: ${{ inputs.summarize_only && inputs.artifact_run_number || '' }}
        COMPARE_RUN_NUMBER: ${{ inputs.compare_run_number }}
      run: |
        workflow_id=$(gh api "repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" --jq '.workflow_id')

        # Prints the run id for run number $1 of this workflow, or nothing
        # when the number is malformed or no such run exists.
        resolve_run_id() {
          local number="$1" ids
          if ! [[ "$number" =~ ^[0-9]+$ ]]; then
            echo "::warning::Ignoring non-numeric run number '${number}'" >&2
            return 0
          fi
          ids=$(gh api --paginate \
            "repos/${GITHUB_REPOSITORY}/actions/workflows/${workflow_id}/runs?per_page=100" \
            --jq ".workflow_runs[] | select(.run_number == ${number}) | .id")
          echo "${ids%%$'\n'*}"
        }

        artifact_run_id="$GITHUB_RUN_ID"
        if [ -n "$SOURCE_RUN_NUMBER" ]; then
          artifact_run_id=$(resolve_run_id "$SOURCE_RUN_NUMBER")
          if [ -z "$artifact_run_id" ]; then
            echo "::error::No run #${SOURCE_RUN_NUMBER} of this workflow was found"
            exit 1
          fi
        fi

        # The baseline is optional: a missing run only drops the comparison.
        compare_run_id=""
        if [ -n "$COMPARE_RUN_NUMBER" ]; then
          compare_run_id=$(resolve_run_id "$COMPARE_RUN_NUMBER")
          if [ -z "$compare_run_id" ]; then
            echo "::warning::No baseline run #${COMPARE_RUN_NUMBER} found; comparison will be skipped"
          fi
        fi

        echo "artifact_run_id=$artifact_run_id" >> "$GITHUB_OUTPUT"
        echo "compare_run_id=$compare_run_id" >> "$GITHUB_OUTPUT"

    - name: Checkout aggregator
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
      with:
        ref: ${{ needs.context.outputs.ref }}
        sparse-checkout: |
          packages/llm-llamacpp/benchmarks/performance/render-report.js

    - name: Setup Node.js
      uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
      with:
        node-version: lts/*

    # Emits "@qvac/llm-llamacpp@<version>", or an empty value when the
    # package.json cannot be read.
    - name: Get addon version
      id: addon_ver
      shell: bash
      run: |
        ver=$(node -e "process.stdout.write(require('./packages/llm-llamacpp/package.json').version)" 2>/dev/null || true)
        echo "version=${ver:+@qvac/llm-llamacpp@$ver}" >> "$GITHUB_OUTPUT"

    # Downloads are best-effort: a missing artifact must not fail the job.
    - name: Download desktop sweep artifact
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
      with:
        pattern: llm-param-sweep-desktop-${{ env.ARTIFACT_RUN }}
        path: combined-reports
        run-id: ${{ steps.runs.outputs.artifact_run_id }}
        github-token: ${{ github.token }}
      continue-on-error: true

    - name: Download mobile perf-report artifacts
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
      with:
        pattern: perf-report-llamacpp-llm-*-${{ env.ARTIFACT_RUN }}
        path: combined-reports
        run-id: ${{ steps.runs.outputs.artifact_run_id }}
        github-token: ${{ github.token }}
      continue-on-error: true

    - name: Download baseline artifacts for comparison
      if: steps.runs.outputs.compare_run_id != ''
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
      with:
        pattern: llm-param-sweep-desktop-${{ inputs.compare_run_number }}
        path: baseline-reports
        run-id: ${{ steps.runs.outputs.compare_run_id }}
        github-token: ${{ github.token }}
      continue-on-error: true

    - name: Download baseline mobile artifacts for comparison
      if: steps.runs.outputs.compare_run_id != ''
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
      with:
        pattern: perf-report-llamacpp-llm-*-${{ inputs.compare_run_number }}
        path: baseline-reports
        run-id: ${{ steps.runs.outputs.compare_run_id }}
        github-token: ${{ github.token }}
      continue-on-error: true

    - name: Render unified benchmark report
      shell: bash
      env:
        # Passed through the environment rather than interpolated into the
        # script text, so the value can never be parsed as shell syntax.
        ADDON_VERSION: ${{ steps.addon_ver.outputs.version }}
      run: |
        # True when directory $1 holds at least one *.json file. Avoids
        # `find | grep -q`, which under pipefail can fail spuriously when
        # grep exits early and find is killed by SIGPIPE.
        has_json() {
          [ -n "$(find "$1" -name '*.json' -type f -print -quit 2>/dev/null)" ]
        }

        if ! has_json combined-reports; then
          echo "No benchmark reports found."
          exit 0
        fi
        mkdir -p benchmark-artifacts

        extra_args=()
        if [ -n "$ADDON_VERSION" ]; then
          extra_args+=(--addon-version "$ADDON_VERSION")
        fi
        if has_json baseline-reports; then
          extra_args+=(--compare-dir baseline-reports)
        fi

        node packages/llm-llamacpp/benchmarks/performance/render-report.js \
          --dir combined-reports \
          --output benchmark-artifacts/qwen35-benchmark-findings.md \
          "${extra_args[@]}"

    - name: Add to run summary
      if: always()
      shell: bash
      run: |
        set +e
        MD_FILE="benchmark-artifacts/qwen35-benchmark-findings.md"
        if [ -f "$MD_FILE" ]; then
          cat "$MD_FILE" >> "$GITHUB_STEP_SUMMARY"
        else
          echo "No consolidated benchmark report available." >> "$GITHUB_STEP_SUMMARY"
        fi

    - name: Upload consolidated report
      if: always()
      uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
      with:
        name: qwen35-benchmark-findings-${{ github.run_number }}
        path: benchmark-artifacts/
        retention-days: 90
        if-no-files-found: ignore
14 changes: 12 additions & 2 deletions .github/workflows/integration-mobile-test-llm-llamacpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ on:
type: string
required: false
default: ""
# Feeds the build-and-test job's `timeout-minutes` (falls back to 120 there).
job_timeout_minutes:
description: "Override the build-and-test job timeout (minutes). Default 120. Raised by Benchmark Performance (LLM) where the sharded matrix needs more headroom."
type: number
required: false
default: 120
# Despite the name, the value is inserted *before* the platform segment of the
# perf-report artifact name, so callers pass it with a trailing separator.
artifact_suffix:
description: "Optional prefix inserted into the perf-report artifact name so multiple invocations in one run (e.g. Benchmark batches) don't collide. Default empty keeps the existing name."
type: string
required: false
default: ""
workflow_dispatch:
inputs:
ref:
Expand Down Expand Up @@ -74,7 +84,7 @@ jobs:
name: Build ${{ matrix.platform }} and Run E2E Tests
runs-on: ${{ matrix.runner }}
environment: release
timeout-minutes: 120
timeout-minutes: ${{ inputs.job_timeout_minutes || 120 }}
continue-on-error: true
permissions:
contents: read
Expand Down Expand Up @@ -217,7 +227,7 @@ jobs:
platform: ${{ matrix.platform }}
merge: 'true'
unzip-customer-artifacts: 'true'
artifact-name: perf-report-llamacpp-llm-${{ matrix.platform }}-${{ github.run_number }}
artifact-name: perf-report-llamacpp-llm-${{ inputs.artifact_suffix }}${{ matrix.platform }}-${{ github.run_number }}

- name: Comment results on PR
if: always() && !cancelled()
Expand Down
21 changes: 16 additions & 5 deletions packages/llm-llamacpp/benchmarks/performance/case-runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ const path = require('bare-path')
const { round, average, stddev, cartesianProduct } = require('./math')
const { stripSurroundingQuotes, normalizeArgValue } = require('./utils')

const PROMPT_CASES = ['long', 'ctx-filling', 'span-fill']
// Focused WB run uses a single ~512-token prompt. Add 'ctx-filling' /
// 'span-fill' back to also sweep context-fill and batch-spanning prompts.
const PROMPT_CASES = ['long']
const PROMPTS_PER_CASE = 1

const SWEEP_OVERRIDE_KEYS = [
Expand All @@ -17,7 +19,8 @@ const SWEEP_OVERRIDE_KEYS = [
'ubatch-size',
'flash-attn',
'cache-type-k',
'cache-type-v'
'cache-type-v',
'reasoning-budget'
]

function splitCsvArg (value, key) {
Expand Down Expand Up @@ -83,6 +86,7 @@ function buildCases (modelDef, sweep) {
const threadsValues = sweep.threads || []
const cacheTypeKValues = sweep['cache-type-k'] || []
const cacheTypeVValues = sweep['cache-type-v'] || []
const reasoningBudgetValues = sweep['reasoning-budget'] || []

const cases = []
for (const promptCase of PROMPT_CASES) {
Expand All @@ -101,6 +105,7 @@ function buildCases (modelDef, sweep) {
if (devices.length > 0 && ctxSizes.length > 0 && batchSizes.length > 0 && ubatchSizes.length > 0 &&
flashAttnValues.length > 0 &&
threadsValues.length > 0 && cacheTypeKValues.length > 0 && cacheTypeVValues.length > 0) {
const rbValues = reasoningBudgetValues.length > 0 ? reasoningBudgetValues : [null]
const combos = cartesianProduct([
supportedQuants,
devices,
Expand All @@ -110,10 +115,11 @@ function buildCases (modelDef, sweep) {
flashAttnValues,
threadsValues,
cacheTypeKValues,
cacheTypeVValues
cacheTypeVValues,
rbValues
])

for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV] of combos) {
for (const [quantization, device, ctxSize, batchSize, ubatchSize, flashAttn, threads, cacheTypeK, cacheTypeV, reasoningBudget] of combos) {
if (Number(ubatchSize) > Number(batchSize)) {
continue // Skip combinations where ubatchSize is greater than batchSize
}
Expand All @@ -128,8 +134,10 @@ function buildCases (modelDef, sweep) {
'cache-type-k': cacheTypeK,
'cache-type-v': cacheTypeV
}
if (reasoningBudget !== null) runtimeConfig['reasoning-budget'] = reasoningBudget

const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}`
const rbSuffix = reasoningBudget !== null ? `__rb=${reasoningBudget}` : ''
const caseId = `${modelDef.id}__q=${quantization}__dev=${device}__ctx=${ctxSize}__bs=${batchSize}__ubs=${ubatchSize}__fa=${flashAttn}__t=${threads}__ck=${cacheTypeK}__cv=${cacheTypeV}${rbSuffix}`

for (const promptCase of PROMPT_CASES) {
cases.push({
Expand Down Expand Up @@ -207,6 +215,7 @@ function aggregateRunMetrics (runMetrics) {
const unloadMsValues = runMetrics.map((x) => x.unloadMs).filter((x) => x != null)
const ttftMsValues = runMetrics.map((x) => x.ttftMs).filter((x) => x != null)
const tpsValues = runMetrics.map((x) => x.tps).filter((x) => x != null)
const ppTpsValues = runMetrics.map((x) => x.ppTps).filter((x) => x != null)
const firstPromptTokens = runMetrics.find((x) => x.promptTokens != null)?.promptTokens ?? null
const firstGeneratedTokens = runMetrics.find((x) => x.generatedTokens != null)?.generatedTokens ?? null

Expand All @@ -222,6 +231,8 @@ function aggregateRunMetrics (runMetrics) {
ttftMsStd: round(stddev(ttftMsValues), 3),
tpsMean: round(average(tpsValues), 3),
tpsStd: round(stddev(tpsValues), 3),
ppTpsMean: round(average(ppTpsValues), 3),
ppTpsStd: round(stddev(ppTpsValues), 3),
promptTokens: firstPromptTokens,
generatedTokens: firstGeneratedTokens
}
Expand Down
Loading
Loading