From c9fe74673ae3561e46871c982bc80a1f3bc34fd8 Mon Sep 17 00:00:00 2001 From: Oluwatobi Adelegan Date: Thu, 30 Apr 2026 18:21:51 +0100 Subject: [PATCH 1/5] QVAC-18111 infra[notask]: scaffold Benchmark Performance (LLM) workflow_dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub requires a `workflow_dispatch` workflow to exist on the default branch before it shows up in the Actions tab and becomes triggerable with `--ref <branch>`. This lands the LLM benchmark workflow on `main` so the QVAC-17830 perf-metrics feature branch can be dispatched against it for end-to-end validation. Changes: - `benchmark-performance-qvac-lib-infer-llamacpp-llm.yml` (new): manual `workflow_dispatch` only — mirrors the structure of the existing Parakeet / Whispercpp benchmark workflows. Calls `prebuilds-...yml` then `integration-test-...yml` with bench-mode iteration counts (`QVAC_PERF_RUNS=3`, `QVAC_PERF_WARMUP_RUNS=1` by default), then aggregates desktop artifacts into a combined HTML / step-summary. Phase-1 scope is desktop only — mobile (Device Farm) needs a build-time hook in the test app to thread env vars through to bare and is tracked as a QVAC-18111 follow-up. - `integration-test-qvac-lib-infer-llamacpp-llm.yml`: thread `qvac_perf_runs` / `qvac_perf_warmup_runs` through `workflow_call` + `workflow_dispatch` and surface them as `QVAC_PERF_RUNS` / `QVAC_PERF_WARMUP_RUNS` on the Linux/macOS and Windows test run steps. Empty string => unset, so the umbrella PR workflow continues to honour the test-side default and PR runs are unaffected by this change. Per the perf policy agreed on Slack (2026-04-30): the umbrella on-pr workflow runs perf tests at the cheap default so we don't pay full perf cost on every PR; this dedicated workflow is the only place we crank up the iteration counts to produce mean ± std numbers. 
Made-with: Cursor --- ...erformance-qvac-lib-infer-llamacpp-llm.yml | 204 ++++++++++++++++++ ...ation-test-qvac-lib-infer-llamacpp-llm.yml | 28 +++ 2 files changed, 232 insertions(+) create mode 100644 .github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml new file mode 100644 index 0000000000..7f3bb956e1 --- /dev/null +++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml @@ -0,0 +1,204 @@ +name: Benchmark Performance (LLM) + +# QVAC-18111: dedicated benchmarking workflow for the LLM addon — +# manually triggered only. +# +# Per the perf policy agreed on Slack (2026-04-30, @Olya / @Gianfranco): +# the umbrella PR workflow runs perf tests at the cheap default +# (1 warmup + 1 counted, no averaging) so we don't pay full perf +# cost on every PR. This workflow is the only place we crank +# QVAC_PERF_RUNS up to produce mean ± std numbers. +# +# Phase-1 scope: desktop matrix only. Mobile (Android / iOS Device +# Farm) needs a build-time hook in the test app to pass env vars +# through to bare — tracked as a QVAC-18111 follow-up. Mobile rows +# in PR runs continue to use the cheap 1+1 default. +# +# Mirrors the structure of the existing `Benchmark Performance +# (Parakeet)` and `Benchmark Performance (Whispercpp)` workflows on +# main: a `context` job derives repo/ref from optional inputs, then +# dispatches `prebuilds-...yml` followed by `integration-test-...yml` +# with the bench-mode iteration counts, and a `summarize` job +# aggregates the artifacts into a single combined HTML + GitHub +# step summary. 
+ +on: + workflow_dispatch: + inputs: + repository: + description: "Repository to benchmark" + required: false + type: string + ref: + description: "Git ref (branch/tag/SHA) to benchmark" + required: false + type: string + qvac_perf_runs: + description: "QVAC_PERF_RUNS — counted iterations per perf test" + required: false + type: string + default: "3" + qvac_perf_warmup_runs: + description: "QVAC_PERF_WARMUP_RUNS — warmup iterations per perf test" + required: false + type: string + default: "1" + +permissions: + contents: read + packages: read + id-token: write + +jobs: + context: + runs-on: ubuntu-latest + outputs: + repository: ${{ steps.ctx.outputs.repository }} + ref: ${{ steps.ctx.outputs.ref }} + steps: + - id: ctx + shell: bash + env: + INPUT_REPO: ${{ inputs.repository }} + INPUT_REF: ${{ inputs.ref }} + REPO: ${{ github.repository }} + REF_NAME: ${{ github.ref_name }} + run: | + repo="${INPUT_REPO:-$REPO}" + ref="${INPUT_REF:-$REF_NAME}" + echo "repository=$repo" >> "$GITHUB_OUTPUT" + echo "ref=$ref" >> "$GITHUB_OUTPUT" + + prebuild: + needs: context + permissions: + contents: write + packages: write + pull-requests: write + id-token: write + uses: ./.github/workflows/prebuilds-qvac-lib-infer-llamacpp-llm.yml + secrets: inherit + with: + repository: ${{ needs.context.outputs.repository }} + ref: ${{ needs.context.outputs.ref }} + + desktop-benchmarks: + needs: [context, prebuild] + permissions: + contents: read + packages: read + id-token: write + uses: ./.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml + secrets: inherit + with: + repository: ${{ needs.context.outputs.repository }} + ref: ${{ needs.context.outputs.ref }} + qvac_perf_runs: ${{ inputs.qvac_perf_runs }} + qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }} + + summarize: + needs: [context, desktop-benchmarks] + if: always() + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + steps: + - name: Checkout repository + uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + with: + repository: ${{ needs.context.outputs.repository }} + ref: ${{ needs.context.outputs.ref }} + token: ${{ secrets.PAT_TOKEN }} + sparse-checkout: | + scripts/perf-report + packages/qvac-lib-infer-llamacpp-llm/media + + - name: Setup Node.js + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # 4.4.0 + with: + node-version: lts/* + + - name: Download all perf report artifacts + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 + with: + pattern: perf-report-llamacpp-llm-*-${{ github.run_number }} + path: combined-reports + continue-on-error: true + + - name: Fix desktop device names + shell: bash + run: | + # Same fold as the umbrella combine-perf-reports step: + # sibling matrix legs (linux-x64-cpu+linux-x64-gpu, + # linux-arm64-u22+linux-arm64-u24) collapse onto one device + # name so [CPU]/[GPU] rows sit in the same column. + for dir in combined-reports/perf-report-llamacpp-llm-*/; do + [ -d "$dir" ] || continue + base=$(basename "$dir") + platform=$(echo "$base" | sed "s/^perf-report-llamacpp-llm-//" | sed "s/-${{ github.run_number }}$//") + + case "$platform" in Android|iOS) continue ;; esac + + case "$platform" in + linux-x64-cpu|linux-x64-gpu) device_name="linux-x64" ;; + linux-arm64-u22|linux-arm64-u24) device_name="linux-arm64" ;; + *) device_name="$platform" ;; + esac + + for json in $(find "$dir" -name "performance-report.json" 2>/dev/null); do + if command -v jq >/dev/null 2>&1; then + jq --arg name "$device_name" '.device.name = $name' "$json" > "${json}.tmp" && mv "${json}.tmp" "$json" + echo "Patched device name in $json -> $device_name (was matrix label $platform)" + fi + done + done + + - name: Generate consolidated benchmark report + run: | + if ! find combined-reports -name "performance-report.json" -type f 2>/dev/null | grep -q .; then + echo "No performance reports found." 
+ exit 0 + fi + + mkdir -p benchmark-artifacts + + node scripts/perf-report/aggregate.js \ + --dir combined-reports \ + --addon-type vision \ + --device-details \ + --output-html benchmark-artifacts/llamacpp-llm-performance-findings.html \ + --output-json benchmark-artifacts/llamacpp-llm-performance-findings.json \ + --output benchmark-artifacts/llamacpp-llm-performance-findings.md + + - name: Add summary + if: always() + shell: bash + run: | + set +e + MD_FILE="benchmark-artifacts/llamacpp-llm-performance-findings.md" + { + echo "## LLM / VLM Benchmark Report (Desktop)" + echo "" + echo "> Triggered manually via \`workflow_dispatch\` — \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`." + echo "" + echo "> Mobile (Android / iOS) is **not** covered by this workflow yet — bench-mode iteration counts need a build-time hook in the mobile test app (QVAC-18111 follow-up). Mobile rows shown in PR runs continue to use 1 + 1." + echo "" + if [ -f "$MD_FILE" ]; then + cat "$MD_FILE" + else + echo "No combined performance report available." 
+ fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Upload consolidated benchmark report + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 + with: + name: llamacpp-llm-performance-findings + path: | + benchmark-artifacts/llamacpp-llm-performance-findings.md + benchmark-artifacts/llamacpp-llm-performance-findings.json + benchmark-artifacts/llamacpp-llm-performance-findings.html + retention-days: 30 + if-no-files-found: ignore diff --git a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml index 198152d913..34613cba42 100644 --- a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml @@ -10,6 +10,16 @@ on: type: string model: type: string + qvac_perf_runs: + description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default." + type: string + required: false + default: "" + qvac_perf_warmup_runs: + description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default." + type: string + required: false + default: "" workflow_dispatch: inputs: @@ -22,6 +32,16 @@ on: description: "NPM package containing prebuilds (e.g. @qvac/llm-llamacpp@1.0.0)" type: string required: true + qvac_perf_runs: + description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default." + type: string + required: false + default: "" + qvac_perf_warmup_runs: + description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default." + type: string + required: false + default: "" jobs: run-integration-tests: @@ -196,6 +216,12 @@ jobs: shell: bash env: QASE_API_TOKEN: ${{ secrets.QASE_API_TOKEN }} + # QVAC-18111: empty string => unset, so the perf tests honour + # their PR default (1 warmup + 1 counted). 
The `Benchmark + # Performance (LLM)` workflow_dispatch passes "3" / "1" here + # to opt into mean ± std reporting. + QVAC_PERF_RUNS: ${{ inputs.qvac_perf_runs }} + QVAC_PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }} - name: Run integration test (Windows) if: ${{ matrix.platform == 'win32' }} @@ -206,3 +232,5 @@ jobs: shell: powershell env: QASE_API_TOKEN: ${{ secrets.QASE_API_TOKEN }} + QVAC_PERF_RUNS: ${{ inputs.qvac_perf_runs }} + QVAC_PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }} From b4541736b6dc8cacce1c8774bd64a4e99ad12e9e Mon Sep 17 00:00:00 2001 From: Oluwatobi Adelegan Date: Thu, 30 Apr 2026 18:43:20 +0100 Subject: [PATCH 2/5] QVAC-18111 chore[notask]: trim chatty inline comments in benchmark workflow Made-with: Cursor --- ...erformance-qvac-lib-infer-llamacpp-llm.yml | 36 +++++-------------- ...ation-test-qvac-lib-infer-llamacpp-llm.yml | 4 --- 2 files changed, 8 insertions(+), 32 deletions(-) diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml index 7f3bb956e1..7ac887013b 100644 --- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml @@ -1,26 +1,9 @@ name: Benchmark Performance (LLM) -# QVAC-18111: dedicated benchmarking workflow for the LLM addon — -# manually triggered only. -# -# Per the perf policy agreed on Slack (2026-04-30, @Olya / @Gianfranco): -# the umbrella PR workflow runs perf tests at the cheap default -# (1 warmup + 1 counted, no averaging) so we don't pay full perf -# cost on every PR. This workflow is the only place we crank -# QVAC_PERF_RUNS up to produce mean ± std numbers. -# -# Phase-1 scope: desktop matrix only. Mobile (Android / iOS Device -# Farm) needs a build-time hook in the test app to pass env vars -# through to bare — tracked as a QVAC-18111 follow-up. 
Mobile rows -# in PR runs continue to use the cheap 1+1 default. -# -# Mirrors the structure of the existing `Benchmark Performance -# (Parakeet)` and `Benchmark Performance (Whispercpp)` workflows on -# main: a `context` job derives repo/ref from optional inputs, then -# dispatches `prebuilds-...yml` followed by `integration-test-...yml` -# with the bench-mode iteration counts, and a `summarize` job -# aggregates the artifacts into a single combined HTML + GitHub -# step summary. +# Manually-triggered benchmark workflow. The umbrella on-pr workflow +# runs perf tests at the cheap default; this is where iteration +# counts are cranked up to get mean ± std numbers. Desktop matrix +# only; mobile is a follow-up. on: workflow_dispatch: @@ -129,10 +112,9 @@ jobs: - name: Fix desktop device names shell: bash run: | - # Same fold as the umbrella combine-perf-reports step: - # sibling matrix legs (linux-x64-cpu+linux-x64-gpu, - # linux-arm64-u22+linux-arm64-u24) collapse onto one device - # name so [CPU]/[GPU] rows sit in the same column. + # Collapse sibling matrix legs (linux-x64-cpu/gpu, + # linux-arm64-u22/u24) onto one device name so [CPU]/[GPU] + # rows sit in the same column. for dir in combined-reports/perf-report-llamacpp-llm-*/; do [ -d "$dir" ] || continue base=$(basename "$dir") @@ -180,9 +162,7 @@ jobs: { echo "## LLM / VLM Benchmark Report (Desktop)" echo "" - echo "> Triggered manually via \`workflow_dispatch\` — \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`." - echo "" - echo "> Mobile (Android / iOS) is **not** covered by this workflow yet — bench-mode iteration counts need a build-time hook in the mobile test app (QVAC-18111 follow-up). Mobile rows shown in PR runs continue to use 1 + 1." + echo "> \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`. Mobile is not covered by this workflow yet." 
echo "" if [ -f "$MD_FILE" ]; then cat "$MD_FILE" diff --git a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml index 34613cba42..a619538ba8 100644 --- a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml @@ -216,10 +216,6 @@ jobs: shell: bash env: QASE_API_TOKEN: ${{ secrets.QASE_API_TOKEN }} - # QVAC-18111: empty string => unset, so the perf tests honour - # their PR default (1 warmup + 1 counted). The `Benchmark - # Performance (LLM)` workflow_dispatch passes "3" / "1" here - # to opt into mean ± std reporting. QVAC_PERF_RUNS: ${{ inputs.qvac_perf_runs }} QVAC_PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }} From 8e27396e85f1c7ef1dc418ac99f5e33feb0478df Mon Sep 17 00:00:00 2001 From: Oluwatobi Adelegan Date: Thu, 30 Apr 2026 18:58:07 +0100 Subject: [PATCH 3/5] QVAC-18111 chore[notask]: add run_desktop toggle to benchmark workflow_dispatch Made-with: Cursor --- .../benchmark-performance-qvac-lib-infer-llamacpp-llm.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml index 7ac887013b..e32cb9adc2 100644 --- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml @@ -26,6 +26,11 @@ on: required: false type: string default: "1" + run_desktop: + description: "Run desktop matrix (Linux / macOS / Windows)" + required: false + type: boolean + default: true permissions: contents: read @@ -67,6 +72,7 @@ jobs: desktop-benchmarks: needs: [context, prebuild] + if: ${{ inputs.run_desktop }} permissions: contents: read packages: read From b4b777ae9eef4577a6ef8c273c1062b0f8d903ab Mon Sep 17 00:00:00 2001 From: Oluwatobi Adelegan Date: Thu, 30 Apr 2026 
18:49:54 +0100 Subject: [PATCH 4/5] QVAC-18111 infra[notask]: bridge QVAC_PERF_RUNS to mobile test app via pushFile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the mobile integration workflow with the same iteration-count inputs as the desktop reusable workflow, and adds a `mobile-benchmarks` job to the LLM benchmark dispatch so it covers Device Farm too. The bare runtime on Device Farm doesn't see GitHub Actions env vars, so we mirror the existing `testFilter.txt` pattern: when the workflow inputs are non-empty, the WDIO before-hook pushes a `qvacPerfConfig.txt` to the device (Android: `/data/local/tmp/`, iOS: `@bundleId:documents/`) with the iteration overrides as KEY=VALUE lines. The file-reading side on bare lives on the QVAC-17830 perf branch — without that branch this PR is a no-op (orphan file), so it is safe to land independently. Changes: - `integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml`: add `qvac_perf_runs` / `qvac_perf_warmup_runs` to `workflow_call` and `workflow_dispatch`; add `__QVAC_PERF_RUNS__` / `__QVAC_PERF_WARMUP_RUNS__` placeholders to the Android + iOS WDIO config blobs and the corresponding pushFile block in the `before` hook; substitute the placeholders in `make_split`. - `benchmark-performance-qvac-lib-infer-llamacpp-llm.yml`: add a `mobile-benchmarks` job calling the mobile workflow with the bench-mode iteration counts; have `summarize` `needs:` it; drop the "desktop only" caveat in the step-summary blurb. PR runs are unchanged: empty input ⇒ empty placeholder ⇒ before-hook skips the perf-config push. 
Made-with: Cursor --- ...erformance-qvac-lib-infer-llamacpp-llm.yml | 25 +++++++++++--- ...obile-test-qvac-lib-infer-llamacpp-llm.yml | 33 +++++++++++++++++-- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml index e32cb9adc2..e3072dfa5e 100644 --- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml @@ -2,8 +2,8 @@ name: Benchmark Performance (LLM) # Manually-triggered benchmark workflow. The umbrella on-pr workflow # runs perf tests at the cheap default; this is where iteration -# counts are cranked up to get mean ± std numbers. Desktop matrix -# only; mobile is a follow-up. +# counts are cranked up to get mean ± std numbers. Covers desktop + +# mobile (Android / iOS via Device Farm). on: workflow_dispatch: @@ -85,8 +85,23 @@ jobs: qvac_perf_runs: ${{ inputs.qvac_perf_runs }} qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }} + mobile-benchmarks: + needs: [context, prebuild] + permissions: + contents: read + packages: read + pull-requests: write + id-token: write + uses: ./.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml + secrets: inherit + with: + repository: ${{ needs.context.outputs.repository }} + ref: ${{ needs.context.outputs.ref }} + qvac_perf_runs: ${{ inputs.qvac_perf_runs }} + qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }} + summarize: - needs: [context, desktop-benchmarks] + needs: [context, desktop-benchmarks, mobile-benchmarks] if: always() runs-on: ubuntu-latest timeout-minutes: 10 @@ -166,9 +181,9 @@ jobs: set +e MD_FILE="benchmark-artifacts/llamacpp-llm-performance-findings.md" { - echo "## LLM / VLM Benchmark Report (Desktop)" + echo "## LLM / VLM Benchmark Report" echo "" - echo "> \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, 
\`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`. Mobile is not covered by this workflow yet." + echo "> \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`." echo "" if [ -f "$MD_FILE" ]; then cat "$MD_FILE" diff --git a/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml index 898b552aea..2bcff3af4e 100644 --- a/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml @@ -11,6 +11,16 @@ on: description: "Repository to checkout" type: string required: false + qvac_perf_runs: + description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default." + type: string + required: false + default: "" + qvac_perf_warmup_runs: + description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default." + type: string + required: false + default: "" workflow_dispatch: inputs: ref: @@ -23,6 +33,16 @@ on: type: string required: true default: "@qvac/llm-llamacpp@latest" + qvac_perf_runs: + description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default." + type: string + required: false + default: "" + qvac_perf_warmup_runs: + description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default." + type: string + required: false + default: "" env: NODE_VERSION: "lts/*" @@ -923,8 +943,9 @@ jobs: # delay so logcat has time to drain bare stdout / native logs that # would otherwise be lost when process.exit(1) tears down Appium # before Device Farm finalises the artifact bundle. 
- # __TEST_FILTER__ placeholder is replaced per-split by make_split() - WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"Android","appium:automationName":"UiAutomator2","appium:appPackage":"'${{ env.APP_BUNDLE_ID }}'","appium:appActivity":"'${{ env.APP_BUNDLE_ID }}'.MainActivity","appium:newCommandTimeout":300,"appium:autoGrantPermissions":true,"appium:autoAcceptAlerts":true,"appium:noReset":true,"appium:dontStopAppOnReset":true,"appium:forceAppLaunch":false}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";global.appCrashed=false;global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! 
State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("android=new UiSelector().textContains(\"INITIALIZED\")");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("/data/local/tmp/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("android=new UiSelector().textContains(\"Run Automated Tests\")");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};' + # __TEST_FILTER__, __QVAC_PERF_RUNS__, __QVAC_PERF_WARMUP_RUNS__ + # placeholders are replaced per-split by make_split(). 
+ WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"Android","appium:automationName":"UiAutomator2","appium:appPackage":"'${{ env.APP_BUNDLE_ID }}'","appium:appActivity":"'${{ env.APP_BUNDLE_ID }}'.MainActivity","appium:newCommandTimeout":300,"appium:autoGrantPermissions":true,"appium:autoAcceptAlerts":true,"appium:noReset":true,"appium:dontStopAppOnReset":true,"appium:forceAppLaunch":false}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";const QVAC_PERF_RUNS_VALUE="__QVAC_PERF_RUNS__";const QVAC_PERF_WARMUP_RUNS_VALUE="__QVAC_PERF_WARMUP_RUNS__";global.appCrashed=false;global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! 
State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("android=new UiSelector().textContains(\"INITIALIZED\")");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("/data/local/tmp/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}if(QVAC_PERF_RUNS_VALUE.length>0||QVAC_PERF_WARMUP_RUNS_VALUE.length>0){try{var perfCfg="QVAC_PERF_RUNS="+QVAC_PERF_RUNS_VALUE+"\\nQVAC_PERF_WARMUP_RUNS="+QVAC_PERF_WARMUP_RUNS_VALUE+"\\n";var pcb64=Buffer.from(perfCfg).toString("base64");await browser.pushFile("/data/local/tmp/qvacPerfConfig.txt",pcb64);console.log("Pushed perf config: "+perfCfg.replace(/\\n/g," "));}catch(e){console.log("perfConfig pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("android=new UiSelector().textContains(\"Run Automated Tests\")");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! 
App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};' else PLATFORM="iOS" AUTOMATION="XCUITest" @@ -946,7 +967,7 @@ jobs: # on the normal completion path. # usePrebuiltWDA uses Device Farm's pre-built WebDriverAgent # Increased timeout to 30 minutes (1800000ms) for long-running LLM tests - WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"iOS","appium:automationName":"XCUITest","appium:bundleId":"'${{ env.APP_BUNDLE_ID }}'","appium:newCommandTimeout":300,"appium:noReset":true,"appium:forceAppLaunch":false,"appium:usePrebuiltWDA":true,"appium:wdaLocalPort":8100,"appium:showIOSLog":true,"appium:realDeviceLogger":"/usr/local/lib/node_modules/appium/node_modules/deviceconsole/deviceconsole"}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";global.appCrashed=false;global.flushBareLog=async function(reason){try{var _h=require("http");var lb64=await new Promise(function(ok,fail){var bd=JSON.stringify({path:"@"+BUNDLE_ID+":documents/bare_console.log"});var rq=_h.request({hostname:"127.0.0.1",port:4723,path:"/wd/hub/session/"+browser.sessionId+"/appium/device/pull_file",method:"POST",headers:{"Content-Type":"application/json","Content-Length":Buffer.byteLength(bd)}},function(rs){var 
d="";rs.on("data",function(c){d+=c;});rs.on("end",function(){try{ok(JSON.parse(d).value);}catch(e){fail(e);}});});rq.on("error",fail);rq.write(bd);rq.end();});var logTxt=Buffer.from(lb64,"base64").toString();var logDir=process.env.DEVICEFARM_LOG_DIR||".";require("fs").writeFileSync(logDir+"/bare_console.log",logTxt);console.log("[bare-log] "+reason+" flush ok ("+logTxt.length+" bytes)");}catch(e){console.log("[bare-log] "+reason+" flush failed: "+e.message);}};global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-"+stage),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("-ios predicate string:label CONTAINS \"INITIALIZED\"");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("@"+BUNDLE_ID+":documents/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("-ios predicate string:label CONTAINS \"Run Automated Tests\"");await 
button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-bg"),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);console.log("[bare-log] Waiting for log flush...");await browser.pause(3000);if(global.flushBareLog)await global.flushBareLog("after");},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};' + WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"iOS","appium:automationName":"XCUITest","appium:bundleId":"'${{ env.APP_BUNDLE_ID }}'","appium:newCommandTimeout":300,"appium:noReset":true,"appium:forceAppLaunch":false,"appium:usePrebuiltWDA":true,"appium:wdaLocalPort":8100,"appium:showIOSLog":true,"appium:realDeviceLogger":"/usr/local/lib/node_modules/appium/node_modules/deviceconsole/deviceconsole"}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";const QVAC_PERF_RUNS_VALUE="__QVAC_PERF_RUNS__";const 
QVAC_PERF_WARMUP_RUNS_VALUE="__QVAC_PERF_WARMUP_RUNS__";global.appCrashed=false;global.flushBareLog=async function(reason){try{var _h=require("http");var lb64=await new Promise(function(ok,fail){var bd=JSON.stringify({path:"@"+BUNDLE_ID+":documents/bare_console.log"});var rq=_h.request({hostname:"127.0.0.1",port:4723,path:"/wd/hub/session/"+browser.sessionId+"/appium/device/pull_file",method:"POST",headers:{"Content-Type":"application/json","Content-Length":Buffer.byteLength(bd)}},function(rs){var d="";rs.on("data",function(c){d+=c;});rs.on("end",function(){try{ok(JSON.parse(d).value);}catch(e){fail(e);}});});rq.on("error",fail);rq.write(bd);rq.end();});var logTxt=Buffer.from(lb64,"base64").toString();var logDir=process.env.DEVICEFARM_LOG_DIR||".";require("fs").writeFileSync(logDir+"/bare_console.log",logTxt);console.log("[bare-log] "+reason+" flush ok ("+logTxt.length+" bytes)");}catch(e){console.log("[bare-log] "+reason+" flush failed: "+e.message);}};global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! 
State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-"+stage),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("-ios predicate string:label CONTAINS \"INITIALIZED\"");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("@"+BUNDLE_ID+":documents/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}/* A placeholder that was never substituted counts as unset; the literals are split so sed cannot rewrite them. */var perfRuns=QVAC_PERF_RUNS_VALUE==="__QVAC_PERF_"+"RUNS__"?"":QVAC_PERF_RUNS_VALUE;var perfWarmupRuns=QVAC_PERF_WARMUP_RUNS_VALUE==="__QVAC_PERF_"+"WARMUP_RUNS__"?"":QVAC_PERF_WARMUP_RUNS_VALUE;if(perfRuns.length>0||perfWarmupRuns.length>0){try{var perfCfg="QVAC_PERF_RUNS="+perfRuns+"\\nQVAC_PERF_WARMUP_RUNS="+perfWarmupRuns+"\\n";var pcb64=Buffer.from(perfCfg).toString("base64");await browser.pushFile("@"+BUNDLE_ID+":documents/qvacPerfConfig.txt",pcb64);console.log("Pushed perf config: "+perfCfg.replace(/\\n/g," "));}catch(e){console.log("perfConfig pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("-ios predicate string:label CONTAINS \"Run Automated Tests\"");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await 
browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-bg"),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);console.log("[bare-log] Waiting for log flush...");await browser.pause(3000);if(global.flushBareLog)await global.flushBareLog("after");},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};' fi WDIO_B64=$(echo "$WDIO_CONFIG" | base64 | tr -d '\n') @@ -1063,11 +1084,17 @@ jobs: # For each split: inject mocha grep AND replace __TEST_FILTER__ so # the before-hook pushes a testFilter.txt the app reads at runtime. # This ensures the app only executes matching tests (real splitting). + # When set, the perf inputs also get substituted so the before-hook + # pushes a qvacPerfConfig.txt with the iteration overrides. 
+ QVAC_PERF_RUNS_INPUT="${{ inputs.qvac_perf_runs }}" + QVAC_PERF_WARMUP_RUNS_INPUT="${{ inputs.qvac_perf_warmup_runs }}" make_split() { local pattern="$1" output="$2" local cfg cfg=$(echo "$WDIO_CONFIG" | sed "s#__MOCHA_GREP__#$pattern#") cfg=$(echo "$cfg" | sed "s#__TEST_FILTER__#$pattern#") + cfg=$(echo "$cfg" | sed "s#__QVAC_PERF_RUNS__#$QVAC_PERF_RUNS_INPUT#") + cfg=$(echo "$cfg" | sed "s#__QVAC_PERF_WARMUP_RUNS__#$QVAC_PERF_WARMUP_RUNS_INPUT#") local b64 b64=$(echo "$cfg" | base64 | tr -d '\n') generate_spec "$output" "$b64" From 9cdf07be453dea3c150224b29ba0cbc6c130c440 Mon Sep 17 00:00:00 2001 From: Oluwatobi Adelegan Date: Thu, 30 Apr 2026 19:00:25 +0100 Subject: [PATCH 5/5] QVAC-18111 chore[notask]: add run_mobile toggle to benchmark workflow_dispatch Made-with: Cursor --- ...chmark-performance-qvac-lib-infer-llamacpp-llm.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml index e3072dfa5e..9abc517c23 100644 --- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml @@ -31,6 +31,11 @@ on: required: false type: boolean default: true + run_mobile: + description: "Run mobile matrix (Android / iOS via Device Farm)" + required: false + type: boolean + default: true permissions: contents: read @@ -87,6 +92,7 @@ jobs: mobile-benchmarks: needs: [context, prebuild] + if: ${{ inputs.run_mobile }} permissions: contents: read packages: read @@ -101,8 +107,11 @@ jobs: qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }} summarize: + # `always()` lets summarize run even when a benchmark job failed + # mid-run or was skipped via the run_desktop / run_mobile toggles, so we + # still get the partial report; it is skipped unless `context` succeeded. 
needs: [context, desktop-benchmarks, mobile-benchmarks] - if: always() + if: ${{ always() && needs.context.result == 'success' }} runs-on: ubuntu-latest timeout-minutes: 10 permissions: