From c9fe74673ae3561e46871c982bc80a1f3bc34fd8 Mon Sep 17 00:00:00 2001
From: Oluwatobi Adelegan <tobiadelegan70@gmail.com>
Date: Thu, 30 Apr 2026 18:21:51 +0100
Subject: [PATCH 1/5] QVAC-18111 infra[notask]: scaffold Benchmark Performance
 (LLM) workflow_dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GitHub requires a `workflow_dispatch` workflow to exist on the
default branch before it shows up in the Actions tab and becomes
triggerable with `--ref <feature-branch>`. This lands the LLM
benchmark workflow on `main` so the QVAC-17830 perf-metrics feature
branch can be dispatched against it for end-to-end validation.

Changes:
- `benchmark-performance-qvac-lib-infer-llamacpp-llm.yml` (new):
  manual `workflow_dispatch` only — mirrors the structure of the
  existing Parakeet / Whispercpp benchmark workflows. Calls
  `prebuilds-...yml` then `integration-test-...yml` with
  bench-mode iteration counts (`QVAC_PERF_RUNS=3`,
  `QVAC_PERF_WARMUP_RUNS=1` by default), then aggregates desktop
  artifacts into a combined HTML / step-summary. Phase-1 scope is
  desktop only — mobile (Device Farm) needs a build-time hook in
  the test app to thread env vars through to bare and is tracked
  as a QVAC-18111 follow-up.
- `integration-test-qvac-lib-infer-llamacpp-llm.yml`: thread
  `qvac_perf_runs` / `qvac_perf_warmup_runs` through `workflow_call`
  + `workflow_dispatch` and surface them as `QVAC_PERF_RUNS` /
  `QVAC_PERF_WARMUP_RUNS` on the Linux/macOS and Windows test run
  steps. An empty input reaches the step as an empty (not missing)
  variable, which the test side is expected to treat as unset, so the
  umbrella PR workflow keeps the test-side default and PR runs are
  unaffected by this change.

Per the perf policy agreed on Slack (2026-04-30): the umbrella
on-pr workflow runs perf tests at the cheap default so we don't pay
full perf cost on every PR; this dedicated workflow is the only
place we crank up the iteration counts to produce mean ± std
numbers.

Made-with: Cursor
---
 ...erformance-qvac-lib-infer-llamacpp-llm.yml | 204 ++++++++++++++++++
 ...ation-test-qvac-lib-infer-llamacpp-llm.yml |  28 +++
 2 files changed, 232 insertions(+)
 create mode 100644 .github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
new file mode 100644
index 0000000000..7f3bb956e1
--- /dev/null
+++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
@@ -0,0 +1,204 @@
+name: Benchmark Performance (LLM)
+
+# QVAC-18111: dedicated benchmarking workflow for the LLM addon —
+# manually triggered only.
+#
+# Per the perf policy agreed on Slack (2026-04-30, @Olya / @Gianfranco):
+# the umbrella PR workflow runs perf tests at the cheap default
+# (1 warmup + 1 counted, no averaging) so we don't pay full perf
+# cost on every PR. This workflow is the only place we crank
+# QVAC_PERF_RUNS up to produce mean ± std numbers.
+#
+# Phase-1 scope: desktop matrix only. Mobile (Android / iOS Device
+# Farm) needs a build-time hook in the test app to pass env vars
+# through to bare — tracked as a QVAC-18111 follow-up. Mobile rows
+# in PR runs continue to use the cheap 1+1 default.
+#
+# Mirrors the structure of the existing `Benchmark Performance
+# (Parakeet)` and `Benchmark Performance (Whispercpp)` workflows on
+# main: a `context` job derives repo/ref from optional inputs, then
+# dispatches `prebuilds-...yml` followed by `integration-test-...yml`
+# with the bench-mode iteration counts, and a `summarize` job
+# aggregates the artifacts into a single combined HTML + GitHub
+# step summary.
+
+on:
+  workflow_dispatch:
+    inputs:
+      repository:
+        description: "Repository to benchmark"
+        required: false
+        type: string
+      ref:
+        description: "Git ref (branch/tag/SHA) to benchmark"
+        required: false
+        type: string
+      qvac_perf_runs:
+        description: "QVAC_PERF_RUNS — counted iterations per perf test"
+        required: false
+        type: string
+        default: "3"
+      qvac_perf_warmup_runs:
+        description: "QVAC_PERF_WARMUP_RUNS — warmup iterations per perf test"
+        required: false
+        type: string
+        default: "1"
+
+permissions:
+  contents: read
+  packages: read
+  id-token: write
+
+jobs:
+  context:  # Resolves which repository/ref the downstream jobs operate on.
+    runs-on: ubuntu-latest
+    outputs:  # Re-exported from the `ctx` step for use via `needs.context.outputs.*`.
+      repository: ${{ steps.ctx.outputs.repository }}
+      ref: ${{ steps.ctx.outputs.ref }}
+    steps:
+      - id: ctx
+        shell: bash
+        env:  # Inputs and github context are handed to the script as env vars, not inlined.
+          INPUT_REPO: ${{ inputs.repository }}
+          INPUT_REF: ${{ inputs.ref }}
+          REPO: ${{ github.repository }}
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          repo="${INPUT_REPO:-$REPO}"  # empty/unset input -> the repository this workflow runs in
+          ref="${INPUT_REF:-$REF_NAME}"  # empty/unset input -> the ref name the run was dispatched on
+          echo "repository=$repo" >> "$GITHUB_OUTPUT"
+          echo "ref=$ref" >> "$GITHUB_OUTPUT"
+
+  prebuild:
+    needs: context
+    permissions:
+      contents: write
+      packages: write
+      pull-requests: write
+      id-token: write
+    uses: ./.github/workflows/prebuilds-qvac-lib-infer-llamacpp-llm.yml
+    secrets: inherit
+    with:
+      repository: ${{ needs.context.outputs.repository }}
+      ref: ${{ needs.context.outputs.ref }}
+
+  desktop-benchmarks:
+    needs: [context, prebuild]
+    permissions:
+      contents: read
+      packages: read
+      id-token: write
+    uses: ./.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
+    secrets: inherit
+    with:
+      repository: ${{ needs.context.outputs.repository }}
+      ref: ${{ needs.context.outputs.ref }}
+      qvac_perf_runs: ${{ inputs.qvac_perf_runs }}
+      qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }}
+
+  summarize:
+    needs: [context, desktop-benchmarks]
+    if: always()
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+        with:
+          repository: ${{ needs.context.outputs.repository }}
+          ref: ${{ needs.context.outputs.ref }}
+          token: ${{ secrets.PAT_TOKEN }}
+          sparse-checkout: |
+            scripts/perf-report
+            packages/qvac-lib-infer-llamacpp-llm/media
+
+      - name: Setup Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # 4.4.0
+        with:
+          node-version: lts/*
+
+      - name: Download all perf report artifacts
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
+        with:
+          pattern: perf-report-llamacpp-llm-*-${{ github.run_number }}
+          path: combined-reports
+        continue-on-error: true
+
+      - name: Fix desktop device names
+        shell: bash
+        run: |
+          # Same fold as the umbrella combine-perf-reports step:
+          # sibling matrix legs (linux-x64-cpu+linux-x64-gpu,
+          # linux-arm64-u22+linux-arm64-u24) collapse onto one device
+          # name so [CPU]/[GPU] rows sit in the same column.
+          for dir in combined-reports/perf-report-llamacpp-llm-*/; do
+            [ -d "$dir" ] || continue
+            base=$(basename "$dir")
+            platform=$(echo "$base" | sed "s/^perf-report-llamacpp-llm-//" | sed "s/-${{ github.run_number }}$//")
+
+            case "$platform" in Android|iOS) continue ;; esac
+
+            case "$platform" in
+              linux-x64-cpu|linux-x64-gpu) device_name="linux-x64" ;;
+              linux-arm64-u22|linux-arm64-u24) device_name="linux-arm64" ;;
+              *) device_name="$platform" ;;
+            esac
+
+            while IFS= read -r -d '' json; do  # NUL-delimited read: report paths are never word-split or glob-expanded
+              if command -v jq >/dev/null 2>&1; then
+                jq --arg name "$device_name" '.device.name = $name' "$json" > "${json}.tmp" && mv "${json}.tmp" "$json"
+                echo "Patched device name in $json -> $device_name (was matrix label $platform)"
+              fi
+            done < <(find "$dir" -name "performance-report.json" -print0 2>/dev/null)
+          done
+
+      - name: Generate consolidated benchmark report
+        run: |
+          if ! find combined-reports -name "performance-report.json" -type f 2>/dev/null | grep -q .; then
+            echo "No performance reports found."
+            exit 0
+          fi
+
+          mkdir -p benchmark-artifacts
+
+          node scripts/perf-report/aggregate.js \
+            --dir combined-reports \
+            --addon-type vision \
+            --device-details \
+            --output-html benchmark-artifacts/llamacpp-llm-performance-findings.html \
+            --output-json benchmark-artifacts/llamacpp-llm-performance-findings.json \
+            --output benchmark-artifacts/llamacpp-llm-performance-findings.md
+
+      - name: Add summary
+        if: always()
+        shell: bash
+        run: |
+          set +e
+          MD_FILE="benchmark-artifacts/llamacpp-llm-performance-findings.md"
+          {
+            echo "## LLM / VLM Benchmark Report (Desktop)"
+            echo ""
+            echo "> Triggered manually via \`workflow_dispatch\` — \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`."
+            echo ""
+            echo "> Mobile (Android / iOS) is **not** covered by this workflow yet — bench-mode iteration counts need a build-time hook in the mobile test app (QVAC-18111 follow-up). Mobile rows shown in PR runs continue to use 1 + 1."
+            echo ""
+            if [ -f "$MD_FILE" ]; then
+              cat "$MD_FILE"
+            else
+              echo "No combined performance report available."
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload consolidated benchmark report
+        if: always()
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
+        with:
+          name: llamacpp-llm-performance-findings
+          path: |
+            benchmark-artifacts/llamacpp-llm-performance-findings.md
+            benchmark-artifacts/llamacpp-llm-performance-findings.json
+            benchmark-artifacts/llamacpp-llm-performance-findings.html
+          retention-days: 30
+          if-no-files-found: ignore
diff --git a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
index 198152d913..34613cba42 100644
--- a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
@@ -10,6 +10,16 @@ on:
         type: string
       model:
         type: string
+      qvac_perf_runs:
+        description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
+      qvac_perf_warmup_runs:
+        description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
 
   workflow_dispatch:
     inputs:
@@ -22,6 +32,16 @@ on:
         description: "NPM package containing prebuilds (e.g. @qvac/llm-llamacpp@1.0.0)"
         type: string
         required: true
+      qvac_perf_runs:
+        description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
+      qvac_perf_warmup_runs:
+        description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
 
 jobs:
   run-integration-tests:
@@ -196,6 +216,12 @@ jobs:
         shell: bash
         env:
           QASE_API_TOKEN: ${{ secrets.QASE_API_TOKEN }}
+          # QVAC-18111: empty string => unset, so the perf tests honour
+          # their PR default (1 warmup + 1 counted). The `Benchmark
+          # Performance (LLM)` workflow_dispatch passes "3" / "1" here
+          # to opt into mean ± std reporting.
+          QVAC_PERF_RUNS: ${{ inputs.qvac_perf_runs }}
+          QVAC_PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }}
 
       - name: Run integration test (Windows)
         if: ${{ matrix.platform == 'win32' }}
@@ -206,3 +232,5 @@ jobs:
         shell: powershell
         env:
           QASE_API_TOKEN: ${{ secrets.QASE_API_TOKEN }}
+          QVAC_PERF_RUNS: ${{ inputs.qvac_perf_runs }}
+          QVAC_PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }}

From b4541736b6dc8cacce1c8774bd64a4e99ad12e9e Mon Sep 17 00:00:00 2001
From: Oluwatobi Adelegan <tobiadelegan70@gmail.com>
Date: Thu, 30 Apr 2026 18:43:20 +0100
Subject: [PATCH 2/5] QVAC-18111 chore[notask]: trim chatty inline comments in
 benchmark workflow

Made-with: Cursor
---
 ...erformance-qvac-lib-infer-llamacpp-llm.yml | 36 +++++--------------
 ...ation-test-qvac-lib-infer-llamacpp-llm.yml |  4 ---
 2 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
index 7f3bb956e1..7ac887013b 100644
--- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
@@ -1,26 +1,9 @@
 name: Benchmark Performance (LLM)
 
-# QVAC-18111: dedicated benchmarking workflow for the LLM addon —
-# manually triggered only.
-#
-# Per the perf policy agreed on Slack (2026-04-30, @Olya / @Gianfranco):
-# the umbrella PR workflow runs perf tests at the cheap default
-# (1 warmup + 1 counted, no averaging) so we don't pay full perf
-# cost on every PR. This workflow is the only place we crank
-# QVAC_PERF_RUNS up to produce mean ± std numbers.
-#
-# Phase-1 scope: desktop matrix only. Mobile (Android / iOS Device
-# Farm) needs a build-time hook in the test app to pass env vars
-# through to bare — tracked as a QVAC-18111 follow-up. Mobile rows
-# in PR runs continue to use the cheap 1+1 default.
-#
-# Mirrors the structure of the existing `Benchmark Performance
-# (Parakeet)` and `Benchmark Performance (Whispercpp)` workflows on
-# main: a `context` job derives repo/ref from optional inputs, then
-# dispatches `prebuilds-...yml` followed by `integration-test-...yml`
-# with the bench-mode iteration counts, and a `summarize` job
-# aggregates the artifacts into a single combined HTML + GitHub
-# step summary.
+# Manually-triggered benchmark workflow. The umbrella on-pr workflow
+# runs perf tests at the cheap default; this is where iteration
+# counts are cranked up to get mean ± std numbers. Desktop matrix
+# only; mobile is a follow-up.
 
 on:
   workflow_dispatch:
@@ -129,10 +112,9 @@ jobs:
       - name: Fix desktop device names
         shell: bash
         run: |
-          # Same fold as the umbrella combine-perf-reports step:
-          # sibling matrix legs (linux-x64-cpu+linux-x64-gpu,
-          # linux-arm64-u22+linux-arm64-u24) collapse onto one device
-          # name so [CPU]/[GPU] rows sit in the same column.
+          # Collapse sibling matrix legs (linux-x64-cpu/gpu,
+          # linux-arm64-u22/u24) onto one device name so [CPU]/[GPU]
+          # rows sit in the same column.
           for dir in combined-reports/perf-report-llamacpp-llm-*/; do
             [ -d "$dir" ] || continue
             base=$(basename "$dir")
@@ -180,9 +162,7 @@ jobs:
           {
             echo "## LLM / VLM Benchmark Report (Desktop)"
             echo ""
-            echo "> Triggered manually via \`workflow_dispatch\` — \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`."
-            echo ""
-            echo "> Mobile (Android / iOS) is **not** covered by this workflow yet — bench-mode iteration counts need a build-time hook in the mobile test app (QVAC-18111 follow-up). Mobile rows shown in PR runs continue to use 1 + 1."
+            echo "> \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`. Mobile is not covered by this workflow yet."
             echo ""
             if [ -f "$MD_FILE" ]; then
               cat "$MD_FILE"
diff --git a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
index 34613cba42..a619538ba8 100644
--- a/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/integration-test-qvac-lib-infer-llamacpp-llm.yml
@@ -216,10 +216,6 @@ jobs:
         shell: bash
         env:
           QASE_API_TOKEN: ${{ secrets.QASE_API_TOKEN }}
-          # QVAC-18111: empty string => unset, so the perf tests honour
-          # their PR default (1 warmup + 1 counted). The `Benchmark
-          # Performance (LLM)` workflow_dispatch passes "3" / "1" here
-          # to opt into mean ± std reporting.
           QVAC_PERF_RUNS: ${{ inputs.qvac_perf_runs }}
           QVAC_PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }}
 

From 8e27396e85f1c7ef1dc418ac99f5e33feb0478df Mon Sep 17 00:00:00 2001
From: Oluwatobi Adelegan <tobiadelegan70@gmail.com>
Date: Thu, 30 Apr 2026 18:58:07 +0100
Subject: [PATCH 3/5] QVAC-18111 chore[notask]: add run_desktop toggle to
 benchmark workflow_dispatch

Made-with: Cursor
---
 .../benchmark-performance-qvac-lib-infer-llamacpp-llm.yml   | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
index 7ac887013b..e32cb9adc2 100644
--- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
@@ -26,6 +26,11 @@ on:
         required: false
         type: string
         default: "1"
+      run_desktop:
+        description: "Run desktop matrix (Linux / macOS / Windows)"
+        required: false
+        type: boolean
+        default: true
 
 permissions:
   contents: read
@@ -67,6 +72,7 @@ jobs:
 
   desktop-benchmarks:
     needs: [context, prebuild]
+    if: ${{ inputs.run_desktop }}
     permissions:
       contents: read
       packages: read

From b4b777ae9eef4577a6ef8c273c1062b0f8d903ab Mon Sep 17 00:00:00 2001
From: Oluwatobi Adelegan <tobiadelegan70@gmail.com>
Date: Thu, 30 Apr 2026 18:49:54 +0100
Subject: [PATCH 4/5] QVAC-18111 infra[notask]: bridge QVAC_PERF_RUNS to mobile
 test app via pushFile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends the mobile integration workflow with the same iteration-count
inputs as the desktop reusable workflow, and adds a `mobile-benchmarks`
job to the LLM benchmark dispatch so it covers Device Farm too.

The bare runtime on Device Farm doesn't see GitHub Actions env vars,
so we mirror the existing `testFilter.txt` pattern: when the workflow
inputs are non-empty, the WDIO before-hook pushes a `qvacPerfConfig.txt`
to the device (Android: `/data/local/tmp/`, iOS:
`@bundleId:documents/`) with the iteration overrides as KEY=VALUE
lines. The file-reading side on bare lives on the QVAC-17830 perf
branch; until that branch lands, the pushed file is simply never read,
so this change is a no-op at runtime and safe to land independently.

Changes:
- `integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml`: add
  `qvac_perf_runs` / `qvac_perf_warmup_runs` to `workflow_call` and
  `workflow_dispatch`; add `__QVAC_PERF_RUNS__` /
  `__QVAC_PERF_WARMUP_RUNS__` placeholders to the Android + iOS
  WDIO config blobs and the corresponding pushFile block in the
  `before` hook; substitute the placeholders in `make_split`.
- `benchmark-performance-qvac-lib-infer-llamacpp-llm.yml`: add a
  `mobile-benchmarks` job calling the mobile workflow with the
  bench-mode iteration counts; have `summarize` `needs:` it; drop
  the "desktop only" caveat in the step-summary blurb.

PR runs are unchanged: empty input ⇒ empty placeholder ⇒ before-hook
skips the perf-config push.

Made-with: Cursor
---
 ...erformance-qvac-lib-infer-llamacpp-llm.yml | 25 +++++++++++---
 ...obile-test-qvac-lib-infer-llamacpp-llm.yml | 33 +++++++++++++++++--
 2 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
index e32cb9adc2..e3072dfa5e 100644
--- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
@@ -2,8 +2,8 @@ name: Benchmark Performance (LLM)
 
 # Manually-triggered benchmark workflow. The umbrella on-pr workflow
 # runs perf tests at the cheap default; this is where iteration
-# counts are cranked up to get mean ± std numbers. Desktop matrix
-# only; mobile is a follow-up.
+# counts are cranked up to get mean ± std numbers. Covers desktop +
+# mobile (Android / iOS via Device Farm).
 
 on:
   workflow_dispatch:
@@ -85,8 +85,23 @@ jobs:
       qvac_perf_runs: ${{ inputs.qvac_perf_runs }}
       qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }}
 
+  mobile-benchmarks:
+    needs: [context, prebuild]
+    permissions:
+      contents: read
+      packages: read
+      pull-requests: write
+      id-token: write
+    uses: ./.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml
+    secrets: inherit
+    with:
+      repository: ${{ needs.context.outputs.repository }}
+      ref: ${{ needs.context.outputs.ref }}
+      qvac_perf_runs: ${{ inputs.qvac_perf_runs }}
+      qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }}
+
   summarize:
-    needs: [context, desktop-benchmarks]
+    needs: [context, desktop-benchmarks, mobile-benchmarks]
     if: always()
     runs-on: ubuntu-latest
     timeout-minutes: 10
@@ -164,11 +179,15 @@ jobs:
         shell: bash
+        env:
+          # Inputs go through env so their text is never spliced into the script.
+          PERF_RUNS: ${{ inputs.qvac_perf_runs }}
+          PERF_WARMUP_RUNS: ${{ inputs.qvac_perf_warmup_runs }}
         run: |
           set +e
           MD_FILE="benchmark-artifacts/llamacpp-llm-performance-findings.md"
           {
-            echo "## LLM / VLM Benchmark Report (Desktop)"
+            echo "## LLM / VLM Benchmark Report"
             echo ""
-            echo "> \`QVAC_PERF_RUNS=${{ inputs.qvac_perf_runs }}\`, \`QVAC_PERF_WARMUP_RUNS=${{ inputs.qvac_perf_warmup_runs }}\`. Mobile is not covered by this workflow yet."
+            echo "> \`QVAC_PERF_RUNS=${PERF_RUNS}\`, \`QVAC_PERF_WARMUP_RUNS=${PERF_WARMUP_RUNS}\`."
             echo ""
             if [ -f "$MD_FILE" ]; then
               cat "$MD_FILE"
diff --git a/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml
index 898b552aea..2bcff3af4e 100644
--- a/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/integration-mobile-test-qvac-lib-infer-llamacpp-llm.yml
@@ -11,6 +11,16 @@ on:
         description: "Repository to checkout"
         type: string
         required: false
+      qvac_perf_runs:
+        description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
+      qvac_perf_warmup_runs:
+        description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
   workflow_dispatch:
     inputs:
       ref:
@@ -23,6 +33,16 @@ on:
         type: string
         required: true
         default: "@qvac/llm-llamacpp@latest"
+      qvac_perf_runs:
+        description: "Override QVAC_PERF_RUNS (number of counted iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
+      qvac_perf_warmup_runs:
+        description: "Override QVAC_PERF_WARMUP_RUNS (number of warmup iterations per perf test). Empty = test default."
+        type: string
+        required: false
+        default: ""
 
 env:
   NODE_VERSION: "lts/*"
@@ -923,8 +943,9 @@ jobs:
             # delay so logcat has time to drain bare stdout / native logs that
             # would otherwise be lost when process.exit(1) tears down Appium
             # before Device Farm finalises the artifact bundle.
-            # __TEST_FILTER__ placeholder is replaced per-split by make_split()
-            WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"Android","appium:automationName":"UiAutomator2","appium:appPackage":"'${{ env.APP_BUNDLE_ID }}'","appium:appActivity":"'${{ env.APP_BUNDLE_ID }}'.MainActivity","appium:newCommandTimeout":300,"appium:autoGrantPermissions":true,"appium:autoAcceptAlerts":true,"appium:noReset":true,"appium:dontStopAppOnReset":true,"appium:forceAppLaunch":false}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";global.appCrashed=false;global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! 
State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("android=new UiSelector().textContains(\"INITIALIZED\")");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("/data/local/tmp/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("android=new UiSelector().textContains(\"Run Automated Tests\")");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};'
+            # __TEST_FILTER__, __QVAC_PERF_RUNS__, __QVAC_PERF_WARMUP_RUNS__
+            # placeholders are replaced per-split by make_split().
+            WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"Android","appium:automationName":"UiAutomator2","appium:appPackage":"'${{ env.APP_BUNDLE_ID }}'","appium:appActivity":"'${{ env.APP_BUNDLE_ID }}'.MainActivity","appium:newCommandTimeout":300,"appium:autoGrantPermissions":true,"appium:autoAcceptAlerts":true,"appium:noReset":true,"appium:dontStopAppOnReset":true,"appium:forceAppLaunch":false}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";const QVAC_PERF_RUNS_VALUE="__QVAC_PERF_RUNS__";const QVAC_PERF_WARMUP_RUNS_VALUE="__QVAC_PERF_WARMUP_RUNS__";global.appCrashed=false;global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! 
State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("android=new UiSelector().textContains(\"INITIALIZED\")");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("/data/local/tmp/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}if(QVAC_PERF_RUNS_VALUE.length>0||QVAC_PERF_WARMUP_RUNS_VALUE.length>0){try{var perfCfg="QVAC_PERF_RUNS="+QVAC_PERF_RUNS_VALUE+"\\nQVAC_PERF_WARMUP_RUNS="+QVAC_PERF_WARMUP_RUNS_VALUE+"\\n";var pcb64=Buffer.from(perfCfg).toString("base64");await browser.pushFile("/data/local/tmp/qvacPerfConfig.txt",pcb64);console.log("Pushed perf config: "+perfCfg.replace(/\\n/g," "));}catch(e){console.log("perfConfig pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("android=new UiSelector().textContains(\"Run Automated Tests\")");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! 
App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};'
           else
             PLATFORM="iOS"
             AUTOMATION="XCUITest"
@@ -946,7 +967,7 @@ jobs:
             # on the normal completion path.
             # usePrebuiltWDA uses Device Farm's pre-built WebDriverAgent
             # Increased timeout to 30 minutes (1800000ms) for long-running LLM tests
-            WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"iOS","appium:automationName":"XCUITest","appium:bundleId":"'${{ env.APP_BUNDLE_ID }}'","appium:newCommandTimeout":300,"appium:noReset":true,"appium:forceAppLaunch":false,"appium:usePrebuiltWDA":true,"appium:wdaLocalPort":8100,"appium:showIOSLog":true,"appium:realDeviceLogger":"/usr/local/lib/node_modules/appium/node_modules/deviceconsole/deviceconsole"}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";global.appCrashed=false;global.flushBareLog=async function(reason){try{var _h=require("http");var lb64=await new Promise(function(ok,fail){var bd=JSON.stringify({path:"@"+BUNDLE_ID+":documents/bare_console.log"});var rq=_h.request({hostname:"127.0.0.1",port:4723,path:"/wd/hub/session/"+browser.sessionId+"/appium/device/pull_file",method:"POST",headers:{"Content-Type":"application/json","Content-Length":Buffer.byteLength(bd)}},function(rs){var d="";rs.on("data",function(c){d+=c;});rs.on("end",function(){try{ok(JSON.parse(d).value);}catch(e){fail(e);}});});rq.on("error",fail);rq.write(bd);rq.end();});var logTxt=Buffer.from(lb64,"base64").toString();var logDir=process.env.DEVICEFARM_LOG_DIR||".";require("fs").writeFileSync(logDir+"/bare_console.log",logTxt);console.log("[bare-log] "+reason+" flush ok ("+logTxt.length+" bytes)");}catch(e){console.log("[bare-log] "+reason+" flush failed: "+e.message);}};global.checkAppCrash=async(stage)=>{try{const state=await browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 
APP CRASHED at "+stage+"! State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-"+stage),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("-ios predicate string:label CONTAINS \"INITIALIZED\"");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("@"+BUNDLE_ID+":documents/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("-ios predicate string:label CONTAINS \"Run Automated Tests\"");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! 
App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-bg"),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);console.log("[bare-log] Waiting for log flush...");await browser.pause(3000);if(global.flushBareLog)await global.flushBareLog("after");},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};'
+            WDIO_CONFIG='exports.config={runner:"local",hostname:"127.0.0.1",port:4723,path:"/wd/hub",specs:["*.spec.js","*.test.js"],maxInstances:1,bail:0,capabilities:[{platformName:"iOS","appium:automationName":"XCUITest","appium:bundleId":"'${{ env.APP_BUNDLE_ID }}'","appium:newCommandTimeout":300,"appium:noReset":true,"appium:forceAppLaunch":false,"appium:usePrebuiltWDA":true,"appium:wdaLocalPort":8100,"appium:showIOSLog":true,"appium:realDeviceLogger":"/usr/local/lib/node_modules/appium/node_modules/deviceconsole/deviceconsole"}],logLevel:"debug",waitforTimeout:120000,connectionRetryTimeout:30000,connectionRetryCount:3,services:[],framework:"mocha",reporters:["spec"],mochaOpts:{ui:"bdd",timeout:1800000,grep:"__MOCHA_GREP__"},before:async function(capabilities,specs,browser){const BUNDLE_ID="'${{ env.APP_BUNDLE_ID }}'";const TEST_FILTER="__TEST_FILTER__";const QVAC_PERF_RUNS_VALUE="__QVAC_PERF_RUNS__";const QVAC_PERF_WARMUP_RUNS_VALUE="__QVAC_PERF_WARMUP_RUNS__";global.appCrashed=false;global.flushBareLog=async function(reason){try{var _h=require("http");var lb64=await new Promise(function(ok,fail){var bd=JSON.stringify({path:"@"+BUNDLE_ID+":documents/bare_console.log"});var rq=_h.request({hostname:"127.0.0.1",port:4723,path:"/wd/hub/session/"+browser.sessionId+"/appium/device/pull_file",method:"POST",headers:{"Content-Type":"application/json","Content-Length":Buffer.byteLength(bd)}},function(rs){var d="";rs.on("data",function(c){d+=c;});rs.on("end",function(){try{ok(JSON.parse(d).value);}catch(e){fail(e);}});});rq.on("error",fail);rq.write(bd);rq.end();});var logTxt=Buffer.from(lb64,"base64").toString();var logDir=process.env.DEVICEFARM_LOG_DIR||".";require("fs").writeFileSync(logDir+"/bare_console.log",logTxt);console.log("[bare-log] "+reason+" flush ok ("+logTxt.length+" bytes)");}catch(e){console.log("[bare-log] "+reason+" flush failed: "+e.message);}};global.checkAppCrash=async(stage)=>{try{const state=await 
browser.queryAppState(BUNDLE_ID);console.log("["+stage+"] App state: "+state+" (4=foreground,3=background,1=not running)");if(state<3){console.error("\\n🛑 APP CRASHED at "+stage+"! State="+state);console.error("Check device logs for BareKit/native errors.\\n");global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-"+stage),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}return state;}catch(e){console.log("["+stage+"] queryAppState error: "+e.message);return-1;}};console.log("Checking initial app state...");await global.checkAppCrash("startup");console.log("Waiting for app to initialize...");await browser.pause(5000);await global.checkAppCrash("after-pause");const initText=await browser.$("-ios predicate string:label CONTAINS \"INITIALIZED\"");await initText.waitForDisplayed({timeout:60000});await global.checkAppCrash("after-init");if(TEST_FILTER!=="__TEST_FILTER__"){try{const b64=Buffer.from(TEST_FILTER).toString("base64");await browser.pushFile("@"+BUNDLE_ID+":documents/testFilter.txt",b64);console.log("Pushed test filter: "+TEST_FILTER);}catch(e){console.log("pushFile failed: "+e.message);}}/* Skip the perf push when the placeholders were never substituted (config used without make_split); only the placeholder prefix is compared so the sed substitutions cannot rewrite this guard. */if(QVAC_PERF_RUNS_VALUE.indexOf("__QVAC_PERF_")!==0&&QVAC_PERF_WARMUP_RUNS_VALUE.indexOf("__QVAC_PERF_")!==0&&(QVAC_PERF_RUNS_VALUE.length>0||QVAC_PERF_WARMUP_RUNS_VALUE.length>0)){try{var perfCfg="QVAC_PERF_RUNS="+QVAC_PERF_RUNS_VALUE+"\\nQVAC_PERF_WARMUP_RUNS="+QVAC_PERF_WARMUP_RUNS_VALUE+"\\n";var pcb64=Buffer.from(perfCfg).toString("base64");await browser.pushFile("@"+BUNDLE_ID+":documents/qvacPerfConfig.txt",pcb64);console.log("Pushed perf config: "+perfCfg.replace(/\\n/g," "));}catch(e){console.log("perfConfig pushFile failed: "+e.message);}}console.log("App initialized, clicking Run Automated Tests...");const button=await browser.$("-ios predicate string:label CONTAINS \"Run Automated Tests\"");await button.waitForDisplayed({timeout:15000});await button.click();console.log("Button clicked!");await 
browser.pause(5000);await global.checkAppCrash("after-click");global.crashMonitor=setInterval(async()=>{if(global.appCrashed)return;try{const s=await browser.queryAppState(BUNDLE_ID);if(s<3){console.error("\\n🛑 BACKGROUND CRASH DETECTED! App state="+s);global.appCrashed=true;setTimeout(function(){process.exit(1);},5000);try{await browser.pause(1500);await Promise.race([global.flushBareLog("crash-bg"),new Promise(function(_,rj){setTimeout(function(){rj(new Error("bare-log flush timed out"));},3000);})]);}catch(_){}}}catch(e){}},15000);},after:async function(){if(global.crashMonitor)clearInterval(global.crashMonitor);console.log("[bare-log] Waiting for log flush...");await browser.pause(3000);if(global.flushBareLog)await global.flushBareLog("after");},afterTest:async function(test,context,{error}){if(global.appCrashed)return;await global.checkAppCrash("after-test:"+test.title);}};'
           fi
           
           WDIO_B64=$(echo "$WDIO_CONFIG" | base64 | tr -d '\n')
@@ -1063,11 +1084,17 @@ jobs:
           # For each split: inject mocha grep AND replace __TEST_FILTER__ so
           # the before-hook pushes a testFilter.txt the app reads at runtime.
           # This ensures the app only executes matching tests (real splitting).
+          # When set, the perf inputs also get substituted so the before-hook pushes
+          # a qvacPerfConfig.txt; digits only, so sed/JS quoting cannot be broken.
+          QVAC_PERF_RUNS_INPUT="${{ inputs.qvac_perf_runs }}"; case "$QVAC_PERF_RUNS_INPUT" in *[!0-9]*) echo "::error::qvac_perf_runs must be empty or a non-negative integer"; exit 1 ;; esac
+          QVAC_PERF_WARMUP_RUNS_INPUT="${{ inputs.qvac_perf_warmup_runs }}"; case "$QVAC_PERF_WARMUP_RUNS_INPUT" in *[!0-9]*) echo "::error::qvac_perf_warmup_runs must be empty or a non-negative integer"; exit 1 ;; esac
           make_split() {
             local pattern="$1" output="$2"
             local cfg
             cfg=$(echo "$WDIO_CONFIG" | sed "s#__MOCHA_GREP__#$pattern#")
             cfg=$(echo "$cfg" | sed "s#__TEST_FILTER__#$pattern#")
+            cfg=$(echo "$cfg" | sed "s#__QVAC_PERF_RUNS__#$QVAC_PERF_RUNS_INPUT#")
+            cfg=$(echo "$cfg" | sed "s#__QVAC_PERF_WARMUP_RUNS__#$QVAC_PERF_WARMUP_RUNS_INPUT#")
             local b64
             b64=$(echo "$cfg" | base64 | tr -d '\n')
             generate_spec "$output" "$b64"

From 9cdf07be453dea3c150224b29ba0cbc6c130c440 Mon Sep 17 00:00:00 2001
From: Oluwatobi Adelegan <tobiadelegan70@gmail.com>
Date: Thu, 30 Apr 2026 19:00:25 +0100
Subject: [PATCH 5/5] QVAC-18111 chore[notask]: add run_mobile toggle to
 benchmark workflow_dispatch

Made-with: Cursor
---
 ...chmark-performance-qvac-lib-infer-llamacpp-llm.yml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
index e3072dfa5e..9abc517c23 100644
--- a/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
+++ b/.github/workflows/benchmark-performance-qvac-lib-infer-llamacpp-llm.yml
@@ -31,6 +31,11 @@ on:
         required: false
         type: boolean
         default: true
+      run_mobile:
+        description: "Run mobile matrix (Android / iOS via Device Farm)"
+        required: false
+        type: boolean
+        default: true
 
 permissions:
   contents: read
@@ -87,6 +92,7 @@ jobs:
 
   mobile-benchmarks:
     needs: [context, prebuild]
+    if: ${{ inputs.run_mobile }}
     permissions:
       contents: read
       packages: read
@@ -101,8 +107,11 @@ jobs:
       qvac_perf_warmup_runs: ${{ inputs.qvac_perf_warmup_runs }}
 
   summarize:
+    # `always()` lets summarize run even when one of the benchmark jobs was
+    # skipped via the run_desktop / run_mobile toggles or failed mid-run, so we
+    # still get the partial report; it is skipped only if `context` did not succeed.
     needs: [context, desktop-benchmarks, mobile-benchmarks]
-    if: always()
+    if: ${{ always() && needs.context.result == 'success' }}
     runs-on: ubuntu-latest
     timeout-minutes: 10
     permissions: