diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
new file mode 100644
index 000000000..aa16f3c6b
--- /dev/null
+++ b/.github/workflows/benchmark.yaml
@@ -0,0 +1,166 @@
+name: benchmark
+
+# Runs the benchmark matrix on every PR targeting main, on pushes to main
+# (baseline refresh), and on manual dispatch. The job compares PR ops/s
+# against the latest main baseline and flags >5% throughput regressions
+# and >10% memory regressions.
+#
+# Storage model (see benchmarks/baselines/README.md):
+#   - Artifacts hold the authoritative JSON for trend history (90-day
+#     retention for PRs, 365 days for the main baseline).
+#   - `benchmarks/baselines/main.json` is an in-repo quick-reference copy
+#     that gets refreshed by a follow-up chore PR after a merge to main.
+
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - "packages/protobuf/**"
+      - "benchmarks/**"
+      - ".github/workflows/benchmark.yaml"
+  push:
+    branches: [main]
+    paths:
+      - "packages/protobuf/**"
+      - "benchmarks/**"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  pull-requests: write
+
+env:
+  DO_NOT_TRACK: 1
+  NODE_VERSION: "22"
+
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  bench-matrix:
+    name: bench-matrix (${{ github.event_name }})
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          # Enough history that we can also check out the base branch for
+          # the baseline comparison pass if the artifact download fails.
+          fetch-depth: 0
+
+      - name: Setup Node ${{ env.NODE_VERSION }}
+        uses: actions/setup-node@v6
+        with:
+          node-version: ${{ env.NODE_VERSION }}
+          cache: "npm"
+
+      - name: Install
+        run: npm ci --ignore-scripts
+        env:
+          HUSKY: 0
+
+      - name: Build @bufbuild/protobuf
+        run: npx turbo run build --filter=@bufbuild/protobuf
+
+      - name: Generate benchmark code (proto + pbjs)
+        run: npx turbo run generate --filter=@bufbuild/protobuf-benchmarks
+
+      - name: Run benchmark matrix
+        working-directory: benchmarks
+        run: bash scripts/run-matrix-ci.sh bench-results.json
+
+      - name: Upload run artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-results-${{ github.event.pull_request.number || github.sha }}
+          path: benchmarks/bench-results.json
+          retention-days: 90
+
+      # ------------------------------------------------------------------
+      # Baseline acquisition (PR only). For `push` to main, the push run
+      # becomes the new baseline — the artifact upload below is sufficient.
+      # ------------------------------------------------------------------
+
+      - name: Download latest main baseline artifact
+        if: github.event_name == 'pull_request'
+        id: dl-baseline
+        uses: dawidd6/action-download-artifact@v6
+        continue-on-error: true
+        with:
+          workflow: benchmark.yaml
+          branch: main
+          name: bench-baseline-main
+          path: benchmarks/baseline-download
+          search_artifacts: true
+          if_no_artifact_found: warn
+
+      - name: Resolve baseline source
+        if: github.event_name == 'pull_request'
+        run: |
+          set -euo pipefail
+          if [[ -f benchmarks/baseline-download/bench-results.json ]]; then
+            cp benchmarks/baseline-download/bench-results.json benchmarks/baseline-results.json
+            echo "Using downloaded main artifact as baseline."
+          elif [[ -f benchmarks/baselines/main.json ]]; then
+            cp benchmarks/baselines/main.json benchmarks/baseline-results.json
+            echo "Using in-repo benchmarks/baselines/main.json as baseline."
+          else
+            echo "No baseline available — compare step will emit an informational report."
+          fi
+
+      - name: Compare PR against baseline
+        if: github.event_name == 'pull_request'
+        id: compare
+        working-directory: benchmarks
+        run: |
+          set -euo pipefail
+          if [[ -f baseline-results.json ]]; then
+            npx tsx scripts/compare-results.ts \
+              --baseline=baseline-results.json \
+              --current=bench-results.json \
+              --output=bench-report.md \
+              --threshold-ops=5 \
+              --threshold-mem=10
+          else
+            npx tsx scripts/compare-results.ts \
+              --current=bench-results.json \
+              --output=bench-report.md \
+              --threshold-ops=5 \
+              --threshold-mem=10 \
+              --no-baseline
+          fi
+          if grep -q "REGRESSION" bench-report.md 2>/dev/null; then
+            echo "status=regression" >> "$GITHUB_OUTPUT"
+          else
+            echo "status=ok" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Comment report on PR
+        if: github.event_name == 'pull_request'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: benchmark-matrix
+          path: benchmarks/bench-report.md
+
+      - name: Flag regression annotation
+        if: github.event_name == 'pull_request' && steps.compare.outputs.status == 'regression'
+        run: |
+          echo "::warning::Benchmark matrix flagged a regression. See the PR comment for the full table."
+
+      # ------------------------------------------------------------------
+      # Baseline refresh (push-to-main only). The push run becomes the new
+      # authoritative baseline and gets uploaded as a stable-named artifact
+      # so subsequent PR jobs can pull it.
+      # ------------------------------------------------------------------
+
+      - name: Upload baseline artifact (main only)
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-baseline-main
+          path: benchmarks/bench-results.json
+          retention-days: 365
+          overwrite: true
diff --git a/benchmarks/baselines/README.md b/benchmarks/baselines/README.md
new file mode 100644
index 000000000..02bee96d7
--- /dev/null
+++ b/benchmarks/baselines/README.md
@@ -0,0 +1,59 @@
+# Benchmark Baselines
+
+This directory holds **quick-reference baselines** — last-known-good throughput numbers for the matrix, committed alongside code so a local developer can run `scripts/compare-results.ts` without having to hit GitHub.
+
+## Storage model
+
+The authoritative baseline is an **Actions artifact**, not a file in the repo:
+
+| Where | Role | Retention |
+|----------------------------------------------|---------------------------------------|------------------|
+| Artifact `bench-baseline-main` | Source of truth for CI diffs | 365 days |
+| `benchmarks/baselines/main.json` | Quick-reference for local dev + fallback when the artifact API is unreachable | tracked in git |
+| Artifact `bench-results-<PR number>` | Historical trend per PR | 90 days |
+
+### Why two stores
+
+1. **Artifacts give trend history for free.** Downloading the `bench-baseline-main` artifact from any past run reconstructs the baseline of that day. No file churn in git.
+2. **A committed fallback de-risks the artifact dependency.** If GitHub artifact downloads rate-limit or the action times out, CI falls back to `main.json` so PRs are never blocked on infrastructure. The file does not have to be fresh to be useful — being roughly right on order of magnitude is enough to flag a regression that a human can investigate.
+3. **Local dev needs a zero-network path.** `npm run bench:matrix:ci` followed by `npm run bench:matrix:compare -- --baseline=baselines/main.json --current=bench-results.json` works entirely offline.
+
+## Update procedure
+
+`main.json` is refreshed **by hand via a one-line PR** after every merge to `main` whose benchmark numbers moved materially (>5% on any row). A follow-up iteration will automate this via a `benchmark-baseline-refresh` workflow that opens the PR from the push-to-main run, but until that lands, manual refresh is the policy.
+
+```bash
+# After a merge to main, pull the latest artifact:
+gh run download --name bench-baseline-main --dir /tmp/baseline
+cp /tmp/baseline/bench-results.json benchmarks/baselines/main.json
+
+# Commit on a chore/ branch and open a PR:
+git checkout -b chore/refresh-benchmark-baseline
+git add benchmarks/baselines/main.json
+git commit -m "chore(benchmarks): refresh main baseline"
+git push -u origin HEAD
+gh pr create --title "chore(benchmarks): refresh main baseline" \
+  --body "Auto-refresh from the bench-baseline-main CI artifact."
+```
+
+## Format
+
+Every `*.json` in this directory is the structured payload written by `bench-matrix.ts` (last line of its stdout when run standalone, or the full file when run via `scripts/run-matrix-ci.sh`):
+
+```json
+{
+  "node": "v22.11.0",
+  "platform": "linux/x64",
+  "timestamp": "2026-04-19T18:00:00.000Z",
+  "results": [
+    {
+      "name": "SimpleMessage :: toBinary (pre-built, 19 B)",
+      "opsPerSec": 1065000,
+      "rme": 1.3,
+      "samples": 512
+    }
+  ]
+}
+```
+
+Field names are stable; additional fields are additive (e.g. a future `bytesPerOp` for memory tracking will not break existing consumers).
diff --git a/benchmarks/package.json b/benchmarks/package.json
index 55efeb1f6..bbff26b13 100644
--- a/benchmarks/package.json
+++ b/benchmarks/package.json
@@ -11,6 +11,8 @@
     "bench:fromBinary": "tsx src/bench-fromBinary.ts",
     "bench:comparison": "tsx src/bench-comparison-protobufjs.ts",
     "bench:matrix": "tsx src/bench-matrix.ts",
+    "bench:matrix:ci": "bash scripts/run-matrix-ci.sh bench-results.json",
+    "bench:matrix:compare": "tsx scripts/compare-results.ts",
     "bench:memory": "node --expose-gc --import tsx src/bench-memory.ts",
     "bench:report": "tsx src/report.ts",
     "build": "../node_modules/typescript/bin/tsc --noEmit",
diff --git a/benchmarks/scripts/compare-results.ts b/benchmarks/scripts/compare-results.ts
new file mode 100644
index 000000000..ae9d4fbb5
--- /dev/null
+++ b/benchmarks/scripts/compare-results.ts
@@ -0,0 +1,286 @@
+// Copyright 2021-2026 Buf Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compare-results.ts — diff two bench-matrix JSON dumps and emit a
+// Markdown regression table. Input is the structured payload written by
+// src/bench-matrix.ts (the last `=== Matrix JSON ===` block when run
+// standalone; run-matrix-ci.sh strips everything except the JSON object
+// so this script can fs.readFileSync() it directly).
+//
+// Thresholds are configurable on the CLI so CI can tune leniency without
+// touching this file. Defaults match the Phase 2 contract: 5% throughput,
+// 10% memory.
+
+import { readFileSync, writeFileSync } from "node:fs";
+import { argv, exit } from "node:process";
+
+interface ResultRow {
+  name: string;
+  opsPerSec: number;
+  rme: number;
+  samples: number;
+  /**
+   * Optional — bench-matrix currently does not emit per-op memory. The
+   * field is wired here so a follow-up fixture addition is a one-line
+   * change in the runner, not a script rewrite.
+   */
+  bytesPerOp?: number;
+}
+
+interface BenchPayload {
+  node: string;
+  platform: string;
+  timestamp: string;
+  results: ResultRow[];
+}
+
+interface Options {
+  baseline: string | null;
+  current: string;
+  output: string;
+  thresholdOps: number;
+  thresholdMem: number;
+  noBaseline: boolean;
+}
+
+function parseArgs(): Options {
+  const opts: Options = {
+    baseline: null,
+    current: "bench-results.json",
+    output: "bench-report.md",
+    thresholdOps: 5,
+    thresholdMem: 10,
+    noBaseline: false,
+  };
+  for (const arg of argv.slice(2)) {
+    if (arg.startsWith("--baseline=")) opts.baseline = arg.slice("--baseline=".length);
+    else if (arg.startsWith("--current=")) opts.current = arg.slice("--current=".length);
+    else if (arg.startsWith("--output=")) opts.output = arg.slice("--output=".length);
+    else if (arg.startsWith("--threshold-ops=")) opts.thresholdOps = Number(arg.slice("--threshold-ops=".length));
+    else if (arg.startsWith("--threshold-mem=")) opts.thresholdMem = Number(arg.slice("--threshold-mem=".length));
+    else if (arg === "--no-baseline") opts.noBaseline = true;
+    else if (arg === "--help" || arg === "-h") {
+      printUsage();
+      exit(0);
+    } else {
+      console.error(`Unknown argument: ${arg}`);
+      printUsage();
+      exit(2);
+    }
+  }
+  return opts;
+}
+
+function printUsage(): void {
+  console.error(`Usage: compare-results.ts [options]
+  --baseline=PATH      Path to baseline JSON (omit for --no-baseline)
+  --current=PATH       Path to current run JSON (default bench-results.json)
+  --output=PATH        Markdown report path (default bench-report.md)
+  --threshold-ops=N    Throughput regression threshold, % (default 5)
+  --threshold-mem=N    Memory regression threshold, % (default 10)
+  --no-baseline        Emit current-only report (first run on a fork)`);
+}
+
+function loadPayload(path: string): BenchPayload {
+  const raw = readFileSync(path, "utf8").trim();
+  // run-matrix-ci.sh produces a pure JSON file. When a developer points
+  // the script at a raw bench-matrix stdout dump, tolerate the "=== Matrix
+  // JSON ===" sentinel by locating the last top-level '{' in the text.
+  if (raw.startsWith("{")) {
+    return JSON.parse(raw) as BenchPayload;
+  }
+  const jsonStart = raw.lastIndexOf("\n{");
+  if (jsonStart === -1) {
+    throw new Error(`compare-results: no JSON payload found in ${path}`);
+  }
+  return JSON.parse(raw.slice(jsonStart + 1)) as BenchPayload;
+}
+
+function fmtOps(n: number): string {
+  if (!Number.isFinite(n) || n === 0) return "–";
+  if (n >= 1000) return n.toLocaleString("en-US", { maximumFractionDigits: 0 });
+  return n.toFixed(1);
+}
+
+function fmtBytes(n: number | undefined): string {
+  if (n === undefined || !Number.isFinite(n)) return "–";
+  return n.toLocaleString("en-US", { maximumFractionDigits: 0 });
+}
+
+function fmtDelta(pct: number): string {
+  if (!Number.isFinite(pct)) return "–";
+  const sign = pct >= 0 ? "+" : "";
+  return `${sign}${pct.toFixed(1)}%`;
+}
+
+interface CompareRow {
+  name: string;
+  baselineOps: number | null;
+  currentOps: number;
+  opsDeltaPct: number | null;
+  baselineMem: number | null;
+  currentMem: number | null;
+  memDeltaPct: number | null;
+  status: "ok" | "improved" | "regression" | "new";
+}
+
+function compare(
+  baseline: BenchPayload | null,
+  current: BenchPayload,
+  thresholdOps: number,
+  thresholdMem: number,
+): CompareRow[] {
+  const baseMap = new Map<string, ResultRow>();
+  if (baseline) {
+    for (const r of baseline.results) baseMap.set(r.name, r);
+  }
+
+  const rows: CompareRow[] = [];
+  for (const cur of current.results) {
+    const base = baseMap.get(cur.name);
+    let opsDelta: number | null = null;
+    let memDelta: number | null = null;
+    if (base && base.opsPerSec > 0) {
+      opsDelta = ((cur.opsPerSec - base.opsPerSec) / base.opsPerSec) * 100;
+    }
+    if (
+      base?.bytesPerOp !== undefined &&
+      cur.bytesPerOp !== undefined &&
+      base.bytesPerOp > 0
+    ) {
+      memDelta = ((cur.bytesPerOp - base.bytesPerOp) / base.bytesPerOp) * 100;
+    }
+
+    let status: CompareRow["status"] = "ok";
+    if (!base) {
+      status = "new";
+    } else if (opsDelta !== null && opsDelta <= -thresholdOps) {
+      status = "regression";
+    } else if (memDelta !== null && memDelta >= thresholdMem) {
+      status = "regression";
+    } else if (opsDelta !== null && opsDelta >= thresholdOps) {
+      status = "improved";
+    }
+
+    rows.push({
+      name: cur.name,
+      baselineOps: base?.opsPerSec ?? null,
+      currentOps: cur.opsPerSec,
+      opsDeltaPct: opsDelta,
+      baselineMem: base?.bytesPerOp ?? null,
+      currentMem: cur.bytesPerOp ?? null,
+      memDeltaPct: memDelta,
+      status,
+    });
+  }
+  return rows;
+}
+
+function statusBadge(s: CompareRow["status"]): string {
+  switch (s) {
+    case "regression":
+      return "REGRESSION";
+    case "improved":
+      return "improved";
+    case "new":
+      return "new";
+    case "ok":
+      return "ok";
+  }
+}
+
+function renderMarkdown(
+  rows: CompareRow[],
+  opts: {
+    baseline: BenchPayload | null;
+    current: BenchPayload;
+    thresholdOps: number;
+    thresholdMem: number;
+  },
+): string {
+  const regressionCount = rows.filter((r) => r.status === "regression").length;
+  const improvedCount = rows.filter((r) => r.status === "improved").length;
+  const newCount = rows.filter((r) => r.status === "new").length;
+  const unchangedCount =
+    rows.length - regressionCount - improvedCount - newCount;
+
+  const summaryTitle =
+    regressionCount > 0
+      ? `Benchmark: ${regressionCount} regression(s)`
+      : "Benchmark: no regressions";
+
+  const out: string[] = [];
+  out.push(`## ${summaryTitle}`);
+  out.push("");
+  out.push(
+    `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
+  );
+  if (opts.baseline) {
+    out.push(
+      `Baseline captured \`${opts.baseline.timestamp}\` on \`${opts.baseline.platform}\`, Node \`${opts.baseline.node}\`.`,
+    );
+  } else {
+    out.push("No baseline available — this report is informational only.");
+  }
+  out.push("");
+  out.push(
+    `Summary: \`${regressionCount}\` regressed, \`${improvedCount}\` improved, \`${newCount}\` new, \`${unchangedCount}\` unchanged.`,
+  );
+  out.push("");
+  out.push(
+    "| Fixture | Baseline ops/s | PR ops/s | Δ ops | Baseline B/op | PR B/op | Δ mem | Status |",
+  );
+  out.push(
+    "|---------|---------------:|---------:|------:|--------------:|--------:|------:|:-------|",
+  );
+  for (const r of rows) {
+    out.push(
+      `| ${r.name} | ${r.baselineOps === null ? "–" : fmtOps(r.baselineOps)} | ${fmtOps(r.currentOps)} | ${r.opsDeltaPct === null ? "–" : fmtDelta(r.opsDeltaPct)} | ${fmtBytes(r.baselineMem ?? undefined)} | ${fmtBytes(r.currentMem ?? undefined)} | ${r.memDeltaPct === null ? "–" : fmtDelta(r.memDeltaPct)} | ${statusBadge(r.status)} |`,
    );
+  }
+  out.push("");
+  out.push(
+    "_Produced by `benchmarks/scripts/compare-results.ts`. Artifacts: `bench-results-<PR number>` (current), `bench-baseline-main` (baseline)._",
+  );
+  out.push("");
+  return out.join("\n");
+}
+
+function main(): void {
+  const opts = parseArgs();
+  const current = loadPayload(opts.current);
+  const baseline =
+    opts.noBaseline || !opts.baseline ? null : loadPayload(opts.baseline);
+  const rows = compare(baseline, current, opts.thresholdOps, opts.thresholdMem);
+  const md = renderMarkdown(rows, {
+    baseline,
+    current,
+    thresholdOps: opts.thresholdOps,
+    thresholdMem: opts.thresholdMem,
+  });
+  writeFileSync(opts.output, md, "utf8");
+  // Print to stdout for local/dev runs so developers don't have to open
+  // the file to see the result.
+  console.log(md);
+  const hasRegression = rows.some((r) => r.status === "regression");
+  // Exit 0 even on regression — the workflow surfaces the flag via the
+  // PR comment + a ::warning:: annotation. Hard-failing the job would
+  // block legitimate PRs that intentionally trade throughput for another
+  // gain (e.g. bundle size). Tech-lead can promote to hard-fail later.
+  if (hasRegression) {
+    console.error("compare-results: REGRESSION flagged (non-fatal).");
+  }
+}
+
+main();
diff --git a/benchmarks/scripts/run-matrix-ci.sh b/benchmarks/scripts/run-matrix-ci.sh
new file mode 100755
index 000000000..f5ffc3c62
--- /dev/null
+++ b/benchmarks/scripts/run-matrix-ci.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Copyright 2021-2026 Buf Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# run-matrix-ci.sh — stable wrapper around `tsx src/bench-matrix.ts`.
+#
+# Intent
+# ------
+# bench-matrix.ts's default knobs (1000 ms measurement, 200 ms warmup) are
+# tuned for local development — fast feedback, noisy numbers. CI needs the
+# opposite: longer warmup so the V8 tier-up settles, longer measurement so
+# RME shrinks, and a clean stdout stream that contains only the JSON
+# payload so compare-results.ts can read it with fs.readFileSync.
+#
+# This wrapper:
+#   1. Logs the host profile (Node version, CPU, RAM) for trace records.
+#   2. Does a throwaway warmup run of the matrix so JIT + ICs are warm on
+#      the main benchmark functions.
+#   3. Runs the real matrix with CI-sized time budgets.
+#   4. Extracts the last JSON object from stdout and writes it to the
+#      caller-specified output file.
+#
+# Usage: benchmarks/scripts/run-matrix-ci.sh [output.json]
+#        defaults to bench-results.json in the current working directory.
+
+set -euo pipefail
+
+out="${1:-bench-results.json}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCH_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+cd "${BENCH_DIR}"
+
+# -------- 1. Host profile (trace only, never fails the job) --------
+echo "::group::Host profile"
+echo "node: $(node --version)"
+echo "platform: $(uname -srm)"
+if command -v nproc >/dev/null 2>&1; then
+  echo "cpus: $(nproc)"
+fi
+if [[ -r /proc/meminfo ]]; then
+  echo "mem: $(awk '/MemTotal/ {printf "%.1f GB", $2/1024/1024}' /proc/meminfo)"
+fi
+if command -v lscpu >/dev/null 2>&1; then
+  lscpu | grep -E "Model name|CPU MHz|CPU max MHz" || true
+fi
+echo "::endgroup::"
+
+# -------- 2. Warmup pass (discarded) --------
+# NOTE: the pass's measurement time and its own warmup are independent
+# knobs; driving both from one env var would force them equal.
+echo "::group::Warmup"
+BENCH_MATRIX_TIME="${BENCH_MATRIX_WARMUP_PASS_TIME:-500}" \
+BENCH_MATRIX_WARMUP="${BENCH_MATRIX_WARMUP_PASS_WARMUP:-200}" \
+  npx tsx src/bench-matrix.ts >/dev/null 2>&1 || true
+echo "Warmup complete."
+echo "::endgroup::"
+
+# -------- 3. Measurement pass --------
+echo "::group::Measurement"
+BENCH_MATRIX_TIME="${BENCH_MATRIX_CI_TIME:-3000}" \
+BENCH_MATRIX_WARMUP="${BENCH_MATRIX_CI_WARMUP:-1000}" \
+  npx tsx src/bench-matrix.ts | tee ".bench-stdout.log"
+echo "::endgroup::"
+
+# -------- 4. Extract JSON payload --------
+# bench-matrix.ts prints human-readable tables and then one line of
+# `=== Matrix JSON ===` followed by a single-line JSON object. Grab the
+# last line that starts with '{' as the payload.
+node -e '
+const fs = require("node:fs");
+const out = process.argv[1];
+const lines = fs.readFileSync(".bench-stdout.log", "utf8").split("\n");
+let payload = null;
+for (let i = lines.length - 1; i >= 0; i--) {
+  const ln = lines[i].trim();
+  if (ln.startsWith("{") && ln.endsWith("}")) {
+    try { payload = JSON.parse(ln); break; } catch { /* keep looking */ }
+  }
+}
+if (!payload) {
+  console.error("run-matrix-ci: could not locate JSON payload in bench-matrix output.");
+  process.exit(1);
+}
+fs.writeFileSync(out, JSON.stringify(payload, null, 2) + "\n");
+console.error(`run-matrix-ci: wrote ${payload.results.length} result rows to ${out}.`);
+' "$out"
+
+rm -f .bench-stdout.log
diff --git a/benchmarks/src/bench-matrix.ts b/benchmarks/src/bench-matrix.ts
index 6266d6d78..0f1e22f45 100644
--- a/benchmarks/src/bench-matrix.ts
+++ b/benchmarks/src/bench-matrix.ts
@@ -154,7 +154,12 @@ const cases: MatrixCase[] = [
 ];
 
 export async function runMatrixBench() {
-  const bench = new Bench({ time: 1000, warmupTime: 200 });
+  // Time budgets: defaults tuned for local dev feedback. CI sets
+  // BENCH_MATRIX_TIME / BENCH_MATRIX_WARMUP via run-matrix-ci.sh to get
+  // tighter RME on long-running hosts. Values are ms.
+  const time = Number(process.env.BENCH_MATRIX_TIME) || 1000;
+  const warmupTime = Number(process.env.BENCH_MATRIX_WARMUP) || 200;
+  const bench = new Bench({ time, warmupTime });
   // Pre-build every message + pre-encoded bytes outside the measurement
   // loop so the encoder/decoder benchmarks reflect the encode/decode walk