diff --git a/benchmarks/baselines/main.json b/benchmarks/baselines/main.json
new file mode 100644
index 000000000..5a159517d
--- /dev/null
+++ b/benchmarks/baselines/main.json
@@ -0,0 +1,127 @@
+{
+  "node": "v25.8.1",
+  "platform": "linux/x64",
+  "timestamp": "2026-04-20T14:03:09.404Z",
+  "results": [
+    {
+      "name": "SimpleMessage :: toBinary (pre-built, 19 B)",
+      "opsPerSec": 720572.6467325875,
+      "rme": 4.586523984485405,
+      "samples": 584677
+    },
+    {
+      "name": "ExportTraceRequest (100 spans) :: toBinary (pre-built, 32926 B)",
+      "opsPerSec": 942.728512279284,
+      "rme": 5.20843148311372,
+      "samples": 905
+    },
+    {
+      "name": "ExportMetricsRequest (50 series) :: toBinary (pre-built, 17696 B)",
+      "opsPerSec": 1690.2655826286807,
+      "rme": 1.2287491782968762,
+      "samples": 1663
+    },
+    {
+      "name": "ExportLogsRequest (100 records) :: toBinary (pre-built, 21319 B)",
+      "opsPerSec": 1769.099144200511,
+      "rme": 0.7934304535540673,
+      "samples": 1741
+    },
+    {
+      "name": "K8sPodList (20 pods) :: toBinary (pre-built, 28900 B)",
+      "opsPerSec": 1761.1402301413486,
+      "rme": 3.1648732429277913,
+      "samples": 1662
+    },
+    {
+      "name": "GraphQLRequest :: toBinary (pre-built, 624 B)",
+      "opsPerSec": 145455.54923227328,
+      "rme": 1.939681764666347,
+      "samples": 136959
+    },
+    {
+      "name": "GraphQLResponse :: toBinary (pre-built, 1366 B)",
+      "opsPerSec": 207739.65617123304,
+      "rme": 7.000451540273928,
+      "samples": 166418
+    },
+    {
+      "name": "RpcRequest :: toBinary (pre-built, 501 B)",
+      "opsPerSec": 250839.90037953644,
+      "rme": 2.3455244711213687,
+      "samples": 233170
+    },
+    {
+      "name": "RpcResponse :: toBinary (pre-built, 602 B)",
+      "opsPerSec": 393930.1377351528,
+      "rme": 3.2692980105549525,
+      "samples": 344566
+    },
+    {
+      "name": "StressMessage (depth=8, width=200) :: toBinary (pre-built, 12868 B)",
+      "opsPerSec": 6329.2079657042395,
+      "rme": 1.2442163953887613,
+      "samples": 6167
+    },
+    {
+      "name": "SimpleMessage :: fromBinary (19 B)",
+      "opsPerSec": 884713.3936586891,
+      "rme": 0.16714493485586263,
+      "samples": 859830
+    },
+    {
+      "name": "ExportTraceRequest (100 spans) :: fromBinary (32926 B)",
+      "opsPerSec": 545.5350076541376,
+      "rme": 1.202558278680317,
+      "samples": 538
+    },
+    {
+      "name": "ExportMetricsRequest (50 series) :: fromBinary (17696 B)",
+      "opsPerSec": 888.3503061250073,
+      "rme": 1.2969980310240832,
+      "samples": 864
+    },
+    {
+      "name": "ExportLogsRequest (100 records) :: fromBinary (21319 B)",
+      "opsPerSec": 882.9676497121711,
+      "rme": 1.2257471502567474,
+      "samples": 862
+    },
+    {
+      "name": "K8sPodList (20 pods) :: fromBinary (28900 B)",
+      "opsPerSec": 1007.335044928595,
+      "rme": 0.8788453963613843,
+      "samples": 993
+    },
+    {
+      "name": "GraphQLRequest :: fromBinary (624 B)",
+      "opsPerSec": 221605.2437830516,
+      "rme": 0.3787980957928544,
+      "samples": 211417
+    },
+    {
+      "name": "GraphQLResponse :: fromBinary (1366 B)",
+      "opsPerSec": 205348.95710481095,
+      "rme": 1.6772806334555086,
+      "samples": 190009
+    },
+    {
+      "name": "RpcRequest :: fromBinary (501 B)",
+      "opsPerSec": 208822.5527629229,
+      "rme": 0.34718286675914534,
+      "samples": 204013
+    },
+    {
+      "name": "RpcResponse :: fromBinary (602 B)",
+      "opsPerSec": 295797.20225724357,
+      "rme": 2.029660371713425,
+      "samples": 281191
+    },
+    {
+      "name": "StressMessage (depth=8, width=200) :: fromBinary (12868 B)",
+      "opsPerSec": 2874.297533564573,
+      "rme": 1.0153087164867896,
+      "samples": 2797
+    }
+  ]
+}
diff --git a/benchmarks/scripts/compare-results.ts b/benchmarks/scripts/compare-results.ts
index ae9d4fbb5..a40c9308d 100644
--- a/benchmarks/scripts/compare-results.ts
+++ b/benchmarks/scripts/compare-results.ts
@@ -67,8 +67,10 @@ function parseArgs(): Options {
     if (arg.startsWith("--baseline=")) opts.baseline = arg.slice(11);
     else if (arg.startsWith("--current=")) opts.current = arg.slice(10);
     else if (arg.startsWith("--output=")) opts.output = arg.slice(9);
-    else if (arg.startsWith("--threshold-ops=")) opts.thresholdOps = Number(arg.slice(16));
-    else if (arg.startsWith("--threshold-mem=")) opts.thresholdMem = Number(arg.slice(16));
+    else if (arg.startsWith("--threshold-ops="))
+      opts.thresholdOps = Number(arg.slice(16));
+    else if (arg.startsWith("--threshold-mem="))
+      opts.thresholdMem = Number(arg.slice(16));
     else if (arg === "--no-baseline") opts.noBaseline = true;
     else if (arg === "--help" || arg === "-h") {
       printUsage();
@@ -135,6 +137,12 @@ interface CompareRow {
   status: "ok" | "improved" | "regression" | "new";
 }
 
+// Flat thresholds (ops %, memory %). Variance on CI runners is now
+// controlled upstream in run-matrix-ci.sh via `taskset -c 0` CPU pinning
+// + median-of-5 runs; see analysis/benchmark-variance-root-cause.md for
+// the measurement that showed 76% -> 7% spread after pinning. Keeping
+// thresholds flat lets real algorithmic regressions (>5% ops, >10% mem)
+// surface without bucket-dependent policy the reviewer has to interpret.
 function compare(
   baseline: BenchPayload | null,
   current: BenchPayload,
@@ -224,7 +232,8 @@ function renderMarkdown(
   out.push(`## ${summaryTitle}`);
   out.push("");
   out.push(
-    `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
+    `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. ` +
+      `Runner pinned to CPU 0 via taskset. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
   );
   if (opts.baseline) {
     out.push(
diff --git a/benchmarks/scripts/median-results.ts b/benchmarks/scripts/median-results.ts
new file mode 100644
index 000000000..b50098177
--- /dev/null
+++ b/benchmarks/scripts/median-results.ts
@@ -0,0 +1,165 @@
+// Copyright 2021-2026 Buf Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// median-results.ts — combine N bench-matrix JSON dumps into a single
+// payload whose ops/sec per fixture is the median across runs.
+//
+// Why
+// ---
+// Local 5-run measurements on main showed 2x host-level spread on fast
+// fixtures (SimpleMessage, GraphQLRequest) even with tinybench's own RME
+// under 0.2%. A single-run comparison therefore produces false-positive
+// "regressions" whose magnitude is entirely noise. Median-of-N is the
+// standard, cheap mitigation: one outlier cannot move the reported number.
+//
+// Usage
+// -----
+//   node scripts/median-results.ts runs/run-1.json runs/run-2.json ... > baseline.json
+//
+// Behaviour
+// ---------
+// - With a single input file, passes the payload through unchanged — this
+//   keeps the script safe to use as a no-op step in CI pipelines that
+//   occasionally reduce to one run (e.g. local development).
+// - With N >= 2 inputs, groups rows by `name` and takes the median of
+//   `opsPerSec` per fixture (lower middle value for an even count).
+//   `rme`, `samples`, `bytesPerOp` and `encodedSize` come from the run
+//   whose ops/sec is closest to that median, so downstream consumers
+//   still see a representative (not synthetic) confidence interval.
+// - Fixtures missing from some runs are included if they appear in >= 1
+//   input; the median is computed across whatever subset is present and a
+//   warning is emitted to stderr so drift is visible.
+// - Output JSON structure matches bench-matrix.ts's payload exactly.
+
+import { readFileSync } from "node:fs";
+import { argv, exit, stderr, stdout } from "node:process";
+
+interface ResultRow {
+  name: string;
+  opsPerSec: number;
+  rme?: number;
+  samples?: number;
+  bytesPerOp?: number;
+  encodedSize?: number;
+}
+
+interface BenchPayload {
+  node: string;
+  platform: string;
+  timestamp: string;
+  results: ResultRow[];
+}
+
+function loadPayload(path: string): BenchPayload {
+  // Parse the bench payload stored at `path`. A file that is nothing but
+  // JSON is parsed whole. Otherwise the text from the last line opening
+  // with "{" onward is parsed, so raw bench-matrix stdout (tables first,
+  // payload last) is accepted too. Throws when no such line exists.
+  const text = readFileSync(path, "utf8").trim();
+  const isBareJson = text.startsWith("{");
+  const braceLine = isBareJson ? 0 : text.lastIndexOf("\n{") + 1;
+  if (!isBareJson && braceLine === 0) {
+    throw new Error(`median-results: no JSON payload found in ${path}`);
+  }
+  return JSON.parse(text.slice(braceLine)) as BenchPayload;
+}
+
+/**
+ * Lower median of `values`: the middle element after an ascending numeric
+ * sort, or the smaller of the two middle elements when the count is even.
+ * No interpolation, so the result is always an input value; `[]` gives 0.
+ */
+function median(values: number[]): number {
+  const ascending = values.slice().sort((x, y) => x - y);
+  const middle = (ascending.length - 1) >> 1;
+  return ascending.length > 0 ? ascending[middle] : 0;
+}
+
+function main(): void {
+  // Entry point: merge the input files named on the command line into one
+  // payload on stdout (usage line + exit status 2 when no file is given).
+  // Every argument that does not start with "-" counts as an input path.
+  const inputs = argv.slice(2).filter((arg) => !arg.startsWith("-"));
+  if (inputs.length === 0) {
+    stderr.write(
+      "Usage: median-results.ts <run-1.json> [<run-2.json> ...] > out.json\n",
+    );
+    exit(2);
+  }
+
+  const runs = inputs.map(loadPayload);
+  const emit = (payload: BenchPayload): void => {
+    stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
+  };
+
+  // A lone run has nothing to aggregate: re-serialize it and stop.
+  if (runs.length === 1) {
+    emit(runs[0]);
+    return;
+  }
+
+  // Bucket every row under its fixture name. A Map iterates in insertion
+  // order, so fixtures come out in the order they were first seen.
+  const grouped = new Map<string, ResultRow[]>();
+  for (const { results: runRows } of runs) {
+    for (const row of runRows) {
+      const bucket = grouped.get(row.name);
+      if (bucket === undefined) {
+        grouped.set(row.name, [row]);
+      } else {
+        bucket.push(row);
+      }
+    }
+  }
+
+  const mergedRows = [...grouped].map(([name, rows]): ResultRow => {
+    // Report (on stderr, so stdout stays pure JSON) any fixture that some
+    // run did not produce.
+    if (rows.length < runs.length) {
+      stderr.write(
+        `median-results: fixture "${name}" present in ${rows.length}/${runs.length} runs; median computed across subset.\n`,
+      );
+    }
+
+    const opsPerSec = median(rows.map((row) => row.opsPerSec));
+
+    // The remaining fields are copied from the row whose ops/sec lies
+    // nearest the median; the strict "<" keeps the earliest row on ties.
+    const gap = (row: ResultRow): number =>
+      Math.abs(row.opsPerSec - opsPerSec);
+    const nearest = rows.reduce((best, row) =>
+      gap(row) < gap(best) ? row : best,
+    );
+    const { rme, samples, bytesPerOp, encodedSize } = nearest;
+
+    return { name, opsPerSec, rme, samples, bytesPerOp, encodedSize };
+  });
+
+  // Envelope: node/platform are taken from the first input, and the
+  // timestamp is the greatest one under plain string ordering.
+  const [first] = runs;
+  const newest = runs
+    .map((run) => run.timestamp)
+    .sort()
+    .pop() as string;
+
+  emit({
+    node: first.node,
+    platform: first.platform,
+    timestamp: newest,
+    results: mergedRows,
+  });
+}
+
+main();
diff --git a/benchmarks/scripts/run-matrix-ci.sh b/benchmarks/scripts/run-matrix-ci.sh
index f5ffc3c62..9aafe1502 100755
--- a/benchmarks/scripts/run-matrix-ci.sh
+++ b/benchmarks/scripts/run-matrix-ci.sh
@@ -23,25 +23,51 @@
 # RME shrinks, and a clean stdout stream that contains only the JSON
 # payload so compare-results.ts can read it with fs.readFileSync.
 #
+# Additionally, this wrapper runs the matrix N times (default 5) and feeds
+# the per-run JSON outputs through `scripts/median-results.ts` so the
+# reported number is the median across runs.
+#
+# Variance control — the root-cause investigation
+# (analysis/benchmark-variance-root-cause.md) measured a +76% run-to-run
+# spread on `ExportTrace::toBinary` unpinned on a heterogeneous P/E-core
+# host. Pinning the process to CPU 0 (`taskset -c 0`) collapsed the same
+# workload to +7% spread — a 10x reduction. Frame proportions in the CPU
+# profiles were identical across slow and fast runs, confirming the
+# variance was purely environmental (scheduler migration + intel_pstate
+# frequency scaling), not algorithmic. Pinning is therefore the primary
+# noise reduction; median-of-5 is the secondary filter.
+#
 # This wrapper:
 #   1. Logs the host profile (Node version, CPU, RAM) for trace records.
-#   2. Does a throwaway warmup run of the matrix so JIT + ICs are warm on
+#   2. Detects `taskset` and pins each invocation to CPU 0 when available.
+#   3. Does a throwaway warmup run of the matrix so JIT + ICs are warm on
 #      the main benchmark functions.
-#   3. Runs the real matrix with CI-sized time budgets.
-#   4. Extracts the last JSON object from stdout and writes it to the
-#      caller-specified output file.
+#   4. Runs the real matrix N times (default 5) with CI-sized time budgets.
+#   5. Extracts the JSON payload from each run's stdout.
+#   6. Computes the per-fixture median and writes it to the output file.
 #
 # Usage: benchmarks/scripts/run-matrix-ci.sh [output.json]
 #        defaults to bench-results.json in the current working directory.
+#
+# Env overrides:
+#   BENCH_MATRIX_RUNS           number of measurement runs (default 5)
+#   BENCH_MATRIX_CI_TIME        per-run measurement ms (default 3000)
+#   BENCH_MATRIX_CI_WARMUP      per-run warmup ms (default 1000)
+#   BENCH_MATRIX_WARMUP_TIME    throwaway pass: run ms and warmup ms (unset: 500 / 200)
 
 set -euo pipefail
 
 out="${1:-bench-results.json}"
+runs="${BENCH_MATRIX_RUNS:-5}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCH_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 
 cd "${BENCH_DIR}"
 
+runs_dir=".bench-runs"
+rm -rf "${runs_dir}"
+mkdir -p "${runs_dir}"
+
 # -------- 1. Host profile (trace only, never fails the job) --------
 echo "::group::Host profile"
 echo "node:        $(node --version)"
@@ -55,44 +81,70 @@ fi
 if command -v lscpu >/dev/null 2>&1; then
   lscpu | grep -E "Model name|CPU MHz|CPU max MHz" || true
 fi
+echo "runs:        ${runs}"
+
+# -------- 1b. CPU pinning detection --------
+# Pin each measurement invocation to a single CPU to eliminate scheduler
+# migration jitter (primary source of >50% run-to-run variance on hosts
+# with heterogeneous P/E-core topologies). CPU 0 is a P-core on Intel
+# Core Ultra and the first available core on the GitHub ubuntu-latest
+# runner fleet.
+if command -v taskset >/dev/null 2>&1; then
+  pin_prefix=(taskset -c 0)
+  echo "cpu pinning: enabled (taskset -c 0)"
+else
+  pin_prefix=(env) # no-op launcher: an empty "${pin_prefix[@]}" is fatal under `set -u` on bash < 4.4
+  echo "cpu pinning: DISABLED (taskset not available) — results will be noisy"
+fi
 echo "::endgroup::"
 
 # -------- 2. Warmup pass (discarded) --------
 echo "::group::Warmup"
 BENCH_MATRIX_TIME="${BENCH_MATRIX_WARMUP_TIME:-500}" \
 BENCH_MATRIX_WARMUP="${BENCH_MATRIX_WARMUP_TIME:-200}" \
-  npx tsx src/bench-matrix.ts >/dev/null 2>&1 || true
+  "${pin_prefix[@]}" npx tsx src/bench-matrix.ts >/dev/null 2>&1 || true
 echo "Warmup complete."
 echo "::endgroup::"
 
-# -------- 3. Measurement pass --------
-echo "::group::Measurement"
-BENCH_MATRIX_TIME="${BENCH_MATRIX_CI_TIME:-3000}" \
-BENCH_MATRIX_WARMUP="${BENCH_MATRIX_CI_WARMUP:-1000}" \
-  npx tsx src/bench-matrix.ts | tee ".bench-stdout.log"
-echo "::endgroup::"
-
-# -------- 4. Extract JSON payload --------
-# bench-matrix.ts prints human-readable tables and then one line of
-# `=== Matrix JSON ===` followed by a single-line JSON object. Grab the
-# last line that starts with '{' as the payload.
-node -e '
-const fs = require("node:fs");
-const out = process.argv[1];
-const lines = fs.readFileSync(".bench-stdout.log", "utf8").split("\n");
-let payload = null;
-for (let i = lines.length - 1; i >= 0; i--) {
-  const ln = lines[i].trim();
-  if (ln.startsWith("{") && ln.endsWith("}")) {
-    try { payload = JSON.parse(ln); break; } catch { /* keep looking */ }
-  }
-}
-if (!payload) {
-  console.error("run-matrix-ci: could not locate JSON payload in bench-matrix output.");
-  process.exit(1);
+# -------- 3. Measurement passes --------
+extract_json() {
+  # Writes the last one-line JSON object found in log $1 to file $2, pretty-printed; fails with status 1 if none parses.
+  node -e '
+    const fs = require("node:fs");
+    const src = process.argv[1];
+    const dst = process.argv[2];
+    const lines = fs.readFileSync(src, "utf8").split("\n");
+    let payload = null;
+    for (let i = lines.length - 1; i >= 0; i--) {
+      const ln = lines[i].trim();
+      if (ln.startsWith("{") && ln.endsWith("}")) {
+        try { payload = JSON.parse(ln); break; } catch { /* keep looking */ }
+      }
+    }
+    if (!payload) {
+      console.error(`run-matrix-ci: could not locate JSON payload in ${src}.`);
+      process.exit(1);
+    }
+    fs.writeFileSync(dst, JSON.stringify(payload, null, 2) + "\n");
+    console.error(`run-matrix-ci: wrote ${payload.results.length} rows to ${dst}.`);
+  ' "$1" "$2"
 }
-fs.writeFileSync(out, JSON.stringify(payload, null, 2) + "\n");
-console.error(`run-matrix-ci: wrote ${payload.results.length} result rows to ${out}.`);
-' "$out"
 
-rm -f .bench-stdout.log
+for i in $(seq 1 "${runs}"); do
+  echo "::group::Measurement run ${i}/${runs}"
+  log=".bench-stdout-${i}.log" # per-run capture of the matrix stdout (also echoed live by tee)
+  BENCH_MATRIX_TIME="${BENCH_MATRIX_CI_TIME:-3000}" \
+  BENCH_MATRIX_WARMUP="${BENCH_MATRIX_CI_WARMUP:-1000}" \
+    "${pin_prefix[@]}" npx tsx src/bench-matrix.ts | tee "${log}"
+  extract_json "${log}" "${runs_dir}/run-${i}.json" # keep only the JSON payload of this run
+  rm -f "${log}" # the raw log is not needed once its payload is extracted
+  echo "::endgroup::"
+done
+
+# -------- 4. Compute median across runs (inputs: every run-*.json in ${runs_dir}) --------
+echo "::group::Compute median across ${runs} run(s)"
+npx tsx scripts/median-results.ts "${runs_dir}"/run-*.json > "${out}"
+echo "run-matrix-ci: wrote median payload to ${out}."
+echo "::endgroup::"
+
+rm -rf "${runs_dir}" # scratch directory created by this script; remove it after the median is written