Connectum-Framework · intech · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/benchmarks/baselines/main.json b/benchmarks/baselines/main.json
@@ -0,0 +1,127 @@
+{
+  "node": "v25.8.1",
+  "platform": "linux/x64",
+  "timestamp": "2026-04-20T14:03:09.404Z",
+  "results": [
+    {
+      "name": "SimpleMessage :: toBinary (pre-built, 19 B)",
+      "opsPerSec": 720572.6467325875,
+      "rme": 4.586523984485405,
+      "samples": 584677
+    },
+    {
+      "name": "ExportTraceRequest (100 spans) :: toBinary (pre-built, 32926 B)",
+      "opsPerSec": 942.728512279284,
+      "rme": 5.20843148311372,
+      "samples": 905
+    },
+    {
+      "name": "ExportMetricsRequest (50 series) :: toBinary (pre-built, 17696 B)",
+      "opsPerSec": 1690.2655826286807,
+      "rme": 1.2287491782968762,
+      "samples": 1663
+    },
+    {
+      "name": "ExportLogsRequest (100 records) :: toBinary (pre-built, 21319 B)",
+      "opsPerSec": 1769.099144200511,
+      "rme": 0.7934304535540673,
+      "samples": 1741
+    },
+    {
+      "name": "K8sPodList (20 pods) :: toBinary (pre-built, 28900 B)",
+      "opsPerSec": 1761.1402301413486,
+      "rme": 3.1648732429277913,
+      "samples": 1662
+    },
+    {
+      "name": "GraphQLRequest :: toBinary (pre-built, 624 B)",
+      "opsPerSec": 145455.54923227328,
+      "rme": 1.939681764666347,
+      "samples": 136959
+    },
+    {
+      "name": "GraphQLResponse :: toBinary (pre-built, 1366 B)",
+      "opsPerSec": 207739.65617123304,
+      "rme": 7.000451540273928,
+      "samples": 166418
+    },
+    {
+      "name": "RpcRequest :: toBinary (pre-built, 501 B)",
+      "opsPerSec": 250839.90037953644,
+      "rme": 2.3455244711213687,
+      "samples": 233170
+    },
+    {
+      "name": "RpcResponse :: toBinary (pre-built, 602 B)",
+      "opsPerSec": 393930.1377351528,
+      "rme": 3.2692980105549525,
+      "samples": 344566
+    },
+    {
+      "name": "StressMessage (depth=8, width=200) :: toBinary (pre-built, 12868 B)",
+      "opsPerSec": 6329.2079657042395,
+      "rme": 1.2442163953887613,
+      "samples": 6167
+    },
+    {
+      "name": "SimpleMessage :: fromBinary (19 B)",
+      "opsPerSec": 884713.3936586891,
+      "rme": 0.16714493485586263,
+      "samples": 859830
+    },
+    {
+      "name": "ExportTraceRequest (100 spans) :: fromBinary (32926 B)",
+      "opsPerSec": 545.5350076541376,
+      "rme": 1.202558278680317,
+      "samples": 538
+    },
+    {
+      "name": "ExportMetricsRequest (50 series) :: fromBinary (17696 B)",
+      "opsPerSec": 888.3503061250073,
+      "rme": 1.2969980310240832,
+      "samples": 864
+    },
+    {
+      "name": "ExportLogsRequest (100 records) :: fromBinary (21319 B)",
+      "opsPerSec": 882.9676497121711,
+      "rme": 1.2257471502567474,
+      "samples": 862
+    },
+    {
+      "name": "K8sPodList (20 pods) :: fromBinary (28900 B)",
+      "opsPerSec": 1007.335044928595,
+      "rme": 0.8788453963613843,
+      "samples": 993
+    },
+    {
+      "name": "GraphQLRequest :: fromBinary (624 B)",
+      "opsPerSec": 221605.2437830516,
+      "rme": 0.3787980957928544,
+      "samples": 211417
+    },
+    {
+      "name": "GraphQLResponse :: fromBinary (1366 B)",
+      "opsPerSec": 205348.95710481095,
+      "rme": 1.6772806334555086,
+      "samples": 190009
+    },
+    {
+      "name": "RpcRequest :: fromBinary (501 B)",
+      "opsPerSec": 208822.5527629229,
+      "rme": 0.34718286675914534,
+      "samples": 204013
+    },
+    {
+      "name": "RpcResponse :: fromBinary (602 B)",
+      "opsPerSec": 295797.20225724357,
+      "rme": 2.029660371713425,
+      "samples": 281191
+    },
+    {
+      "name": "StressMessage (depth=8, width=200) :: fromBinary (12868 B)",
+      "opsPerSec": 2874.297533564573,
+      "rme": 1.0153087164867896,
+      "samples": 2797
+    }
+  ]
+}
diff --git a/benchmarks/scripts/compare-results.ts b/benchmarks/scripts/compare-results.ts
@@ -67,8 +67,10 @@ function parseArgs(): Options {
     if (arg.startsWith("--baseline=")) opts.baseline = arg.slice(11);
     else if (arg.startsWith("--current=")) opts.current = arg.slice(10);
     else if (arg.startsWith("--output=")) opts.output = arg.slice(9);
-    else if (arg.startsWith("--threshold-ops=")) opts.thresholdOps = Number(arg.slice(16));
-    else if (arg.startsWith("--threshold-mem=")) opts.thresholdMem = Number(arg.slice(16));
+    else if (arg.startsWith("--threshold-ops="))
+      opts.thresholdOps = Number(arg.slice(16));
+    else if (arg.startsWith("--threshold-mem="))
+      opts.thresholdMem = Number(arg.slice(16));
     else if (arg === "--no-baseline") opts.noBaseline = true;
     else if (arg === "--help" || arg === "-h") {
       printUsage();
@@ -135,6 +137,12 @@ interface CompareRow {
   status: "ok" | "improved" | "regression" | "new";
 }
 
+// Flat thresholds (ops %, memory %). Variance on CI runners is now
+// controlled upstream in run-matrix-ci.sh via `taskset -c 0` CPU pinning
+// and median-of-5 runs; see analysis/benchmark-variance-root-cause.md for
+// the measurement showing spread falling from 76% to 7% after pinning. Flat
+// thresholds let real algorithmic regressions (>5% ops, >10% mem) surface
+// without a bucket-dependent policy that the reviewer has to interpret.
 function compare(
   baseline: BenchPayload | null,
   current: BenchPayload,
@@ -224,7 +232,8 @@ function renderMarkdown(
   out.push(`## ${summaryTitle}`);
   out.push("");
   out.push(
-    `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
+    `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. ` +
+      `Runner pinned to CPU 0 via taskset. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
   );
   if (opts.baseline) {
     out.push(

diff --git a/benchmarks/scripts/median-results.ts b/benchmarks/scripts/median-results.ts
@@ -0,0 +1,165 @@
+// Copyright 2021-2026 Buf Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// median-results.ts — combine N bench-matrix JSON dumps into a single
+// payload whose ops/sec per fixture is the median across runs.
+//
+// Why
+// ---
+// Local 5-run measurements on main showed 2x host-level spread on fast
+// fixtures (SimpleMessage, GraphQLRequest) even with tinybench's own RME
+// under 0.2%. A single-run comparison therefore produces false-positive
+// "regressions" whose magnitude is entirely noise. Median-of-N is the
+// standard, cheap mitigation: one outlier cannot move the reported number.
+//
+// Usage
+// -----
+//   node scripts/median-results.ts runs/run-1.json runs/run-2.json ... > baseline.json
+//
+// Behaviour
+// ---------
+// - With a single input file, passes the payload through unchanged — this
+//   keeps the script safe to use as a no-op step in CI pipelines that
+//   occasionally reduce to one run (e.g. local development).
+// - With N >= 2 inputs, groups rows by `name`, takes the median of
+//   `opsPerSec` per fixture (the lower middle value when N is even), and
+//   copies `rme` / `samples` / `bytesPerOp` / `encodedSize` from the run
+//   whose ops/sec is closest to that median — so downstream consumers
+//   still see a representative (not synthetic) confidence interval.
+// - Fixtures missing from some runs are included if they appear in >= 1
+//   input; the median is computed across whatever subset is present and a
+//   warning is emitted to stderr so drift is visible.
+// - Output JSON structure matches bench-matrix.ts's payload exactly.
+
+import { readFileSync } from "node:fs";
+import { argv, exit, stderr, stdout } from "node:process";
+
+interface ResultRow { // One per-fixture entry of a payload's `results` array.
+  name: string; // fixture label; rows from different runs are grouped by this exact string
+  opsPerSec: number; // throughput; the only field the median is computed over
+  rme?: number; // presumably tinybench's relative margin of error, in % — confirm
+  samples?: number; // presumably the sample count behind opsPerSec — confirm
+  bytesPerOp?: number; // NOTE(review): looks like a per-operation memory metric — confirm
+  encodedSize?: number; // NOTE(review): looks like the encoded message size in bytes — confirm
+}
+
+interface BenchPayload { // Envelope of one benchmark run; also the shape of the merged output.
+  node: string; // merged output copies this from the first input run
+  platform: string; // merged output copies this from the first input run
+  timestamp: string; // merged output uses the greatest value under default string sort
+  results: ResultRow[]; // per-fixture rows
+}
+
+function loadPayload(path: string): BenchPayload { // Read one run file and return its parsed JSON payload.
+  const raw = readFileSync(path, "utf8").trim(); // trimmed so the startsWith check ignores leading whitespace
+  if (raw.startsWith("{")) { // the whole file is the JSON document
+    return JSON.parse(raw) as BenchPayload; // unchecked cast: the parsed shape is not validated
+  }
+  // Tolerate raw bench-matrix stdout (with table output before the JSON
+  // payload) — same forgiveness rule as compare-results.ts.
+  const jsonStart = raw.lastIndexOf("\n{"); // newline before the last line that begins with "{"
+  if (jsonStart === -1) {
+    throw new Error(`median-results: no JSON payload found in ${path}`);
+  }
+  return JSON.parse(raw.slice(jsonStart + 1)) as BenchPayload; // +1 skips the newline; JSON.parse throws on malformed text
+}
+
+/**
+ * Median of `values`; returns 0 for an empty array. For even N this returns
+ * the lower of the two middle values instead of interpolating, so the result
+ * is always an actually-observed value (the attached rme/samples stay meaningful).
+ */
+function median(values: number[]): number {
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b); // sort a copy ascending; the caller's array is not mutated
+  return sorted[Math.floor((sorted.length - 1) / 2)]; // lower-middle index: 0 for N=1 or 2, 1 for N=3 or 4
+}
+
+function main(): void { // CLI entry point: merge the run files named on argv into one payload written to stdout.
+  const paths = argv.slice(2).filter((a) => !a.startsWith("-")); // arguments starting with "-" are silently dropped
+  if (paths.length === 0) {
+    stderr.write(
+      "Usage: median-results.ts <run-1.json> [<run-2.json> ...] > out.json\n",
+    );
+    exit(2); // non-zero exit after printing usage
+  }
+
+  const payloads = paths.map(loadPayload); // any read/parse error from loadPayload propagates uncaught
+
+  // Single-run fallback: nothing to median; re-emit the parsed payload pretty-printed.
+  if (payloads.length === 1) {
+    stdout.write(`${JSON.stringify(payloads[0], null, 2)}\n`);
+    return;
+  }
+
+  // Collect rows by fixture name across all runs (Map keeps first-seen order).
+  const byName = new Map<string, ResultRow[]>();
+  for (const payload of payloads) {
+    for (const row of payload.results) {
+      const rows = byName.get(row.name) ?? [];
+      rows.push(row); // NOTE(review): a name repeated within one run contributes several rows here
+      byName.set(row.name, rows);
+    }
+  }
+
+  const merged: ResultRow[] = [];
+  for (const [name, rows] of byName) { // output rows follow the order fixtures were first seen
+    if (rows.length < payloads.length) { // fixture missing from at least one run: warn, but still emit it
+      stderr.write(
+        `median-results: fixture "${name}" present in ${rows.length}/${payloads.length} runs; median computed across subset.\n`,
+      );
+    }
+    const opsValues = rows.map((r) => r.opsPerSec);
+    const medianOps = median(opsValues);
+    // Pick the row closest to the median so rme/samples/bytesPerOp/encodedSize
+    // come from an observed run; on equal distance the earliest run wins.
+    let closest = rows[0];
+    let bestDistance = Math.abs(rows[0].opsPerSec - medianOps);
+    for (const r of rows) {
+      const d = Math.abs(r.opsPerSec - medianOps);
+      if (d < bestDistance) { // strict "<" keeps the first row found at the minimum distance
+        closest = r;
+        bestDistance = d;
+      }
+    }
+    merged.push({
+      name,
+      opsPerSec: medianOps,
+      rme: closest.rme, // fields left undefined are omitted by JSON.stringify below
+      samples: closest.samples,
+      bytesPerOp: closest.bytesPerOp,
+      encodedSize: closest.encodedSize,
+    });
+  }
+
+  // Envelope metadata: node/platform come from the first run. Runs must match
+  // on both to be comparable, but that is NOT verified here — a mismatch is
+  // silently ignored. The timestamp is the latest across all runs.
+  const first = payloads[0];
+  const timestamp = payloads
+    .map((p) => p.timestamp)
+    .sort() // default string sort; assumes same-format ISO-8601 timestamps so text order is time order — TODO confirm
+    .at(-1) as string; // cast is safe here: this path only runs with two or more payloads
+
+  const out: BenchPayload = {
+    node: first.node,
+    platform: first.platform,
+    timestamp,
+    results: merged,
+  };
+
+  stdout.write(`${JSON.stringify(out, null, 2)}\n`);
+}
+
+main(); // runs immediately when the script is executed