Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions benchmarks/baselines/main.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
{
"node": "v25.8.1",
"platform": "linux/x64",
"timestamp": "2026-04-20T14:03:09.404Z",
"results": [
{
"name": "SimpleMessage :: toBinary (pre-built, 19 B)",
"opsPerSec": 720572.6467325875,
"rme": 4.586523984485405,
"samples": 584677
},
{
"name": "ExportTraceRequest (100 spans) :: toBinary (pre-built, 32926 B)",
"opsPerSec": 942.728512279284,
"rme": 5.20843148311372,
"samples": 905
},
{
"name": "ExportMetricsRequest (50 series) :: toBinary (pre-built, 17696 B)",
"opsPerSec": 1690.2655826286807,
"rme": 1.2287491782968762,
"samples": 1663
},
{
"name": "ExportLogsRequest (100 records) :: toBinary (pre-built, 21319 B)",
"opsPerSec": 1769.099144200511,
"rme": 0.7934304535540673,
"samples": 1741
},
{
"name": "K8sPodList (20 pods) :: toBinary (pre-built, 28900 B)",
"opsPerSec": 1761.1402301413486,
"rme": 3.1648732429277913,
"samples": 1662
},
{
"name": "GraphQLRequest :: toBinary (pre-built, 624 B)",
"opsPerSec": 145455.54923227328,
"rme": 1.939681764666347,
"samples": 136959
},
{
"name": "GraphQLResponse :: toBinary (pre-built, 1366 B)",
"opsPerSec": 207739.65617123304,
"rme": 7.000451540273928,
"samples": 166418
},
{
"name": "RpcRequest :: toBinary (pre-built, 501 B)",
"opsPerSec": 250839.90037953644,
"rme": 2.3455244711213687,
"samples": 233170
},
{
"name": "RpcResponse :: toBinary (pre-built, 602 B)",
"opsPerSec": 393930.1377351528,
"rme": 3.2692980105549525,
"samples": 344566
},
{
"name": "StressMessage (depth=8, width=200) :: toBinary (pre-built, 12868 B)",
"opsPerSec": 6329.2079657042395,
"rme": 1.2442163953887613,
"samples": 6167
},
{
"name": "SimpleMessage :: fromBinary (19 B)",
"opsPerSec": 884713.3936586891,
"rme": 0.16714493485586263,
"samples": 859830
},
{
"name": "ExportTraceRequest (100 spans) :: fromBinary (32926 B)",
"opsPerSec": 545.5350076541376,
"rme": 1.202558278680317,
"samples": 538
},
{
"name": "ExportMetricsRequest (50 series) :: fromBinary (17696 B)",
"opsPerSec": 888.3503061250073,
"rme": 1.2969980310240832,
"samples": 864
},
{
"name": "ExportLogsRequest (100 records) :: fromBinary (21319 B)",
"opsPerSec": 882.9676497121711,
"rme": 1.2257471502567474,
"samples": 862
},
{
"name": "K8sPodList (20 pods) :: fromBinary (28900 B)",
"opsPerSec": 1007.335044928595,
"rme": 0.8788453963613843,
"samples": 993
},
{
"name": "GraphQLRequest :: fromBinary (624 B)",
"opsPerSec": 221605.2437830516,
"rme": 0.3787980957928544,
"samples": 211417
},
{
"name": "GraphQLResponse :: fromBinary (1366 B)",
"opsPerSec": 205348.95710481095,
"rme": 1.6772806334555086,
"samples": 190009
},
{
"name": "RpcRequest :: fromBinary (501 B)",
"opsPerSec": 208822.5527629229,
"rme": 0.34718286675914534,
"samples": 204013
},
{
"name": "RpcResponse :: fromBinary (602 B)",
"opsPerSec": 295797.20225724357,
"rme": 2.029660371713425,
"samples": 281191
},
{
"name": "StressMessage (depth=8, width=200) :: fromBinary (12868 B)",
"opsPerSec": 2874.297533564573,
"rme": 1.0153087164867896,
"samples": 2797
}
]
}
15 changes: 12 additions & 3 deletions benchmarks/scripts/compare-results.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ function parseArgs(): Options {
if (arg.startsWith("--baseline=")) opts.baseline = arg.slice(11);
else if (arg.startsWith("--current=")) opts.current = arg.slice(10);
else if (arg.startsWith("--output=")) opts.output = arg.slice(9);
else if (arg.startsWith("--threshold-ops=")) opts.thresholdOps = Number(arg.slice(16));
else if (arg.startsWith("--threshold-mem=")) opts.thresholdMem = Number(arg.slice(16));
else if (arg.startsWith("--threshold-ops="))
opts.thresholdOps = Number(arg.slice(16));
else if (arg.startsWith("--threshold-mem="))
opts.thresholdMem = Number(arg.slice(16));
else if (arg === "--no-baseline") opts.noBaseline = true;
else if (arg === "--help" || arg === "-h") {
printUsage();
Expand Down Expand Up @@ -135,6 +137,12 @@ interface CompareRow {
status: "ok" | "improved" | "regression" | "new";
}

// Flat thresholds (ops %, memory %). Variance on CI runners is now
// controlled upstream in run-matrix-ci.sh via `taskset -c 0` CPU pinning
// + median-of-5 runs; see analysis/benchmark-variance-root-cause.md for
// the measurement that showed 76% -> 7% spread after pinning. Keeping
// thresholds flat lets real algorithmic regressions (>5% ops, >10% mem)
// surface without bucket-dependent policy the reviewer has to interpret.
function compare(
baseline: BenchPayload | null,
current: BenchPayload,
Expand Down Expand Up @@ -224,7 +232,8 @@ function renderMarkdown(
out.push(`## ${summaryTitle}`);
out.push("");
out.push(
`Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
`Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. ` +
`Runner pinned to CPU 0 via taskset. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
);
if (opts.baseline) {
out.push(
Expand Down
165 changes: 165 additions & 0 deletions benchmarks/scripts/median-results.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// Copyright 2021-2026 Buf Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// median-results.ts — combine N bench-matrix JSON dumps into a single
// payload whose ops/sec per fixture is the median across runs.
//
// Why
// ---
// Local 5-run measurements on main showed 2x host-level spread on fast
// fixtures (SimpleMessage, GraphQLRequest) even with tinybench's own RME
// under 0.2%. A single-run comparison therefore produces false-positive
// "regressions" whose magnitude is entirely noise. Median-of-N is the
// standard, cheap mitigation: one outlier cannot move the reported number.
//
// Usage
// -----
// node scripts/median-results.ts runs/run-1.json runs/run-2.json ... > baseline.json
//
// Behaviour
// ---------
// - With a single input file, passes the payload through unchanged — this
// keeps the script safe to use as a no-op step in CI pipelines that
// occasionally reduce to one run (e.g. local development).
// - With N >= 2 inputs, groups rows by `name`, takes the median of
// `opsPerSec` per fixture (for an even count, the lower of the two middle
// values, so the result is always an observed value), and attaches the
// `rme` / `samples` / `bytesPerOp` / `encodedSize` fields from the run
// whose ops/sec is closest to that median — so downstream consumers
// still see a representative (not synthetic) confidence interval.
// - Fixtures missing from some runs are included if they appear in >= 1
// input; the median is computed across whatever subset is present and a
// warning is emitted to stderr so drift is visible.
// - Output JSON structure matches bench-matrix.ts's payload exactly.

import { readFileSync } from "node:fs";
import { argv, exit, stderr, stdout } from "node:process";

/**
 * One benchmark result row as read from an input payload and written to the
 * merged output. Only `name` and `opsPerSec` are required; the optional
 * fields are copied through from a single observed run when present.
 */
interface ResultRow {
  /** Fixture label; rows from different runs are grouped by this value. */
  name: string;
  /** Throughput in operations per second — the value the median is taken over. */
  opsPerSec: number;
  /** Presumably tinybench's relative margin of error (percent) — confirm against bench-matrix.ts. */
  rme?: number;
  /** Presumably the number of samples collected for this fixture — confirm against bench-matrix.ts. */
  samples?: number;
  /** NOTE(review): looks like a per-operation memory figure; units are not visible in this file. */
  bytesPerOp?: number;
  /** NOTE(review): looks like the encoded message size in bytes — confirm against bench-matrix.ts. */
  encodedSize?: number;
}

/**
 * Envelope for one benchmark run: where and when it was captured, plus one
 * result row per fixture.
 */
interface BenchPayload {
  /** Node.js version string of the run (the baseline file shows e.g. "v25.8.1"). */
  node: string;
  /** Platform label of the run (the baseline file shows e.g. "linux/x64"). */
  platform: string;
  /** Capture time; the baseline file shows an ISO-8601 UTC string. */
  timestamp: string;
  /** Per-fixture result rows. */
  results: ResultRow[];
}

/**
 * Read one benchmark result file and return its JSON payload.
 *
 * Two input shapes are accepted:
 *  - a file whose trimmed content starts with `{` — parsed as-is;
 *  - raw bench-matrix stdout where other output (e.g. a table) precedes the
 *    payload — everything from the last line that begins with `{` onward is
 *    parsed. This is the same forgiveness rule as compare-results.ts.
 *
 * The parsed value is checked to be an object carrying a `results` array, so
 * a wrong or truncated file fails here with the path in the message instead
 * of surfacing later when a caller iterates `results`.
 *
 * @param path - File to read as UTF-8.
 * @returns The parsed payload.
 * @throws Error if no JSON payload can be located, or if the payload has no
 *   `results` array.
 * @throws SyntaxError if the located text is not valid JSON.
 */
function loadPayload(path: string): BenchPayload {
  const raw = readFileSync(path, "utf8").trim();

  let jsonText = raw;
  if (!raw.startsWith("{")) {
    // The top-level object of a pretty-printed payload is the last `{` that
    // sits at the very start of a line; nested objects are indented.
    const jsonStart = raw.lastIndexOf("\n{");
    if (jsonStart === -1) {
      throw new Error(`median-results: no JSON payload found in ${path}`);
    }
    jsonText = raw.slice(jsonStart + 1);
  }

  // Parse into `unknown` and narrow, rather than trusting a bare cast.
  const parsed: unknown = JSON.parse(jsonText);
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { results?: unknown }).results)
  ) {
    throw new Error(
      `median-results: payload in ${path} has no "results" array`,
    );
  }
  return parsed as BenchPayload;
}

/**
* Numeric median. For even N we return the lower of the two middle values
* instead of interpolating — this keeps the output row anchored to an
* actually-observed run (so the attached rme/samples remain meaningful).
*/
/**
 * Lower-middle median of a list of numbers.
 *
 * The input is copied before sorting, so the caller's array is left as-is.
 * With an even number of values the smaller of the two central values is
 * returned (no averaging), which means the result is always one of the
 * inputs. An empty list yields 0.
 *
 * @param values - Numbers to take the median of.
 * @returns The lower-middle element in ascending order, or 0 when empty.
 */
function median(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  const ascending = Array.from(values);
  ascending.sort((lhs, rhs) => lhs - rhs);
  // ceil(n / 2) - 1 picks the middle for odd n and the lower middle for even n.
  const lowerMiddle = Math.ceil(count / 2) - 1;
  return ascending[lowerMiddle];
}

/**
 * CLI entry point: merge the run files named on the command line into one
 * payload and write it to stdout as pretty-printed JSON.
 *
 * - Arguments starting with `-` are ignored; with no file arguments a usage
 *   line is written to stderr and the process exits with status 2.
 * - With exactly one file the payload is written through unchanged.
 * - With two or more files, rows are grouped by fixture `name`; each output
 *   row carries the median `opsPerSec` and the `rme` / `samples` /
 *   `bytesPerOp` / `encodedSize` of the run closest to that median.
 * - Diagnostics go to stderr only, so stdout stays valid JSON: a warning
 *   when a fixture is missing from some runs, and a warning when the runs
 *   do not all report the same node/platform.
 */
function main(): void {
  const paths = argv.slice(2).filter((a) => !a.startsWith("-"));
  if (paths.length === 0) {
    stderr.write(
      "Usage: median-results.ts <run-1.json> [<run-2.json> ...] > out.json\n",
    );
    exit(2);
  }

  const payloads = paths.map(loadPayload);

  // Single-run fallback: nothing to median, just pass through.
  if (payloads.length === 1) {
    stdout.write(`${JSON.stringify(payloads[0], null, 2)}\n`);
    return;
  }

  // Collect rows by fixture name across all runs. Map iteration order is
  // insertion order, so output rows follow first appearance in the inputs.
  const byName = new Map<string, ResultRow[]>();
  for (const payload of payloads) {
    for (const row of payload.results) {
      const rows = byName.get(row.name) ?? [];
      rows.push(row);
      byName.set(row.name, rows);
    }
  }

  const merged: ResultRow[] = [];
  for (const [name, rows] of byName) {
    if (rows.length < payloads.length) {
      stderr.write(
        `median-results: fixture "${name}" present in ${rows.length}/${payloads.length} runs; median computed across subset.\n`,
      );
    }
    const medianOps = median(rows.map((r) => r.opsPerSec));
    // Pick the row closest to median so rme/samples/bytesPerOp/encodedSize
    // reflect an actually-observed run, not a synthesized one. Ties keep
    // the earliest run.
    let closest = rows[0];
    let bestDistance = Math.abs(rows[0].opsPerSec - medianOps);
    for (const r of rows) {
      const d = Math.abs(r.opsPerSec - medianOps);
      if (d < bestDistance) {
        closest = r;
        bestDistance = d;
      }
    }
    merged.push({
      name,
      opsPerSec: medianOps,
      rme: closest.rme,
      samples: closest.samples,
      bytesPerOp: closest.bytesPerOp,
      encodedSize: closest.encodedSize,
    });
  }

  // Envelope metadata: node/platform come from the first run. They must
  // match across runs for the numbers to be comparable, so a mismatch is
  // reported on stderr rather than being silently hidden.
  const first = payloads[0];
  const environments = new Set(
    payloads.map((p) => `${p.node} on ${p.platform}`),
  );
  if (environments.size > 1) {
    stderr.write(
      `median-results: runs report different environments (${[...environments].join("; ")}); using node/platform from ${paths[0]}.\n`,
    );
  }

  // Latest timestamp = lexicographic maximum. NOTE(review): this assumes
  // every run uses the same ISO-8601 UTC format seen in the baseline file.
  const timestamp = payloads.reduce(
    (latest, p) => (p.timestamp > latest ? p.timestamp : latest),
    first.timestamp,
  );

  const out: BenchPayload = {
    node: first.node,
    platform: first.platform,
    timestamp,
    results: merged,
  };

  stdout.write(`${JSON.stringify(out, null, 2)}\n`);
}

main();
Loading
Loading