diff --git a/benchmarks/baselines/main.json b/benchmarks/baselines/main.json new file mode 100644 index 000000000..5a159517d --- /dev/null +++ b/benchmarks/baselines/main.json @@ -0,0 +1,127 @@ +{ + "node": "v25.8.1", + "platform": "linux/x64", + "timestamp": "2026-04-20T14:03:09.404Z", + "results": [ + { + "name": "SimpleMessage :: toBinary (pre-built, 19 B)", + "opsPerSec": 720572.6467325875, + "rme": 4.586523984485405, + "samples": 584677 + }, + { + "name": "ExportTraceRequest (100 spans) :: toBinary (pre-built, 32926 B)", + "opsPerSec": 942.728512279284, + "rme": 5.20843148311372, + "samples": 905 + }, + { + "name": "ExportMetricsRequest (50 series) :: toBinary (pre-built, 17696 B)", + "opsPerSec": 1690.2655826286807, + "rme": 1.2287491782968762, + "samples": 1663 + }, + { + "name": "ExportLogsRequest (100 records) :: toBinary (pre-built, 21319 B)", + "opsPerSec": 1769.099144200511, + "rme": 0.7934304535540673, + "samples": 1741 + }, + { + "name": "K8sPodList (20 pods) :: toBinary (pre-built, 28900 B)", + "opsPerSec": 1761.1402301413486, + "rme": 3.1648732429277913, + "samples": 1662 + }, + { + "name": "GraphQLRequest :: toBinary (pre-built, 624 B)", + "opsPerSec": 145455.54923227328, + "rme": 1.939681764666347, + "samples": 136959 + }, + { + "name": "GraphQLResponse :: toBinary (pre-built, 1366 B)", + "opsPerSec": 207739.65617123304, + "rme": 7.000451540273928, + "samples": 166418 + }, + { + "name": "RpcRequest :: toBinary (pre-built, 501 B)", + "opsPerSec": 250839.90037953644, + "rme": 2.3455244711213687, + "samples": 233170 + }, + { + "name": "RpcResponse :: toBinary (pre-built, 602 B)", + "opsPerSec": 393930.1377351528, + "rme": 3.2692980105549525, + "samples": 344566 + }, + { + "name": "StressMessage (depth=8, width=200) :: toBinary (pre-built, 12868 B)", + "opsPerSec": 6329.2079657042395, + "rme": 1.2442163953887613, + "samples": 6167 + }, + { + "name": "SimpleMessage :: fromBinary (19 B)", + "opsPerSec": 884713.3936586891, + "rme": 0.16714493485586263, + 
"samples": 859830 + }, + { + "name": "ExportTraceRequest (100 spans) :: fromBinary (32926 B)", + "opsPerSec": 545.5350076541376, + "rme": 1.202558278680317, + "samples": 538 + }, + { + "name": "ExportMetricsRequest (50 series) :: fromBinary (17696 B)", + "opsPerSec": 888.3503061250073, + "rme": 1.2969980310240832, + "samples": 864 + }, + { + "name": "ExportLogsRequest (100 records) :: fromBinary (21319 B)", + "opsPerSec": 882.9676497121711, + "rme": 1.2257471502567474, + "samples": 862 + }, + { + "name": "K8sPodList (20 pods) :: fromBinary (28900 B)", + "opsPerSec": 1007.335044928595, + "rme": 0.8788453963613843, + "samples": 993 + }, + { + "name": "GraphQLRequest :: fromBinary (624 B)", + "opsPerSec": 221605.2437830516, + "rme": 0.3787980957928544, + "samples": 211417 + }, + { + "name": "GraphQLResponse :: fromBinary (1366 B)", + "opsPerSec": 205348.95710481095, + "rme": 1.6772806334555086, + "samples": 190009 + }, + { + "name": "RpcRequest :: fromBinary (501 B)", + "opsPerSec": 208822.5527629229, + "rme": 0.34718286675914534, + "samples": 204013 + }, + { + "name": "RpcResponse :: fromBinary (602 B)", + "opsPerSec": 295797.20225724357, + "rme": 2.029660371713425, + "samples": 281191 + }, + { + "name": "StressMessage (depth=8, width=200) :: fromBinary (12868 B)", + "opsPerSec": 2874.297533564573, + "rme": 1.0153087164867896, + "samples": 2797 + } + ] +} diff --git a/benchmarks/scripts/compare-results.ts b/benchmarks/scripts/compare-results.ts index ae9d4fbb5..a40c9308d 100644 --- a/benchmarks/scripts/compare-results.ts +++ b/benchmarks/scripts/compare-results.ts @@ -67,8 +67,10 @@ function parseArgs(): Options { if (arg.startsWith("--baseline=")) opts.baseline = arg.slice(11); else if (arg.startsWith("--current=")) opts.current = arg.slice(10); else if (arg.startsWith("--output=")) opts.output = arg.slice(9); - else if (arg.startsWith("--threshold-ops=")) opts.thresholdOps = Number(arg.slice(16)); - else if (arg.startsWith("--threshold-mem=")) opts.thresholdMem = 
Number(arg.slice(16)); + else if (arg.startsWith("--threshold-ops=")) + opts.thresholdOps = Number(arg.slice(16)); + else if (arg.startsWith("--threshold-mem=")) + opts.thresholdMem = Number(arg.slice(16)); else if (arg === "--no-baseline") opts.noBaseline = true; else if (arg === "--help" || arg === "-h") { printUsage(); @@ -135,6 +137,12 @@ interface CompareRow { status: "ok" | "improved" | "regression" | "new"; } +// Flat thresholds (ops %, memory %). Variance on CI runners is now +// controlled upstream in run-matrix-ci.sh via `taskset -c 0` CPU pinning +// + median-of-5 runs; see analysis/benchmark-variance-root-cause.md for +// the measurement that showed 76% -> 7% spread after pinning. Keeping +// thresholds flat lets real algorithmic regressions (>5% ops, >10% mem) +// surface without bucket-dependent policy the reviewer has to interpret. function compare( baseline: BenchPayload | null, current: BenchPayload, @@ -224,7 +232,8 @@ function renderMarkdown( out.push(`## ${summaryTitle}`); out.push(""); out.push( - `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`, + `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. ` + + `Runner pinned to CPU 0 via taskset. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`, ); if (opts.baseline) { out.push( diff --git a/benchmarks/scripts/median-results.ts b/benchmarks/scripts/median-results.ts new file mode 100644 index 000000000..b50098177 --- /dev/null +++ b/benchmarks/scripts/median-results.ts @@ -0,0 +1,165 @@ +// Copyright 2021-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// median-results.ts — combine N bench-matrix JSON dumps into a single +// payload whose ops/sec per fixture is the median across runs. +// +// Why +// --- +// Local 5-run measurements on main showed 2x host-level spread on fast +// fixtures (SimpleMessage, GraphQLRequest) even with tinybench's own RME +// under 0.2%. A single-run comparison therefore produces false-positive +// "regressions" whose magnitude is entirely noise. Median-of-N is the +// standard, cheap mitigation: one outlier cannot move the reported number. +// +// Usage +// ----- +// node scripts/median-results.ts runs/run-1.json runs/run-2.json ... > baseline.json +// +// Behaviour +// --------- +// - With a single input file, passes the payload through unchanged — this +// keeps the script safe to use as a no-op step in CI pipelines that +// occasionally reduce to one run (e.g. local development). +// - With N >= 2 inputs, groups rows by `name`, takes the numeric median of +// `opsPerSec` per fixture, and attaches the `rme` / `samples` fields +// from the run whose ops/sec is closest to that median — so downstream +// consumers still see a representative (not synthetic) confidence +// interval. +// - Fixtures missing from some runs are included if they appear in >= 1 +// input; the median is computed across whatever subset is present and a +// warning is emitted to stderr so drift is visible. +// - Output JSON structure matches bench-matrix.ts's payload exactly. 
+
+import { readFileSync } from "node:fs";
+import { argv, exit, stderr, stdout } from "node:process";
+
+interface ResultRow {
+  name: string;
+  opsPerSec: number;
+  rme?: number;
+  samples?: number;
+  bytesPerOp?: number;
+  encodedSize?: number;
+}
+
+interface BenchPayload {
+  node: string;
+  platform: string;
+  timestamp: string;
+  results: ResultRow[];
+}
+
+function loadPayload(path: string): BenchPayload {
+  const raw = readFileSync(path, "utf8").trim();
+  if (raw.startsWith("{")) {
+    return JSON.parse(raw) as BenchPayload;
+  }
+  // Tolerate raw bench-matrix stdout (with table output before the JSON
+  // payload) — same forgiveness rule as compare-results.ts.
+  const jsonStart = raw.lastIndexOf("\n{");
+  if (jsonStart === -1) {
+    throw new Error(`median-results: no JSON payload found in ${path}`);
+  }
+  return JSON.parse(raw.slice(jsonStart + 1)) as BenchPayload;
+}
+
+/**
+ * Numeric median. For even N we return the lower of the two middle values
+ * instead of interpolating — this keeps the output row anchored to an
+ * actually-observed run (so the attached rme/samples remain meaningful).
+ */
+function median(values: number[]): number {
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b);
+  return sorted[Math.floor((sorted.length - 1) / 2)];
+}
+
+function main(): void {
+  const paths = argv.slice(2).filter((a) => !a.startsWith("-"));
+  if (paths.length === 0) {
+    stderr.write(
+      "Usage: median-results.ts <run-1.json> [<run-2.json> ...] > out.json\n",
+    );
+    exit(2);
+  }
+
+  const payloads = paths.map(loadPayload);
+
+  // Single-run fallback: nothing to median, just pass through.
+  if (payloads.length === 1) {
+    stdout.write(`${JSON.stringify(payloads[0], null, 2)}\n`);
+    return;
+  }
+
+  // Collect rows by fixture name across all runs.
+  const byName = new Map<string, ResultRow[]>();
+  for (const payload of payloads) {
+    for (const row of payload.results) {
+      const rows = byName.get(row.name) ?? [];
+      rows.push(row);
+      byName.set(row.name, rows);
+    }
+  }
+
+  const merged: ResultRow[] = [];
+  for (const [name, rows] of byName) {
+    if (rows.length < payloads.length) {
+      stderr.write(
+        `median-results: fixture "${name}" present in ${rows.length}/${payloads.length} runs; median computed across subset.\n`,
+      );
+    }
+    const opsValues = rows.map((r) => r.opsPerSec);
+    const medianOps = median(opsValues);
+    // Pick the row closest to median so rme/samples/bytesPerOp/encodedSize
+    // reflect an actually-observed run, not a synthesized one.
+    let closest = rows[0];
+    let bestDistance = Math.abs(rows[0].opsPerSec - medianOps);
+    for (const r of rows) {
+      const d = Math.abs(r.opsPerSec - medianOps);
+      if (d < bestDistance) {
+        closest = r;
+        bestDistance = d;
+      }
+    }
+    merged.push({
+      name,
+      opsPerSec: medianOps,
+      rme: closest.rme,
+      samples: closest.samples,
+      bytesPerOp: closest.bytesPerOp,
+      encodedSize: closest.encodedSize,
+    });
+  }
+
+  // Envelope metadata: keep node/platform from the first run (they must
+  // match across runs to be comparable; divergence means the operator
+  // did something wrong) and use the latest timestamp.
+  const first = payloads[0];
+  const timestamp = payloads
+    .map((p) => p.timestamp)
+    .sort()
+    .at(-1) as string;
+
+  const out: BenchPayload = {
+    node: first.node,
+    platform: first.platform,
+    timestamp,
+    results: merged,
+  };
+
+  stdout.write(`${JSON.stringify(out, null, 2)}\n`);
+}
+
+main();
diff --git a/benchmarks/scripts/run-matrix-ci.sh b/benchmarks/scripts/run-matrix-ci.sh
index f5ffc3c62..9aafe1502 100755
--- a/benchmarks/scripts/run-matrix-ci.sh
+++ b/benchmarks/scripts/run-matrix-ci.sh
@@ -23,25 +23,51 @@
 # RME shrinks, and a clean stdout stream that contains only the JSON
 # payload so compare-results.ts can read it with fs.readFileSync.
# +# Additionally, this wrapper runs the matrix N times (default 5) and feeds +# the per-run JSON outputs through `scripts/median-results.ts` so the +# reported number is the median across runs. +# +# Variance control — the root-cause investigation +# (analysis/benchmark-variance-root-cause.md) measured a +76% run-to-run +# spread on `ExportTrace::toBinary` unpinned on a heterogeneous P/E-core +# host. Pinning the process to CPU 0 (`taskset -c 0`) collapsed the same +# workload to +7% spread — a 10x reduction. Frame proportions in the CPU +# profiles were identical across slow and fast runs, confirming the +# variance was purely environmental (scheduler migration + intel_pstate +# frequency scaling), not algorithmic. Pinning is therefore the primary +# noise reduction; median-of-5 is the secondary filter. +# # This wrapper: # 1. Logs the host profile (Node version, CPU, RAM) for trace records. -# 2. Does a throwaway warmup run of the matrix so JIT + ICs are warm on +# 2. Detects `taskset` and pins each invocation to CPU 0 when available. +# 3. Does a throwaway warmup run of the matrix so JIT + ICs are warm on # the main benchmark functions. -# 3. Runs the real matrix with CI-sized time budgets. -# 4. Extracts the last JSON object from stdout and writes it to the -# caller-specified output file. +# 4. Runs the real matrix N times (default 5) with CI-sized time budgets. +# 5. Extracts the JSON payload from each run's stdout. +# 6. Computes the per-fixture median and writes it to the output file. # # Usage: benchmarks/scripts/run-matrix-ci.sh [output.json] # defaults to bench-results.json in the current working directory. 
+#
+# Env overrides:
+#   BENCH_MATRIX_RUNS          number of measurement runs (default 5)
+#   BENCH_MATRIX_CI_TIME       per-run measurement ms (default 3000)
+#   BENCH_MATRIX_CI_WARMUP     per-run warmup ms (default 1000)
+#   BENCH_MATRIX_WARMUP_TIME   throwaway warmup pass ms (default 500/200)
 
 set -euo pipefail
 
 out="${1:-bench-results.json}"
+runs="${BENCH_MATRIX_RUNS:-5}"
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCH_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 cd "${BENCH_DIR}"
 
+runs_dir=".bench-runs"
+rm -rf "${runs_dir}"
+mkdir -p "${runs_dir}"
+
 # -------- 1. Host profile (trace only, never fails the job) --------
 echo "::group::Host profile"
 echo "node: $(node --version)"
@@ -55,44 +81,80 @@ fi
 if command -v lscpu >/dev/null 2>&1; then
   lscpu | grep -E "Model name|CPU MHz|CPU max MHz" || true
 fi
+echo "runs: ${runs}"
+
+# -------- 1b. CPU pinning detection --------
+# Pin each measurement invocation to a single CPU to eliminate scheduler
+# migration jitter (primary source of >50% run-to-run variance on hosts
+# with heterogeneous P/E-core topologies). CPU 0 is a P-core on Intel
+# Core Ultra and the first available core on the GitHub ubuntu-latest
+# runner fleet.
+# The array is expanded as ${pin_prefix[@]+"${pin_prefix[@]}"} below: with
+# `set -u`, bash < 4.4 (e.g. macOS /bin/bash 3.2, where taskset is also
+# absent) aborts on "${pin_prefix[@]}" when the array is empty.
+if command -v taskset >/dev/null 2>&1; then
+  pin_prefix=(taskset -c 0)
+  echo "cpu pinning: enabled (taskset -c 0)"
+else
+  pin_prefix=()
+  echo "cpu pinning: DISABLED (taskset not available) — results will be noisy"
+fi
 echo "::endgroup::"
 
 # -------- 2. Warmup pass (discarded) --------
 echo "::group::Warmup"
 BENCH_MATRIX_TIME="${BENCH_MATRIX_WARMUP_TIME:-500}" \
 BENCH_MATRIX_WARMUP="${BENCH_MATRIX_WARMUP_TIME:-200}" \
-  npx tsx src/bench-matrix.ts >/dev/null 2>&1 || true
+  ${pin_prefix[@]+"${pin_prefix[@]}"} npx tsx src/bench-matrix.ts >/dev/null 2>&1 || true
 echo "Warmup complete."
 echo "::endgroup::"
 
-# -------- 3. Measurement pass --------
-echo "::group::Measurement"
-BENCH_MATRIX_TIME="${BENCH_MATRIX_CI_TIME:-3000}" \
-BENCH_MATRIX_WARMUP="${BENCH_MATRIX_CI_WARMUP:-1000}" \
-  npx tsx src/bench-matrix.ts | tee ".bench-stdout.log"
-echo "::endgroup::"
-
-# -------- 4. Extract JSON payload --------
-# bench-matrix.ts prints human-readable tables and then one line of
-# `=== Matrix JSON ===` followed by a single-line JSON object. Grab the
-# last line that starts with '{' as the payload.
-node -e '
-const fs = require("node:fs");
-const out = process.argv[1];
-const lines = fs.readFileSync(".bench-stdout.log", "utf8").split("\n");
-let payload = null;
-for (let i = lines.length - 1; i >= 0; i--) {
-  const ln = lines[i].trim();
-  if (ln.startsWith("{") && ln.endsWith("}")) {
-    try { payload = JSON.parse(ln); break; } catch { /* keep looking */ }
-  }
-}
-if (!payload) {
-  console.error("run-matrix-ci: could not locate JSON payload in bench-matrix output.");
-  process.exit(1);
+# -------- 3. Measurement passes --------
+extract_json() {
+  # $1 = stdout log, $2 = output json path
+  node -e '
+  const fs = require("node:fs");
+  const src = process.argv[1];
+  const dst = process.argv[2];
+  const lines = fs.readFileSync(src, "utf8").split("\n");
+  let payload = null;
+  for (let i = lines.length - 1; i >= 0; i--) {
+    const ln = lines[i].trim();
+    if (ln.startsWith("{") && ln.endsWith("}")) {
+      try { payload = JSON.parse(ln); break; } catch { /* keep looking */ }
+    }
+  }
+  if (!payload) {
+    console.error(`run-matrix-ci: could not locate JSON payload in ${src}.`);
+    process.exit(1);
+  }
+  fs.writeFileSync(dst, JSON.stringify(payload, null, 2) + "\n");
+  console.error(`run-matrix-ci: wrote ${payload.results.length} rows to ${dst}.`);
+  ' "$1" "$2"
 }
-fs.writeFileSync(out, JSON.stringify(payload, null, 2) + "\n");
-console.error(`run-matrix-ci: wrote ${payload.results.length} result rows to ${out}.`);
-' "$out"
 
-rm -f .bench-stdout.log
+# Fail fast on a zero / non-numeric run count; otherwise no run-*.json may
+# be produced and median-results.ts dies on the unmatched glob.
+if ! [[ "${runs}" =~ ^[1-9][0-9]*$ ]]; then
+  echo "run-matrix-ci: BENCH_MATRIX_RUNS must be a positive integer (got '${runs}')." >&2
+  exit 2
+fi
+
+for i in $(seq 1 "${runs}"); do
+  echo "::group::Measurement run ${i}/${runs}"
+  log=".bench-stdout-${i}.log"
+  BENCH_MATRIX_TIME="${BENCH_MATRIX_CI_TIME:-3000}" \
+  BENCH_MATRIX_WARMUP="${BENCH_MATRIX_CI_WARMUP:-1000}" \
+    ${pin_prefix[@]+"${pin_prefix[@]}"} npx tsx src/bench-matrix.ts | tee "${log}"
+  extract_json "${log}" "${runs_dir}/run-${i}.json"
+  rm -f "${log}"
+  echo "::endgroup::"
+done
+
+# -------- 4. Compute median across runs --------
+echo "::group::Compute median across ${runs} run(s)"
+npx tsx scripts/median-results.ts "${runs_dir}"/run-*.json > "${out}"
+echo "run-matrix-ci: wrote median payload to ${out}."
+echo "::endgroup::"
+
+rm -rf "${runs_dir}"