diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
new file mode 100644
index 000000000..aa16f3c6b
--- /dev/null
+++ b/.github/workflows/benchmark.yaml
@@ -0,0 +1,166 @@
+name: benchmark
+
+# Runs the benchmark matrix on every PR targeting main, on pushes to main
+# (baseline refresh), and on manual dispatch. The job compares PR ops/s
+# against the latest main baseline and flags >5% throughput regressions
+# and >10% memory regressions.
+#
+# Storage model (see benchmarks/baselines/README.md):
+#   - Artifacts hold the authoritative JSON for trend history (90-day
+#     retention for PRs, 365 days for the main baseline).
+#   - `benchmarks/baselines/main.json` is an in-repo quick-reference copy
+#     that gets refreshed by a follow-up chore PR after a merge to main.
+
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - "packages/protobuf/**"
+      - "benchmarks/**"
+      - ".github/workflows/benchmark.yaml"
+  push:
+    branches: [main]
+    paths:
+      - "packages/protobuf/**"
+      - "benchmarks/**"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  pull-requests: write
+
+env:
+  DO_NOT_TRACK: 1
+  NODE_VERSION: "22"
+
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  bench-matrix:
+    name: bench-matrix (${{ github.event_name }})
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          # Enough history that we can also check out the base branch for
+          # the baseline comparison pass if the artifact download fails.
+          fetch-depth: 0
+
+      - name: Setup Node ${{ env.NODE_VERSION }}
+        uses: actions/setup-node@v6
+        with:
+          node-version: ${{ env.NODE_VERSION }}
+          cache: "npm"
+
+      - name: Install
+        run: npm ci --ignore-scripts
+        env:
+          HUSKY: 0
+
+      - name: Build @bufbuild/protobuf
+        run: npx turbo run build --filter=@bufbuild/protobuf
+
+      - name: Generate benchmark code (proto + pbjs)
+        run: npx turbo run generate --filter=@bufbuild/protobuf-benchmarks
+
+      - name: Run benchmark matrix
+        working-directory: benchmarks
+        run: bash scripts/run-matrix-ci.sh bench-results.json
+
+      - name: Upload run artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-results-${{ github.event.pull_request.number || github.sha }}
+          path: benchmarks/bench-results.json
+          retention-days: 90
+
+      # ------------------------------------------------------------------
+      # Baseline acquisition (PR only). For `push` to main, the push run
+      # becomes the new baseline — the artifact upload below is sufficient.
+      # ------------------------------------------------------------------
+
+      - name: Download latest main baseline artifact
+        if: github.event_name == 'pull_request'
+        id: dl-baseline
+        uses: dawidd6/action-download-artifact@v6
+        continue-on-error: true
+        with:
+          workflow: benchmark.yaml
+          branch: main
+          name: bench-baseline-main
+          path: benchmarks/baseline-download
+          search_artifacts: true
+          if_no_artifact_found: warn
+
+      - name: Resolve baseline source
+        if: github.event_name == 'pull_request'
+        run: |
+          set -euo pipefail
+          if [[ -f benchmarks/baseline-download/bench-results.json ]]; then
+            cp benchmarks/baseline-download/bench-results.json benchmarks/baseline-results.json
+            echo "Using downloaded main artifact as baseline."
+          elif [[ -f benchmarks/baselines/main.json ]]; then
+            cp benchmarks/baselines/main.json benchmarks/baseline-results.json
+            echo "Using in-repo benchmarks/baselines/main.json as baseline."
+          else
+            echo "No baseline available — compare step will emit an informational report."
+          fi
+
+      - name: Compare PR against baseline
+        if: github.event_name == 'pull_request'
+        id: compare
+        working-directory: benchmarks
+        run: |
+          set -euo pipefail
+          if [[ -f baseline-results.json ]]; then
+            npx tsx scripts/compare-results.ts \
+              --baseline=baseline-results.json \
+              --current=bench-results.json \
+              --output=bench-report.md \
+              --threshold-ops=5 \
+              --threshold-mem=10
+          else
+            npx tsx scripts/compare-results.ts \
+              --current=bench-results.json \
+              --output=bench-report.md \
+              --threshold-ops=5 \
+              --threshold-mem=10 \
+              --no-baseline
+          fi
+          if grep -q "REGRESSION" bench-report.md 2>/dev/null; then
+            echo "status=regression" >> "$GITHUB_OUTPUT"
+          else
+            echo "status=ok" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Comment report on PR
+        if: github.event_name == 'pull_request'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: benchmark-matrix
+          path: benchmarks/bench-report.md
+
+      - name: Flag regression annotation
+        if: github.event_name == 'pull_request' && steps.compare.outputs.status == 'regression'
+        run: |
+          echo "::warning::Benchmark matrix flagged a regression. See the PR comment for the full table."
+
+      # ------------------------------------------------------------------
+      # Baseline refresh (push-to-main only). The push run becomes the new
+      # authoritative baseline and gets uploaded as a stable-named artifact
+      # so subsequent PR jobs can pull it.
+      # ------------------------------------------------------------------
+
+      - name: Upload baseline artifact (main only)
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-baseline-main
+          path: benchmarks/bench-results.json
+          retention-days: 365
+          overwrite: true
diff --git a/benchmarks/baselines/README.md b/benchmarks/baselines/README.md
new file mode 100644
index 000000000..02bee96d7
--- /dev/null
+++ b/benchmarks/baselines/README.md
@@ -0,0 +1,59 @@
+# Benchmark Baselines
+
+This directory holds **quick-reference baselines** — last-known-good throughput numbers for the matrix, committed alongside code so a local developer can run `scripts/compare-results.ts` without having to hit GitHub.
+
+## Storage model
+
+The authoritative baseline is an **Actions artifact**, not a file in the repo:
+
+| Where | Role | Retention |
+|----------------------------------------------|---------------------------------------|------------------|
+| Artifact `bench-baseline-main` | Source of truth for CI diffs | 365 days |
+| `benchmarks/baselines/main.json` | Quick-reference for local dev + fallback when the artifact API is unreachable | tracked in git |
+| Artifact `bench-results-<PR number>` | Historical trend per PR | 90 days |
+
+### Why two stores
+
+1. **Artifacts give trend history for free.** Downloading the `bench-baseline-main` artifact from any past run reconstructs the baseline of that day. No file churn in git.
+2. **A committed fallback de-risks the artifact dependency.** If GitHub artifact downloads rate-limit or the action times out, CI falls back to `main.json` so PRs are never blocked on infrastructure. The file does not have to be fresh to be useful — being roughly right on order of magnitude is enough to flag a regression that a human can investigate.
+3. **Local dev needs a zero-network path.** `npm run bench:matrix:ci` followed by `npm run bench:matrix:compare -- --baseline=baselines/main.json --current=bench-results.json` works entirely offline.
+
+## Update procedure
+
+`main.json` is refreshed **by hand via a one-line PR** after every merge to `main` whose benchmark numbers moved materially (>5% on any row). A follow-up iteration will automate this via a `benchmark-baseline-refresh` workflow that opens the PR from the push-to-main run, but until that lands, manual refresh is the policy.
+
+```bash
+# After a merge to main, pull the latest artifact:
+gh run download --name bench-baseline-main --dir /tmp/baseline
+cp /tmp/baseline/bench-results.json benchmarks/baselines/main.json
+
+# Commit on a chore/ branch and open a PR:
+git checkout -b chore/refresh-benchmark-baseline
+git add benchmarks/baselines/main.json
+git commit -m "chore(benchmarks): refresh main baseline"
+git push -u origin HEAD
+gh pr create --title "chore(benchmarks): refresh main baseline" \
+  --body "Auto-refresh from the bench-baseline-main CI artifact."
+```
+
+## Format
+
+Every `*.json` in this directory is the structured payload written by `bench-matrix.ts` (last line of its stdout when run standalone, or the full file when run via `scripts/run-matrix-ci.sh`):
+
+```json
+{
+  "node": "v22.11.0",
+  "platform": "linux/x64",
+  "timestamp": "2026-04-19T18:00:00.000Z",
+  "results": [
+    {
+      "name": "SimpleMessage :: toBinary (pre-built, 19 B)",
+      "opsPerSec": 1065000,
+      "rme": 1.3,
+      "samples": 512
+    }
+  ]
+}
+```
+
+Field names are stable; additional fields are additive (e.g. a future `bytesPerOp` for memory tracking will not break existing consumers).
diff --git a/benchmarks/package.json b/benchmarks/package.json
index 55efeb1f6..bbff26b13 100644
--- a/benchmarks/package.json
+++ b/benchmarks/package.json
@@ -11,6 +11,8 @@
     "bench:fromBinary": "tsx src/bench-fromBinary.ts",
     "bench:comparison": "tsx src/bench-comparison-protobufjs.ts",
     "bench:matrix": "tsx src/bench-matrix.ts",
+    "bench:matrix:ci": "bash scripts/run-matrix-ci.sh bench-results.json",
+    "bench:matrix:compare": "tsx scripts/compare-results.ts",
     "bench:memory": "node --expose-gc --import tsx src/bench-memory.ts",
     "bench:report": "tsx src/report.ts",
     "build": "../node_modules/typescript/bin/tsc --noEmit",
diff --git a/benchmarks/scripts/compare-results.ts b/benchmarks/scripts/compare-results.ts
new file mode 100644
index 000000000..ae9d4fbb5
--- /dev/null
+++ b/benchmarks/scripts/compare-results.ts
@@ -0,0 +1,286 @@
+// Copyright 2021-2026 Buf Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compare-results.ts — diff two bench-matrix JSON dumps and emit a
+// Markdown regression table. Input is the structured payload written by
+// src/bench-matrix.ts (the last `=== Matrix JSON ===` block when run
+// standalone; run-matrix-ci.sh strips everything except the JSON object
+// so this script can fs.readFileSync() it directly).
+//
+// Thresholds are configurable on the CLI so CI can tune leniency without
+// touching this file. Defaults match the Phase 2 contract: 5% throughput,
+// 10% memory.
+
+import { readFileSync, writeFileSync } from "node:fs";
+import { argv, exit } from "node:process";
+
+interface ResultRow {
+  name: string;
+  opsPerSec: number;
+  rme: number;
+  samples: number;
+  /**
+   * Optional — bench-matrix currently does not emit per-op memory. The
+   * field is wired here so a follow-up fixture addition is a one-line
+   * change in the runner, not a script rewrite.
+   */
+  bytesPerOp?: number;
+}
+
+interface BenchPayload {
+  node: string;
+  platform: string;
+  timestamp: string;
+  results: ResultRow[];
+}
+
+interface Options {
+  baseline: string | null;
+  current: string;
+  output: string;
+  thresholdOps: number;
+  thresholdMem: number;
+  noBaseline: boolean;
+}
+
+function parseArgs(): Options {
+  const opts: Options = {
+    baseline: null,
+    current: "bench-results.json",
+    output: "bench-report.md",
+    thresholdOps: 5,
+    thresholdMem: 10,
+    noBaseline: false,
+  };
+  for (const arg of argv.slice(2)) {
+    if (arg.startsWith("--baseline=")) opts.baseline = arg.slice("--baseline=".length);
+    else if (arg.startsWith("--current=")) opts.current = arg.slice("--current=".length);
+    else if (arg.startsWith("--output=")) opts.output = arg.slice("--output=".length);
+    else if (arg.startsWith("--threshold-ops=")) opts.thresholdOps = Number(arg.slice("--threshold-ops=".length));
+    else if (arg.startsWith("--threshold-mem=")) opts.thresholdMem = Number(arg.slice("--threshold-mem=".length));
+    else if (arg === "--no-baseline") opts.noBaseline = true;
+    else if (arg === "--help" || arg === "-h") {
+      printUsage();
+      exit(0);
+    } else {
+      console.error(`Unknown argument: ${arg}`);
+      printUsage();
+      exit(2);
+    }
+  }
+  return opts;
+}
+
+function printUsage(): void {
+  console.error(`Usage: compare-results.ts [options]
+  --baseline=PATH      Path to baseline JSON (omit for --no-baseline)
+  --current=PATH       Path to current run JSON (default bench-results.json)
+  --output=PATH        Markdown report path (default bench-report.md)
+  --threshold-ops=N    Throughput regression threshold, % (default 5)
+  --threshold-mem=N    Memory regression threshold, % (default 10)
+  --no-baseline        Emit current-only report (first run on a fork)`);
+}
+
+function loadPayload(path: string): BenchPayload {
+  const raw = readFileSync(path, "utf8").trim();
+  // run-matrix-ci.sh produces a pure JSON file. When a developer points
+  // the script at a raw bench-matrix stdout dump, tolerate the "=== Matrix
+  // JSON ===" sentinel by locating the last top-level '{' in the text.
+  if (raw.startsWith("{")) {
+    return JSON.parse(raw) as BenchPayload;
+  }
+  const jsonStart = raw.lastIndexOf("\n{");
+  if (jsonStart === -1) {
+    throw new Error(`compare-results: no JSON payload found in ${path}`);
+  }
+  return JSON.parse(raw.slice(jsonStart + 1)) as BenchPayload;
+}
+
+function fmtOps(n: number): string {
+  if (!Number.isFinite(n) || n === 0) return "–";
+  if (n >= 1000) return n.toLocaleString("en-US", { maximumFractionDigits: 0 });
+  return n.toFixed(1);
+}
+
+function fmtBytes(n: number | undefined): string {
+  if (n === undefined || !Number.isFinite(n)) return "–";
+  return n.toLocaleString("en-US", { maximumFractionDigits: 0 });
+}
+
+function fmtDelta(pct: number): string {
+  if (!Number.isFinite(pct)) return "–";
+  const sign = pct >= 0 ? "+" : "";
+  return `${sign}${pct.toFixed(1)}%`;
+}
+
+interface CompareRow {
+  name: string;
+  baselineOps: number | null;
+  currentOps: number;
+  opsDeltaPct: number | null;
+  baselineMem: number | null;
+  currentMem: number | null;
+  memDeltaPct: number | null;
+  status: "ok" | "improved" | "regression" | "new";
+}
+
+function compare(
+  baseline: BenchPayload | null,
+  current: BenchPayload,
+  thresholdOps: number,
+  thresholdMem: number,
+): CompareRow[] {
+  const baseMap = new Map<string, ResultRow>();
+  if (baseline) {
+    for (const r of baseline.results) baseMap.set(r.name, r);
+  }
+
+  const rows: CompareRow[] = [];
+  for (const cur of current.results) {
+    const base = baseMap.get(cur.name);
+    let opsDelta: number | null = null;
+    let memDelta: number | null = null;
+    if (base && base.opsPerSec > 0) {
+      opsDelta = ((cur.opsPerSec - base.opsPerSec) / base.opsPerSec) * 100;
+    }
+    if (
+      base?.bytesPerOp !== undefined &&
+      cur.bytesPerOp !== undefined &&
+      base.bytesPerOp > 0
+    ) {
+      memDelta = ((cur.bytesPerOp - base.bytesPerOp) / base.bytesPerOp) * 100;
+    }
+
+    let status: CompareRow["status"] = "ok";
+    if (!base) {
+      status = "new";
+    } else if (opsDelta !== null && opsDelta <= -thresholdOps) {
+      status = "regression";
+    } else if (memDelta !== null && memDelta >= thresholdMem) {
+      status = "regression";
+    } else if (opsDelta !== null && opsDelta >= thresholdOps) {
+      status = "improved";
+    }
+
+    rows.push({
+      name: cur.name,
+      baselineOps: base?.opsPerSec ?? null,
+      currentOps: cur.opsPerSec,
+      opsDeltaPct: opsDelta,
+      baselineMem: base?.bytesPerOp ?? null,
+      currentMem: cur.bytesPerOp ?? null,
+      memDeltaPct: memDelta,
+      status,
+    });
+  }
+  return rows;
+}
+
+function statusBadge(s: CompareRow["status"]): string {
+  switch (s) {
+    case "regression":
+      return "REGRESSION";
+    case "improved":
+      return "improved";
+    case "new":
+      return "new";
+    case "ok":
+      return "ok";
+  }
+}
+
+function renderMarkdown(
+  rows: CompareRow[],
+  opts: {
+    baseline: BenchPayload | null;
+    current: BenchPayload;
+    thresholdOps: number;
+    thresholdMem: number;
+  },
+): string {
+  const regressionCount = rows.filter((r) => r.status === "regression").length;
+  const improvedCount = rows.filter((r) => r.status === "improved").length;
+  const newCount = rows.filter((r) => r.status === "new").length;
+  const unchangedCount =
+    rows.length - regressionCount - improvedCount - newCount;
+
+  const summaryTitle =
+    regressionCount > 0
+      ? `Benchmark: ${regressionCount} regression(s)`
+      : "Benchmark: no regressions";
+
+  const out: string[] = [];
+  out.push(`## ${summaryTitle}`);
+  out.push("");
+  out.push(
+    `Thresholds: throughput regression \`>${opts.thresholdOps}%\`, memory regression \`>${opts.thresholdMem}%\`. Current run on \`${opts.current.platform}\`, Node \`${opts.current.node}\`, captured \`${opts.current.timestamp}\`.`,
+  );
+  if (opts.baseline) {
+    out.push(
+      `Baseline captured \`${opts.baseline.timestamp}\` on \`${opts.baseline.platform}\`, Node \`${opts.baseline.node}\`.`,
+    );
+  } else {
+    out.push("No baseline available — this report is informational only.");
+  }
+  out.push("");
+  out.push(
+    `Summary: \`${regressionCount}\` regressed, \`${improvedCount}\` improved, \`${newCount}\` new, \`${unchangedCount}\` unchanged.`,
+  );
+  out.push("");
+  out.push(
+    "| Fixture | Baseline ops/s | PR ops/s | Δ ops | Baseline B/op | PR B/op | Δ mem | Status |",
+  );
+  out.push(
+    "|---------|---------------:|---------:|------:|--------------:|--------:|------:|:-------|",
+  );
+  for (const r of rows) {
+    out.push(
+      `| ${r.name} | ${r.baselineOps === null ? "–" : fmtOps(r.baselineOps)} | ${fmtOps(r.currentOps)} | ${r.opsDeltaPct === null ? "–" : fmtDelta(r.opsDeltaPct)} | ${fmtBytes(r.baselineMem ?? undefined)} | ${fmtBytes(r.currentMem ?? undefined)} | ${r.memDeltaPct === null ? "–" : fmtDelta(r.memDeltaPct)} | ${statusBadge(r.status)} |`,
    );
+  }
+  out.push("");
+  out.push(
+    "_Produced by `benchmarks/scripts/compare-results.ts`. Artifacts: `bench-results-<PR number>` (current), `bench-baseline-main` (baseline)._",
+  );
+  out.push("");
+  return out.join("\n");
+}
+
+function main(): void {
+  const opts = parseArgs();
+  const current = loadPayload(opts.current);
+  const baseline =
+    opts.noBaseline || !opts.baseline ? null : loadPayload(opts.baseline);
+  const rows = compare(baseline, current, opts.thresholdOps, opts.thresholdMem);
+  const md = renderMarkdown(rows, {
+    baseline,
+    current,
+    thresholdOps: opts.thresholdOps,
+    thresholdMem: opts.thresholdMem,
+  });
+  writeFileSync(opts.output, md, "utf8");
+  // Print to stdout for local/dev runs so developers don't have to open
+  // the file to see the result.
+  console.log(md);
+  const hasRegression = rows.some((r) => r.status === "regression");
+  // Exit 0 even on regression — the workflow surfaces the flag via the
+  // PR comment + a ::warning:: annotation. Hard-failing the job would
+  // block legitimate PRs that intentionally trade throughput for another
+  // gain (e.g. bundle size). Tech-lead can promote to hard-fail later.
+  if (hasRegression) {
+    console.error("compare-results: REGRESSION flagged (non-fatal).");
+  }
+}
+
+main();
diff --git a/benchmarks/scripts/run-matrix-ci.sh b/benchmarks/scripts/run-matrix-ci.sh
new file mode 100755
index 000000000..f5ffc3c62
--- /dev/null
+++ b/benchmarks/scripts/run-matrix-ci.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Copyright 2021-2026 Buf Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# run-matrix-ci.sh — stable wrapper around `tsx src/bench-matrix.ts`.
+#
+# Intent
+# ------
+# bench-matrix.ts's default knobs (1000 ms measurement, 200 ms warmup) are
+# tuned for local development — fast feedback, noisy numbers. CI needs the
+# opposite: longer warmup so the V8 tier-up settles, longer measurement so
+# RME shrinks, and a clean stdout stream that contains only the JSON
+# payload so compare-results.ts can read it with fs.readFileSync.
+#
+# This wrapper:
+#   1. Logs the host profile (Node version, CPU, RAM) for trace records.
+#   2. Does a throwaway warmup run of the matrix so JIT + ICs are warm on
+#      the main benchmark functions.
+#   3. Runs the real matrix with CI-sized time budgets.
+#   4. Extracts the last JSON object from stdout and writes it to the
+#      caller-specified output file.
+#
+# Usage: benchmarks/scripts/run-matrix-ci.sh [output.json]
+#        defaults to bench-results.json in the current working directory.
+
+set -euo pipefail
+
+out="${1:-bench-results.json}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCH_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+cd "${BENCH_DIR}"
+
+# -------- 1. Host profile (trace only, never fails the job) --------
+echo "::group::Host profile"
+echo "node: $(node --version)"
+echo "platform: $(uname -srm)"
+if command -v nproc >/dev/null 2>&1; then
+  echo "cpus: $(nproc)"
+fi
+if [[ -r /proc/meminfo ]]; then
+  echo "mem: $(awk '/MemTotal/ {printf "%.1f GB", $2/1024/1024}' /proc/meminfo)"
+fi
+if command -v lscpu >/dev/null 2>&1; then
+  lscpu | grep -E "Model name|CPU MHz|CPU max MHz" || true
+fi
+echo "::endgroup::"
+
+# -------- 2. Warmup pass (discarded) --------
+# NOTE: the pass's measurement time and its own warmup are independent
+# knobs; driving both from one env var would force them equal.
+echo "::group::Warmup"
+BENCH_MATRIX_TIME="${BENCH_MATRIX_WARMUP_PASS_TIME:-500}" \
+BENCH_MATRIX_WARMUP="${BENCH_MATRIX_WARMUP_PASS_WARMUP:-200}" \
+  npx tsx src/bench-matrix.ts >/dev/null 2>&1 || true
+echo "Warmup complete."
+echo "::endgroup::"
+
+# -------- 3. Measurement pass --------
+echo "::group::Measurement"
+BENCH_MATRIX_TIME="${BENCH_MATRIX_CI_TIME:-3000}" \
+BENCH_MATRIX_WARMUP="${BENCH_MATRIX_CI_WARMUP:-1000}" \
+  npx tsx src/bench-matrix.ts | tee ".bench-stdout.log"
+echo "::endgroup::"
+
+# -------- 4. Extract JSON payload --------
+# bench-matrix.ts prints human-readable tables and then one line of
+# `=== Matrix JSON ===` followed by a single-line JSON object. Grab the
+# last line that starts with '{' as the payload.
+node -e '
+const fs = require("node:fs");
+const out = process.argv[1];
+const lines = fs.readFileSync(".bench-stdout.log", "utf8").split("\n");
+let payload = null;
+for (let i = lines.length - 1; i >= 0; i--) {
+  const ln = lines[i].trim();
+  if (ln.startsWith("{") && ln.endsWith("}")) {
+    try { payload = JSON.parse(ln); break; } catch { /* keep looking */ }
+  }
+}
+if (!payload) {
+  console.error("run-matrix-ci: could not locate JSON payload in bench-matrix output.");
+  process.exit(1);
+}
+fs.writeFileSync(out, JSON.stringify(payload, null, 2) + "\n");
+console.error(`run-matrix-ci: wrote ${payload.results.length} result rows to ${out}.`);
+' "$out"
+
+rm -f .bench-stdout.log
diff --git a/benchmarks/src/bench-matrix.ts b/benchmarks/src/bench-matrix.ts
index 6266d6d78..0f1e22f45 100644
--- a/benchmarks/src/bench-matrix.ts
+++ b/benchmarks/src/bench-matrix.ts
@@ -154,7 +154,12 @@ const cases: MatrixCase[] = [
 ];
 
 export async function runMatrixBench() {
-  const bench = new Bench({ time: 1000, warmupTime: 200 });
+  // Time budgets: defaults tuned for local dev feedback. CI sets
+  // BENCH_MATRIX_TIME / BENCH_MATRIX_WARMUP via run-matrix-ci.sh to get
+  // tighter RME on long-running hosts. Values are ms.
+  const time = Number(process.env.BENCH_MATRIX_TIME) || 1000;
+  const warmupTime = Number(process.env.BENCH_MATRIX_WARMUP) || 200;
+  const bench = new Bench({ time, warmupTime });
   // Pre-build every message + pre-encoded bytes outside the measurement
   // loop so the encoder/decoder benchmarks reflect the encode/decode walk