diff --git a/.github/workflows/test-mcp-regression.yml b/.github/workflows/test-mcp-regression.yml index 9ed2c63e..8b98bdca 100644 --- a/.github/workflows/test-mcp-regression.yml +++ b/.github/workflows/test-mcp-regression.yml @@ -244,14 +244,21 @@ jobs: # bind handler / skill / fixture must keep recall ≥ 0.80, precision # ≥ 0.85, abort_rate ≤ 0.30 — OR explicitly re-record the cache # after a deliberate skill-prompt change. "Deliberate not drift." - - name: M2 grounding-recall eval (hard gate) + # #537: this metric is produced by a non-deterministic caller LLM and + # flaked the hard gate on unrelated PRs (recall swung 0.783–0.957 on an + # identical `main` base across #534/#535/#536). Quality thresholds + # (recall ≥ 0.80 etc.) are now warn-only — emitted to the run summary + # below but advisory — while a catastrophic floor (--catastrophic-recall, + # default 0.50) still hard-fails CI in any mode when the grounding path + # genuinely collapses. Per the gating-as-observability doctrine. + - name: M2 grounding-recall eval (warn + catastrophic floor) if: matrix.os == 'ubuntu-latest' env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} BICAMERAL_GROUNDING_EVAL_MODEL: claude-haiku-4-5-20251001 run: > python tests/eval_grounding_recall.py - --gate-mode hard + --gate-mode warn -o test-results/m2-grounding-recall.json # ── Surface M2 metrics on the GitHub run summary (#280 PR-3) ─── @@ -320,24 +327,30 @@ jobs: # BICAMERAL_PREFLIGHT_EVAL_RECORD=1 / # BICAMERAL_PREFLIGHT_INVOCATION_EVAL_RECORD=1 after a deliberate # skill-prompt change. "Deliberate not drift." - - name: M_skill_preflight Step-1 (hard gate) + # #537: both steps are caller-LLM driven (same nondeterminism that flaked + # M2). Quality thresholds (Step-1 per-axis recall ≥ 0.70; Step-0 + # should-invoke recall ≥ 0.50, fp_rate ≤ 0.30) are warn-only — emitted to + # the run summary below — while catastrophic floors (--catastrophic-recall, + # defaults 0.40 / 0.25) still hard-fail CI in any mode on a genuine path + # collapse. Per the gating-as-observability doctrine. + - name: M_skill_preflight Step-1 (warn + catastrophic floor) if: matrix.os == 'ubuntu-latest' env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} BICAMERAL_PREFLIGHT_EVAL_MODEL: claude-haiku-4-5-20251001 run: > python tests/eval_preflight_skill_step1.py - --gate-mode hard + --gate-mode warn -o test-results/skill-step1.json - - name: M_skill_preflight Step-0 (hard gate) + - name: M_skill_preflight Step-0 (warn + catastrophic floor) if: matrix.os == 'ubuntu-latest' env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL: claude-haiku-4-5-20251001 run: > python tests/eval_preflight_skill_invocation.py - --gate-mode hard + --gate-mode warn -o test-results/skill-step0.json # ── Surface M_skill_preflight metrics on the GitHub run summary ── diff --git a/tests/eval/_gate.py b/tests/eval/_gate.py new file mode 100644 index 00000000..31c17f26 --- /dev/null +++ b/tests/eval/_gate.py @@ -0,0 +1,57 @@ +"""Two-tier CI gate policy shared by the LLM-driven eval runners (#537). + +Quality thresholds (recall / precision / abort-rate / fp-rate) are advisory in +``warn`` mode because the underlying metric is produced by a non-deterministic +caller LLM and flakes around the threshold. A separate *catastrophic floor* +hard-fails CI in any mode — it fires only when the metric collapses far below +the quality target, which signals a genuinely broken grounding / preflight path +rather than ordinary run-to-run variance. + +Originating flake: M2 grounding-recall swung 0.783–0.957 on an identical ``main`` +base across PRs #534 / #535 / #536 (a docs-only diff tripped the 0.80 hard gate). +See GitHub #537. This codifies the gating-as-observability doctrine: WARN + emit +on the quality signal, hard-fail only on a real lower-layer break. +""" + +from __future__ import annotations + +from collections.abc import Sequence + + +def gate_exit_code( + *, + quality_breaches: Sequence[str], + catastrophic_breaches: Sequence[str], + gate_mode: str, +) -> int: + """Decide a CI exit code from two breach tiers. + + - A non-empty ``catastrophic_breaches`` always returns ``1`` (hard floor), + regardless of ``gate_mode`` — a collapsed metric is a real failure. + - ``quality_breaches`` returns ``1`` only when ``gate_mode == "hard"``, + preserving the legacy opt-in hard-gate contract. + - Otherwise returns ``0`` (clean, or a warn-only quality breach). + """ + if catastrophic_breaches: + return 1 + if quality_breaches and gate_mode == "hard": + return 1 + return 0 + + +def is_inconclusive(error_count: int, total: int, *, max_error_rate: float = 0.5) -> bool: + """True when too many eval cases failed to *execute* (not failed to ground). + + A run where most cases erred — e.g. a missing API key or network outage on a + CI re-run — zeroes the quality metric without producing any real signal. That + is inconclusive, not catastrophic: the catastrophic floor (#537) must abstain + on it rather than hard-fail, otherwise an auth/infra hiccup masquerades as a + grounding collapse. A genuine collapse shows as low recall with the eval + actually running (a low error rate). + + Returns True when ``total <= 0`` (nothing ran) or the error rate reaches + ``max_error_rate`` (default 0.5). + """ + if total <= 0: + return True + return (error_count / total) >= max_error_rate diff --git a/tests/eval_grounding_recall.py b/tests/eval_grounding_recall.py index d2d7f3d3..be5cdd9a 100644 --- a/tests/eval_grounding_recall.py +++ b/tests/eval_grounding_recall.py @@ -51,6 +51,7 @@ sys.path.insert(0, str(REPO_ROOT / "tests" / "fixtures" / "grounding_recall")) from _bind_judge import BindJudgment, fixture_exists, run_bind_judgment # type: ignore[import-not-found] # noqa: E402, I001 +from _gate import gate_exit_code, is_inconclusive # type: ignore[import-not-found] # noqa: E402, I001 from dataset import ALL_CASES, GENERATOR_VERSION, GroundingCase, cases_by_type # type: ignore[import-not-found] # noqa: E402, I001 FIXTURE_REPO = REPO_ROOT / "tests" / "fixtures" / "grounding_recall" / "repo" @@ -306,6 +307,25 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]: breaches.append(f"abort_rate {summary['abort_rate']:.3f} > {args.max_abort_rate}") summary["gate_breaches"] = breaches + # Catastrophic floor (#537): hard-fails CI in any mode — but ONLY on a + # genuine grounding collapse, never on an eval that couldn't run. A high + # eval_error rate (e.g. missing API key / network on a CI re-run) zeroes + # recall without any grounding signal; that is inconclusive, not + # catastrophic, so we abstain and warn. A real collapse shows as low recall + # with the eval actually running (a low error rate). + total_cases = summary["total_cases"] + eval_errors = sum(1 for r in rows if r.get("outcome") == "eval_error") + error_rate = (eval_errors / total_cases) if total_cases else 0.0 + summary["eval_error_rate"] = error_rate + inconclusive = is_inconclusive(eval_errors, total_cases) + + catastrophic: list[str] = [] + if not inconclusive and summary["recall"] < args.catastrophic_recall: + catastrophic.append( + f"recall {summary['recall']:.3f} < catastrophic floor {args.catastrophic_recall}" + ) + summary["catastrophic_breaches"] = catastrophic + report = {"summary": summary, "rows": rows} if args.output: @@ -322,13 +342,24 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]: print(f" avg_turns : {summary['avg_turns']}") print(f" tokens : {summary['tokens_in_total']} in / {summary['tokens_out_total']} out") if breaches: - print(f" ⚠ gate breaches: {'; '.join(breaches)}") + mode_note = "advisory (warn)" if args.gate_mode != "hard" else "gate" + print(f" ⚠ {mode_note} breaches: {'; '.join(breaches)}") else: print(" ✓ all gates pass") - - if breaches and args.gate_mode == "hard": - return report, 1 - return report, 0 + if catastrophic: + print(f" ✗ CATASTROPHIC floor breached (hard-fails any mode): {'; '.join(catastrophic)}") + if inconclusive: + print( + f" ⓘ inconclusive: {eval_errors}/{total_cases} cases errored " + f"(rate {error_rate:.2f}) — catastrophic floor abstained " + "(eval could not run; not a grounding collapse)" + ) + + return report, gate_exit_code( + quality_breaches=breaches, + catastrophic_breaches=catastrophic, + gate_mode=args.gate_mode, + ) def main() -> int: @@ -341,6 +372,13 @@ def main() -> int: p.add_argument("--min-recall", type=float, default=0.80) p.add_argument("--min-precision", type=float, default=0.85) p.add_argument("--max-abort-rate", type=float, default=0.30) + p.add_argument( + "--catastrophic-recall", + type=float, + default=0.50, + help="Hard floor (#537): recall below this hard-fails CI in any gate mode " + "(a collapsed grounding path, not LLM variance). Default 0.50.", + ) p.add_argument("--gate-mode", choices=("warn", "hard"), default="warn") p.add_argument( "--skip-missing-fixtures", diff --git a/tests/eval_preflight_skill_invocation.py b/tests/eval_preflight_skill_invocation.py index 86a30a77..1637989c 100644 --- a/tests/eval_preflight_skill_invocation.py +++ b/tests/eval_preflight_skill_invocation.py @@ -46,6 +46,7 @@ REPO_ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(REPO_ROOT / "tests" / "eval")) +from _gate import gate_exit_code # noqa: E402 from _skill_invocation_judge import ( # noqa: E402 DEFAULT_MODEL, classify_outcome, @@ -85,6 +86,13 @@ def main() -> int: "honor the negative controls." ), ) + parser.add_argument( + "--catastrophic-recall", + type=float, + default=0.25, + help="Hard floor (#537): should-invoke recall below this hard-fails CI in " + "any gate mode (a collapsed invocation path, not LLM variance). Default 0.25.", + ) parser.add_argument( "--model", default=os.getenv("BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL", DEFAULT_MODEL), @@ -160,12 +168,18 @@ def main() -> int: if fp_rate is not None and fp_rate > args.max_fp_rate: breaches.append(f"fp_rate={fp_rate:.3f} > {args.max_fp_rate}") + # Catastrophic floor (#537): should-invoke recall collapse hard-fails any mode. + catastrophic: list[str] = [] + if recall is not None and recall < args.catastrophic_recall: + catastrophic.append(f"recall={recall:.3f} < catastrophic floor {args.catastrophic_recall}") + payload = { "step": "step0_invocation", "model": args.model, "gate_mode": args.gate_mode, "min_recall": args.min_recall, "max_fp_rate": args.max_fp_rate, + "catastrophic_recall": args.catastrophic_recall, "summary": { "total": len(rows), "counts": counts, @@ -177,6 +191,7 @@ def main() -> int: "precision": precision, "fp_rate": fp_rate, "gate_breaches": breaches, + "catastrophic_breaches": catastrophic, }, "rows": case_rows, } @@ -187,9 +202,11 @@ def main() -> int: args.output.parent.mkdir(parents=True, exist_ok=True) args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") - if args.gate_mode == "hard" and breaches: - return 1 - return 0 + return gate_exit_code( + quality_breaches=breaches, + catastrophic_breaches=catastrophic, + gate_mode=args.gate_mode, + ) if __name__ == "__main__": diff --git a/tests/eval_preflight_skill_step1.py b/tests/eval_preflight_skill_step1.py index 5585bc9a..9c262ded 100644 --- a/tests/eval_preflight_skill_step1.py +++ b/tests/eval_preflight_skill_step1.py @@ -33,6 +33,7 @@ REPO_ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(REPO_ROOT / "tests" / "eval")) +from _gate import gate_exit_code # noqa: E402 from _skill_judge import DEFAULT_MODEL, fixture_exists, judge_relevance # noqa: E402 DATASET = REPO_ROOT / "tests" / "eval" / "preflight_skill_dataset.jsonl" @@ -76,6 +77,13 @@ def main() -> int: default=0.70, help="Per-axis recall gate (default 0.70 per the wiki M6 signal threshold)", ) + parser.add_argument( + "--catastrophic-recall", + type=float, + default=0.40, + help="Hard floor (#537): overall recall below this hard-fails CI in any " + "gate mode (a collapsed preflight path, not LLM variance). Default 0.40.", + ) parser.add_argument( "--model", default=os.getenv("BICAMERAL_PREFLIGHT_EVAL_MODEL", DEFAULT_MODEL), @@ -144,11 +152,19 @@ def main() -> int: if recall is not None and recall < args.min_recall: breaches.append(f"axis={axis} recall={recall:.3f} < {args.min_recall}") + # Catastrophic floor (#537): overall recall collapse hard-fails any mode. + catastrophic: list[str] = [] + if overall_recall is not None and overall_recall < args.catastrophic_recall: + catastrophic.append( + f"overall recall={overall_recall:.3f} < catastrophic floor {args.catastrophic_recall}" + ) + payload = { "step": "step1_relevance", "model": args.model, "gate_mode": args.gate_mode, "min_recall": args.min_recall, + "catastrophic_recall": args.catastrophic_recall, "summary": { "total": total, "hits": hits, @@ -158,6 +174,7 @@ def main() -> int: "recall": overall_recall, "per_axis": per_axis_summary, "gate_breaches": breaches, + "catastrophic_breaches": catastrophic, }, "rows": case_rows, } @@ -168,9 +185,11 @@ def main() -> int: args.output.parent.mkdir(parents=True, exist_ok=True) args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") - if args.gate_mode == "hard" and breaches: - return 1 - return 0 + return gate_exit_code( + quality_breaches=breaches, + catastrophic_breaches=catastrophic, + gate_mode=args.gate_mode, + ) if __name__ == "__main__": diff --git a/tests/test_eval_gate_two_tier.py b/tests/test_eval_gate_two_tier.py new file mode 100644 index 00000000..04d79b26 --- /dev/null +++ b/tests/test_eval_gate_two_tier.py @@ -0,0 +1,99 @@ +"""Behavioral tests for the two-tier eval gate policy (#537). + +The helper is a pure decision function with no shipped collaborators, so a +solitary test that invokes it directly and asserts the returned exit code is +the correct shape (per the project's "solitary is correct for pure helpers" +exception). Each case asserts the observable output (exit code) for a given +(quality_breaches, catastrophic_breaches, gate_mode) input — not presence. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent / "eval")) + +from _gate import gate_exit_code, is_inconclusive # noqa: E402 + + +def test_clean_passes_in_warn_mode(): + assert gate_exit_code(quality_breaches=[], catastrophic_breaches=[], gate_mode="warn") == 0 + + +def test_clean_passes_in_hard_mode(): + assert gate_exit_code(quality_breaches=[], catastrophic_breaches=[], gate_mode="hard") == 0 + + +def test_quality_breach_is_advisory_in_warn_mode(): + # The #537 fix: a quality-threshold breach (LLM variance) must NOT fail CI + # in warn mode. + assert ( + gate_exit_code( + quality_breaches=["recall 0.783 < 0.8"], + catastrophic_breaches=[], + gate_mode="warn", + ) + == 0 + ) + + +def test_quality_breach_still_hard_fails_in_hard_mode(): + # Legacy opt-in contract is preserved for callers that pass --gate-mode hard. + assert ( + gate_exit_code( + quality_breaches=["recall 0.783 < 0.8"], + catastrophic_breaches=[], + gate_mode="hard", + ) + == 1 + ) + + +def test_catastrophic_breach_hard_fails_in_warn_mode(): + # The catastrophic floor fires even in warn mode — a collapsed metric is a + # genuine break, not variance. + assert ( + gate_exit_code( + quality_breaches=["recall 0.40 < 0.8"], + catastrophic_breaches=["recall 0.40 < catastrophic floor 0.50"], + gate_mode="warn", + ) + == 1 + ) + + +def test_catastrophic_breach_hard_fails_in_hard_mode(): + assert ( + gate_exit_code( + quality_breaches=[], + catastrophic_breaches=["recall 0.10 < catastrophic floor 0.50"], + gate_mode="hard", + ) + == 1 + ) + + +def test_all_cases_errored_is_inconclusive_not_catastrophic(): + # The #536 re-run scored recall 0.000 because every case errored (no API key + # on rerun). That must read as inconclusive, not a grounding collapse. + assert is_inconclusive(error_count=23, total=23) is True + + +def test_no_cases_run_is_inconclusive(): + assert is_inconclusive(error_count=0, total=0) is True + + +def test_low_error_rate_is_conclusive(): + # A real collapse: the eval ran (only 1/23 errored) but recall is genuinely + # low — the catastrophic floor should be allowed to fire. + assert is_inconclusive(error_count=1, total=23) is False + + +def test_error_rate_at_threshold_is_inconclusive(): + # Exactly half errored — at/above the default 0.5 ceiling → abstain. + assert is_inconclusive(error_count=5, total=10) is True + + +def test_error_rate_just_below_threshold_is_conclusive(): + assert is_inconclusive(error_count=4, total=10) is False