-
Notifications
You must be signed in to change notification settings - Fork 1
fix(ci): #537 — warn + catastrophic-floor for LLM-driven eval gates #539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| """Two-tier CI gate policy shared by the LLM-driven eval runners (#537). | ||
|
|
||
| Quality thresholds (recall / precision / abort-rate / fp-rate) are advisory in | ||
| ``warn`` mode because the underlying metric is produced by a non-deterministic | ||
| caller LLM and flakes around the threshold. A separate *catastrophic floor* | ||
| hard-fails CI in any mode — it fires only when the metric collapses far below | ||
| the quality target, which signals a genuinely broken grounding / preflight path | ||
| rather than ordinary run-to-run variance. | ||
|
|
||
| Originating flake: M2 grounding-recall swung 0.783–0.957 on an identical ``main`` | ||
| base across PRs #534 / #535 / #536 (a docs-only diff tripped the 0.80 hard gate). | ||
| See GitHub #537. This codifies the gating-as-observability doctrine: WARN + emit | ||
| on the quality signal, hard-fail only on a real lower-layer break. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Sequence | ||
|
|
||
|
|
||
def gate_exit_code(
    *,
    quality_breaches: Sequence[str],
    catastrophic_breaches: Sequence[str],
    gate_mode: str,
) -> int:
    """Translate the two breach tiers into a CI process exit code.

    Args:
        quality_breaches: Messages for advisory quality-threshold breaches.
        catastrophic_breaches: Messages for hard-floor breaches.
        gate_mode: Gate policy; only the exact value ``"hard"`` promotes a
            quality breach to a failure.

    Returns:
        ``1`` when ``catastrophic_breaches`` is non-empty (in every mode), or
        when ``quality_breaches`` is non-empty and ``gate_mode == "hard"``;
        ``0`` otherwise (a clean run, or a warn-only quality breach).
    """
    # The catastrophic tier is checked first and short-circuits: a collapsed
    # metric fails CI no matter how the quality tier is configured.
    must_fail = catastrophic_breaches or (quality_breaches and gate_mode == "hard")
    return 1 if must_fail else 0
|
|
||
|
|
||
def is_inconclusive(error_count: int, total: int, *, max_error_rate: float = 0.5) -> bool:
    """Report whether too many eval cases failed to *execute* to trust the run.

    When most cases error out (for example a missing API key or a network
    outage on a CI re-run), the quality metric drops to zero without carrying
    any real signal. Such a run is inconclusive rather than catastrophic, so
    the catastrophic floor (#537) should abstain instead of hard-failing. A
    genuine collapse shows up as low recall while the error rate stays low.

    Args:
        error_count: Number of cases that errored during execution.
        total: Number of cases in the run.
        max_error_rate: Error-rate threshold (inclusive) at which the run is
            treated as inconclusive. Defaults to 0.5.

    Returns:
        True when ``total <= 0`` (nothing ran) or when
        ``error_count / total`` is at least ``max_error_rate``.
    """
    if total <= 0:
        # Nothing ran, so there is no metric to judge; this also avoids
        # dividing by zero below.
        return True
    observed_error_rate = error_count / total
    return observed_error_rate >= max_error_rate
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,6 +46,7 @@ | |
| REPO_ROOT = Path(__file__).resolve().parents[1] | ||
| sys.path.insert(0, str(REPO_ROOT / "tests" / "eval")) | ||
|
|
||
| from _gate import gate_exit_code # noqa: E402 | ||
| from _skill_invocation_judge import ( # noqa: E402 | ||
| DEFAULT_MODEL, | ||
| classify_outcome, | ||
|
|
@@ -85,6 +86,13 @@ def main() -> int: | |
| "honor the negative controls." | ||
| ), | ||
| ) | ||
| parser.add_argument( | ||
| "--catastrophic-recall", | ||
| type=float, | ||
| default=0.25, | ||
| help="Hard floor (#537): should-invoke recall below this hard-fails CI in " | ||
| "any gate mode (a collapsed invocation path, not LLM variance). Default 0.25.", | ||
| ) | ||
|
Comment on lines
+89
to
+95
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reject threshold combinations that invert the warn/hard policy. This flag is not validated against `--min-recall`. Suggested guard: args = parser.parse_args()
+ for flag in ("min_recall", "max_fp_rate", "catastrophic_recall"):
+ value = getattr(args, flag)
+ if not 0.0 <= value <= 1.0:
+ parser.error(f"--{flag.replace('_', '-')} must be between 0 and 1")
+ if args.catastrophic_recall > args.min_recall:
+ parser.error("--catastrophic-recall must be less than or equal to --min-recall")
rows = [json.loads(line) for line in DATASET.read_text().splitlines() if line.strip()]🤖 Prompt for AI Agents |
||
| parser.add_argument( | ||
| "--model", | ||
| default=os.getenv("BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL", DEFAULT_MODEL), | ||
|
|
@@ -160,12 +168,18 @@ def main() -> int: | |
| if fp_rate is not None and fp_rate > args.max_fp_rate: | ||
| breaches.append(f"fp_rate={fp_rate:.3f} > {args.max_fp_rate}") | ||
|
|
||
| # Catastrophic floor (#537): should-invoke recall collapse hard-fails any mode. | ||
| catastrophic: list[str] = [] | ||
| if recall is not None and recall < args.catastrophic_recall: | ||
| catastrophic.append(f"recall={recall:.3f} < catastrophic floor {args.catastrophic_recall}") | ||
|
|
||
| payload = { | ||
| "step": "step0_invocation", | ||
| "model": args.model, | ||
| "gate_mode": args.gate_mode, | ||
| "min_recall": args.min_recall, | ||
| "max_fp_rate": args.max_fp_rate, | ||
| "catastrophic_recall": args.catastrophic_recall, | ||
| "summary": { | ||
| "total": len(rows), | ||
| "counts": counts, | ||
|
|
@@ -177,6 +191,7 @@ def main() -> int: | |
| "precision": precision, | ||
| "fp_rate": fp_rate, | ||
| "gate_breaches": breaches, | ||
| "catastrophic_breaches": catastrophic, | ||
| }, | ||
| "rows": case_rows, | ||
| } | ||
|
|
@@ -187,9 +202,11 @@ def main() -> int: | |
| args.output.parent.mkdir(parents=True, exist_ok=True) | ||
| args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") | ||
|
|
||
| if args.gate_mode == "hard" and breaches: | ||
| return 1 | ||
| return 0 | ||
| return gate_exit_code( | ||
| quality_breaches=breaches, | ||
| catastrophic_breaches=catastrophic, | ||
| gate_mode=args.gate_mode, | ||
| ) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| REPO_ROOT = Path(__file__).resolve().parents[1] | ||
| sys.path.insert(0, str(REPO_ROOT / "tests" / "eval")) | ||
|
|
||
| from _gate import gate_exit_code # noqa: E402 | ||
| from _skill_judge import DEFAULT_MODEL, fixture_exists, judge_relevance # noqa: E402 | ||
|
|
||
| DATASET = REPO_ROOT / "tests" / "eval" / "preflight_skill_dataset.jsonl" | ||
|
|
@@ -76,6 +77,13 @@ def main() -> int: | |
| default=0.70, | ||
| help="Per-axis recall gate (default 0.70 per the wiki M6 signal threshold)", | ||
| ) | ||
| parser.add_argument( | ||
| "--catastrophic-recall", | ||
| type=float, | ||
| default=0.40, | ||
| help="Hard floor (#537): overall recall below this hard-fails CI in any " | ||
| "gate mode (a collapsed preflight path, not LLM variance). Default 0.40.", | ||
| ) | ||
|
Comment on lines
+80
to
+86
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Enforce a sane catastrophic-floor range.
Suggested guard args = parser.parse_args()
+ for flag in ("min_recall", "catastrophic_recall"):
+ value = getattr(args, flag)
+ if not 0.0 <= value <= 1.0:
+ parser.error(f"--{flag.replace('_', '-')} must be between 0 and 1")
+ if args.catastrophic_recall > args.min_recall:
+ parser.error("--catastrophic-recall must be less than or equal to --min-recall")
rows = [json.loads(line) for line in DATASET.read_text().splitlines() if line.strip()]🤖 Prompt for AI Agents |
||
| parser.add_argument( | ||
| "--model", | ||
| default=os.getenv("BICAMERAL_PREFLIGHT_EVAL_MODEL", DEFAULT_MODEL), | ||
|
|
@@ -144,11 +152,19 @@ def main() -> int: | |
| if recall is not None and recall < args.min_recall: | ||
| breaches.append(f"axis={axis} recall={recall:.3f} < {args.min_recall}") | ||
|
|
||
| # Catastrophic floor (#537): overall recall collapse hard-fails any mode. | ||
| catastrophic: list[str] = [] | ||
| if overall_recall is not None and overall_recall < args.catastrophic_recall: | ||
| catastrophic.append( | ||
| f"overall recall={overall_recall:.3f} < catastrophic floor {args.catastrophic_recall}" | ||
| ) | ||
|
|
||
| payload = { | ||
| "step": "step1_relevance", | ||
| "model": args.model, | ||
| "gate_mode": args.gate_mode, | ||
| "min_recall": args.min_recall, | ||
| "catastrophic_recall": args.catastrophic_recall, | ||
| "summary": { | ||
| "total": total, | ||
| "hits": hits, | ||
|
|
@@ -158,6 +174,7 @@ def main() -> int: | |
| "recall": overall_recall, | ||
| "per_axis": per_axis_summary, | ||
| "gate_breaches": breaches, | ||
| "catastrophic_breaches": catastrophic, | ||
| }, | ||
| "rows": case_rows, | ||
| } | ||
|
|
@@ -168,9 +185,11 @@ def main() -> int: | |
| args.output.parent.mkdir(parents=True, exist_ok=True) | ||
| args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") | ||
|
|
||
| if args.gate_mode == "hard" and breaches: | ||
| return 1 | ||
| return 0 | ||
| return gate_exit_code( | ||
| quality_breaches=breaches, | ||
| catastrophic_breaches=catastrophic, | ||
| gate_mode=args.gate_mode, | ||
| ) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Validate catastrophic-floor ordering at parse time.
`--catastrophic-recall` is unconstrained here, so it can be set above `--min-recall`. That breaks the two-tier contract by making warn mode hard-fail scores that the quality gate treats as passing, and in this runner can even print `✓ all gates pass` before the catastrophic failure line. Reject invalid threshold combinations (and out-of-range percentages) up front. Suggested guard
🤖 Prompt for AI Agents