BicameralAI · jinhongkuan · Jun 7, 2026 · Jun 3, 2026 · coderabbitai · Jun 3, 2026
@@ -244,14 +244,21 @@ jobs:
       # bind handler / skill / fixture must keep recall ≥ 0.80, precision
       # ≥ 0.85, abort_rate ≤ 0.30 — OR explicitly re-record the cache
       # after a deliberate skill-prompt change. "Deliberate not drift."
-      - name: M2 grounding-recall eval (hard gate)
+      # #537: this metric is produced by a non-deterministic caller LLM and
+      # flaked the hard gate on unrelated PRs (recall swung 0.783–0.957 on an
+      # identical `main` base across #534/#535/#536). Quality thresholds
+      # (recall ≥ 0.80 etc.) are now warn-only — emitted to the run summary
+      # below but advisory — while a catastrophic floor (--catastrophic-recall,
+      # default 0.50) still hard-fails CI in any mode when the grounding path
+      # genuinely collapses. Per the gating-as-observability doctrine.
+      - name: M2 grounding-recall eval (warn + catastrophic floor)
         if: matrix.os == 'ubuntu-latest'
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           BICAMERAL_GROUNDING_EVAL_MODEL: claude-haiku-4-5-20251001
         run: >
           python tests/eval_grounding_recall.py
-          --gate-mode hard
+          --gate-mode warn
           -o test-results/m2-grounding-recall.json
 
       # ── Surface M2 metrics on the GitHub run summary (#280 PR-3) ───
@@ -320,24 +327,30 @@ jobs:
       # BICAMERAL_PREFLIGHT_EVAL_RECORD=1 /
       # BICAMERAL_PREFLIGHT_INVOCATION_EVAL_RECORD=1 after a deliberate
       # skill-prompt change. "Deliberate not drift."
-      - name: M_skill_preflight Step-1 (hard gate)
+      # #537: both steps are caller-LLM driven (same nondeterminism that flaked
+      # M2). Quality thresholds (Step-1 per-axis recall ≥ 0.70; Step-0
+      # should-invoke recall ≥ 0.50, fp_rate ≤ 0.30) are warn-only — emitted to
+      # the run summary below — while catastrophic floors (--catastrophic-recall,
+      # defaults 0.40 / 0.25) still hard-fail CI in any mode on a genuine path
+      # collapse. Per the gating-as-observability doctrine.
+      - name: M_skill_preflight Step-1 (warn + catastrophic floor)
         if: matrix.os == 'ubuntu-latest'
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           BICAMERAL_PREFLIGHT_EVAL_MODEL: claude-haiku-4-5-20251001
         run: >
           python tests/eval_preflight_skill_step1.py
-          --gate-mode hard
+          --gate-mode warn
           -o test-results/skill-step1.json
 
-      - name: M_skill_preflight Step-0 (hard gate)
+      - name: M_skill_preflight Step-0 (warn + catastrophic floor)
         if: matrix.os == 'ubuntu-latest'
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL: claude-haiku-4-5-20251001
         run: >
           python tests/eval_preflight_skill_invocation.py
-          --gate-mode hard
+          --gate-mode warn
           -o test-results/skill-step0.json
 
       # ── Surface M_skill_preflight metrics on the GitHub run summary ──

@@ -0,0 +1,57 @@
+"""Two-tier CI gate policy shared by the LLM-driven eval runners (#537).
+
+Quality thresholds (recall / precision / abort-rate / fp-rate) are advisory in
+``warn`` mode because the underlying metric is produced by a non-deterministic
+caller LLM and flakes around the threshold. A separate *catastrophic floor*
+hard-fails CI in any mode — it fires only when the metric collapses far below
+the quality target, which signals a genuinely broken grounding / preflight path
+rather than ordinary run-to-run variance.
+
+Originating flake: M2 grounding-recall swung 0.783–0.957 on an identical ``main``
+base across PRs #534 / #535 / #536 (a docs-only diff tripped the 0.80 hard gate).
+See GitHub #537. This codifies the gating-as-observability doctrine: WARN + emit
+on the quality signal, hard-fail only on a real lower-layer break.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+
+def gate_exit_code(
+    *,
+    quality_breaches: Sequence[str],
+    catastrophic_breaches: Sequence[str],
+    gate_mode: str,
+) -> int:
+    """Decide a CI exit code from two breach tiers.
+
+    - A non-empty ``catastrophic_breaches`` always returns ``1`` (hard floor),
+      regardless of ``gate_mode`` — a collapsed metric is a real failure.
+    - ``quality_breaches`` returns ``1`` only when ``gate_mode == "hard"``,
+      preserving the legacy opt-in hard-gate contract.
+    - Otherwise returns ``0`` (clean, or a warn-only quality breach).
+    """
+    if catastrophic_breaches:
+        return 1
+    if quality_breaches and gate_mode == "hard":
+        return 1
+    return 0
+
+
+def is_inconclusive(error_count: int, total: int, *, max_error_rate: float = 0.5) -> bool:
+    """True when too many eval cases failed to *execute* (not failed to ground).
+
+    A run where most cases erred — e.g. a missing API key or network outage on a
+    CI re-run — zeroes the quality metric without producing any real signal. That
+    is inconclusive, not catastrophic: the catastrophic floor (#537) must abstain
+    on it rather than hard-fail, otherwise an auth/infra hiccup masquerades as a
+    grounding collapse. A genuine collapse shows as low recall with the eval
+    actually running (a low error rate).
+
+    Returns True when ``total <= 0`` (nothing ran) or the error rate reaches
+    ``max_error_rate`` (default 0.5).
+    """
+    if total <= 0:
+        return True
+    return (error_count / total) >= max_error_rate
@@ -51,6 +51,7 @@
 sys.path.insert(0, str(REPO_ROOT / "tests" / "fixtures" / "grounding_recall"))
 
 from _bind_judge import BindJudgment, fixture_exists, run_bind_judgment  # type: ignore[import-not-found]  # noqa: E402, I001
+from _gate import gate_exit_code, is_inconclusive  # type: ignore[import-not-found]  # noqa: E402, I001
 from dataset import ALL_CASES, GENERATOR_VERSION, GroundingCase, cases_by_type  # type: ignore[import-not-found]  # noqa: E402, I001
 
 FIXTURE_REPO = REPO_ROOT / "tests" / "fixtures" / "grounding_recall" / "repo"
@@ -306,6 +307,25 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
         breaches.append(f"abort_rate {summary['abort_rate']:.3f} > {args.max_abort_rate}")
     summary["gate_breaches"] = breaches
 
+    # Catastrophic floor (#537): hard-fails CI in any mode — but ONLY on a
+    # genuine grounding collapse, never on an eval that couldn't run. A high
+    # eval_error rate (e.g. missing API key / network on a CI re-run) zeroes
+    # recall without any grounding signal; that is inconclusive, not
+    # catastrophic, so we abstain and warn. A real collapse shows as low recall
+    # with the eval actually running (a low error rate).
+    total_cases = summary["total_cases"]
+    eval_errors = sum(1 for r in rows if r.get("outcome") == "eval_error")
+    error_rate = (eval_errors / total_cases) if total_cases else 0.0
+    summary["eval_error_rate"] = error_rate
+    inconclusive = is_inconclusive(eval_errors, total_cases)
+
+    catastrophic: list[str] = []
+    if not inconclusive and summary["recall"] < args.catastrophic_recall:
+        catastrophic.append(
+            f"recall {summary['recall']:.3f} < catastrophic floor {args.catastrophic_recall}"
+        )
+    summary["catastrophic_breaches"] = catastrophic
+
     report = {"summary": summary, "rows": rows}
 
     if args.output:
@@ -322,13 +342,24 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
     print(f"  avg_turns  : {summary['avg_turns']}")
     print(f"  tokens     : {summary['tokens_in_total']} in / {summary['tokens_out_total']} out")
     if breaches:
-        print(f"  ⚠ gate breaches: {'; '.join(breaches)}")
+        mode_note = "advisory (warn)" if args.gate_mode != "hard" else "gate"
+        print(f"  ⚠ {mode_note} breaches: {'; '.join(breaches)}")
     else:
         print("  ✓ all gates pass")
-
-    if breaches and args.gate_mode == "hard":
-        return report, 1
-    return report, 0
+    if catastrophic:
+        print(f"  ✗ CATASTROPHIC floor breached (hard-fails any mode): {'; '.join(catastrophic)}")
+    if inconclusive:
+        print(
+            f"  ⓘ inconclusive: {eval_errors}/{total_cases} cases errored "
+            f"(rate {error_rate:.2f}) — catastrophic floor abstained "
+            "(eval could not run; not a grounding collapse)"
+        )
+
+    return report, gate_exit_code(
+        quality_breaches=breaches,
+        catastrophic_breaches=catastrophic,
+        gate_mode=args.gate_mode,
+    )
 
 
 def main() -> int:
@@ -341,6 +372,13 @@ def main() -> int:
     p.add_argument("--min-recall", type=float, default=0.80)
     p.add_argument("--min-precision", type=float, default=0.85)
     p.add_argument("--max-abort-rate", type=float, default=0.30)
+    p.add_argument(
+        "--catastrophic-recall",
+        type=float,
+        default=0.50,
+        help="Hard floor (#537): recall below this hard-fails CI in any gate mode "
+        "(a collapsed grounding path, not LLM variance). Default 0.50.",
+    )
     p.add_argument("--gate-mode", choices=("warn", "hard"), default="warn")
     p.add_argument(
         "--skip-missing-fixtures",

@@ -46,6 +46,7 @@
 REPO_ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))
 
+from _gate import gate_exit_code  # noqa: E402
 from _skill_invocation_judge import (  # noqa: E402
     DEFAULT_MODEL,
     classify_outcome,
@@ -85,6 +86,13 @@ def main() -> int:
             "honor the negative controls."
         ),
     )
+    parser.add_argument(
+        "--catastrophic-recall",
+        type=float,
+        default=0.25,
+        help="Hard floor (#537): should-invoke recall below this hard-fails CI in "
+        "any gate mode (a collapsed invocation path, not LLM variance). Default 0.25.",
+    )
     parser.add_argument(
         "--model",
         default=os.getenv("BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL", DEFAULT_MODEL),
@@ -160,12 +168,18 @@ def main() -> int:
     if fp_rate is not None and fp_rate > args.max_fp_rate:
         breaches.append(f"fp_rate={fp_rate:.3f} > {args.max_fp_rate}")
 
+    # Catastrophic floor (#537): should-invoke recall collapse hard-fails any mode.
+    catastrophic: list[str] = []
+    if recall is not None and recall < args.catastrophic_recall:
+        catastrophic.append(f"recall={recall:.3f} < catastrophic floor {args.catastrophic_recall}")
+
     payload = {
         "step": "step0_invocation",
         "model": args.model,
         "gate_mode": args.gate_mode,
         "min_recall": args.min_recall,
         "max_fp_rate": args.max_fp_rate,
+        "catastrophic_recall": args.catastrophic_recall,
         "summary": {
             "total": len(rows),
             "counts": counts,
@@ -177,6 +191,7 @@ def main() -> int:
             "precision": precision,
             "fp_rate": fp_rate,
             "gate_breaches": breaches,
+            "catastrophic_breaches": catastrophic,
         },
         "rows": case_rows,
     }
@@ -187,9 +202,11 @@ def main() -> int:
         args.output.parent.mkdir(parents=True, exist_ok=True)
         args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
 
-    if args.gate_mode == "hard" and breaches:
-        return 1
-    return 0
+    return gate_exit_code(
+        quality_breaches=breaches,
+        catastrophic_breaches=catastrophic,
+        gate_mode=args.gate_mode,
+    )
 
 
 if __name__ == "__main__":

@@ -33,6 +33,7 @@
 REPO_ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))
 
+from _gate import gate_exit_code  # noqa: E402
 from _skill_judge import DEFAULT_MODEL, fixture_exists, judge_relevance  # noqa: E402
 
 DATASET = REPO_ROOT / "tests" / "eval" / "preflight_skill_dataset.jsonl"
@@ -76,6 +77,13 @@ def main() -> int:
         default=0.70,
         help="Per-axis recall gate (default 0.70 per the wiki M6 signal threshold)",
     )
+    parser.add_argument(
+        "--catastrophic-recall",
+        type=float,
+        default=0.40,
+        help="Hard floor (#537): overall recall below this hard-fails CI in any "
+        "gate mode (a collapsed preflight path, not LLM variance). Default 0.40.",
+    )
     parser.add_argument(
         "--model",
         default=os.getenv("BICAMERAL_PREFLIGHT_EVAL_MODEL", DEFAULT_MODEL),
@@ -144,11 +152,19 @@ def main() -> int:
         if recall is not None and recall < args.min_recall:
             breaches.append(f"axis={axis} recall={recall:.3f} < {args.min_recall}")
 
+    # Catastrophic floor (#537): overall recall collapse hard-fails any mode.
+    catastrophic: list[str] = []
+    if overall_recall is not None and overall_recall < args.catastrophic_recall:
+        catastrophic.append(
+            f"overall recall={overall_recall:.3f} < catastrophic floor {args.catastrophic_recall}"
+        )
+
     payload = {
         "step": "step1_relevance",
         "model": args.model,
         "gate_mode": args.gate_mode,
         "min_recall": args.min_recall,
+        "catastrophic_recall": args.catastrophic_recall,
         "summary": {
             "total": total,
             "hits": hits,
@@ -158,6 +174,7 @@ def main() -> int:
             "recall": overall_recall,
             "per_axis": per_axis_summary,
             "gate_breaches": breaches,
+            "catastrophic_breaches": catastrophic,
         },
         "rows": case_rows,
     }
@@ -168,9 +185,11 @@ def main() -> int:
         args.output.parent.mkdir(parents=True, exist_ok=True)
         args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
 
-    if args.gate_mode == "hard" and breaches:
-        return 1
-    return 0
+    return gate_exit_code(
+        quality_breaches=breaches,
+        catastrophic_breaches=catastrophic,
+        gate_mode=args.gate_mode,
+    )
 
 
 if __name__ == "__main__":