Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions .github/workflows/test-mcp-regression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -244,14 +244,21 @@ jobs:
# bind handler / skill / fixture must keep recall ≥ 0.80, precision
# ≥ 0.85, abort_rate ≤ 0.30 — OR explicitly re-record the cache
# after a deliberate skill-prompt change. "Deliberate not drift."
- name: M2 grounding-recall eval (hard gate)
# #537: this metric is produced by a non-deterministic caller LLM and
# flaked the hard gate on unrelated PRs (recall swung 0.783–0.957 on an
# identical `main` base across #534/#535/#536). Quality thresholds
# (recall ≥ 0.80 etc.) are now warn-only — emitted to the run summary
# below but advisory — while a catastrophic floor (--catastrophic-recall,
# default 0.50) still hard-fails CI in any mode when the grounding path
# genuinely collapses. Per the gating-as-observability doctrine.
- name: M2 grounding-recall eval (warn + catastrophic floor)
if: matrix.os == 'ubuntu-latest'
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BICAMERAL_GROUNDING_EVAL_MODEL: claude-haiku-4-5-20251001
run: >
python tests/eval_grounding_recall.py
--gate-mode hard
--gate-mode warn
-o test-results/m2-grounding-recall.json

# ── Surface M2 metrics on the GitHub run summary (#280 PR-3) ───
Expand Down Expand Up @@ -320,24 +327,30 @@ jobs:
# BICAMERAL_PREFLIGHT_EVAL_RECORD=1 /
# BICAMERAL_PREFLIGHT_INVOCATION_EVAL_RECORD=1 after a deliberate
# skill-prompt change. "Deliberate not drift."
- name: M_skill_preflight Step-1 (hard gate)
# #537: both steps are caller-LLM driven (same nondeterminism that flaked
# M2). Quality thresholds (Step-1 per-axis recall ≥ 0.70; Step-0
# should-invoke recall ≥ 0.50, fp_rate ≤ 0.30) are warn-only — emitted to
# the run summary below — while catastrophic floors (--catastrophic-recall,
# defaults 0.40 / 0.25) still hard-fail CI in any mode on a genuine path
# collapse. Per the gating-as-observability doctrine.
- name: M_skill_preflight Step-1 (warn + catastrophic floor)
if: matrix.os == 'ubuntu-latest'
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BICAMERAL_PREFLIGHT_EVAL_MODEL: claude-haiku-4-5-20251001
run: >
python tests/eval_preflight_skill_step1.py
--gate-mode hard
--gate-mode warn
-o test-results/skill-step1.json

- name: M_skill_preflight Step-0 (hard gate)
- name: M_skill_preflight Step-0 (warn + catastrophic floor)
if: matrix.os == 'ubuntu-latest'
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL: claude-haiku-4-5-20251001
run: >
python tests/eval_preflight_skill_invocation.py
--gate-mode hard
--gate-mode warn
-o test-results/skill-step0.json

# ── Surface M_skill_preflight metrics on the GitHub run summary ──
Expand Down
57 changes: 57 additions & 0 deletions tests/eval/_gate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Two-tier CI gate policy shared by the LLM-driven eval runners (#537).

Quality thresholds (recall / precision / abort-rate / fp-rate) are advisory in
``warn`` mode because the underlying metric is produced by a non-deterministic
caller LLM and flakes around the threshold. A separate *catastrophic floor*
hard-fails CI in any mode — it fires only when the metric collapses far below
the quality target, which signals a genuinely broken grounding / preflight path
rather than ordinary run-to-run variance.

Originating flake: M2 grounding-recall swung 0.783–0.957 on an identical ``main``
base across PRs #534 / #535 / #536 (a docs-only diff tripped the 0.80 hard gate).
See GitHub #537. This codifies the gating-as-observability doctrine: WARN + emit
on the quality signal, hard-fail only on a real lower-layer break.
"""

from __future__ import annotations

from collections.abc import Sequence


def gate_exit_code(
*,
quality_breaches: Sequence[str],
catastrophic_breaches: Sequence[str],
gate_mode: str,
) -> int:
"""Decide a CI exit code from two breach tiers.

- A non-empty ``catastrophic_breaches`` always returns ``1`` (hard floor),
regardless of ``gate_mode`` — a collapsed metric is a real failure.
- ``quality_breaches`` returns ``1`` only when ``gate_mode == "hard"``,
preserving the legacy opt-in hard-gate contract.
- Otherwise returns ``0`` (clean, or a warn-only quality breach).
"""
if catastrophic_breaches:
return 1
if quality_breaches and gate_mode == "hard":
return 1
return 0


def is_inconclusive(error_count: int, total: int, *, max_error_rate: float = 0.5) -> bool:
"""True when too many eval cases failed to *execute* (not failed to ground).

A run where most cases erred — e.g. a missing API key or network outage on a
CI re-run — zeroes the quality metric without producing any real signal. That
is inconclusive, not catastrophic: the catastrophic floor (#537) must abstain
on it rather than hard-fail, otherwise an auth/infra hiccup masquerades as a
grounding collapse. A genuine collapse shows as low recall with the eval
actually running (a low error rate).

Returns True when ``total <= 0`` (nothing ran) or the error rate reaches
``max_error_rate`` (default 0.5).
"""
if total <= 0:
return True
return (error_count / total) >= max_error_rate
48 changes: 43 additions & 5 deletions tests/eval_grounding_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
sys.path.insert(0, str(REPO_ROOT / "tests" / "fixtures" / "grounding_recall"))

from _bind_judge import BindJudgment, fixture_exists, run_bind_judgment # type: ignore[import-not-found] # noqa: E402, I001
from _gate import gate_exit_code, is_inconclusive # type: ignore[import-not-found] # noqa: E402, I001
from dataset import ALL_CASES, GENERATOR_VERSION, GroundingCase, cases_by_type # type: ignore[import-not-found] # noqa: E402, I001

FIXTURE_REPO = REPO_ROOT / "tests" / "fixtures" / "grounding_recall" / "repo"
Expand Down Expand Up @@ -306,6 +307,25 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
breaches.append(f"abort_rate {summary['abort_rate']:.3f} > {args.max_abort_rate}")
summary["gate_breaches"] = breaches

# Catastrophic floor (#537): hard-fails CI in any mode — but ONLY on a
# genuine grounding collapse, never on an eval that couldn't run. A high
# eval_error rate (e.g. missing API key / network on a CI re-run) zeroes
# recall without any grounding signal; that is inconclusive, not
# catastrophic, so we abstain and warn. A real collapse shows as low recall
# with the eval actually running (a low error rate).
total_cases = summary["total_cases"]
eval_errors = sum(1 for r in rows if r.get("outcome") == "eval_error")
error_rate = (eval_errors / total_cases) if total_cases else 0.0
summary["eval_error_rate"] = error_rate
inconclusive = is_inconclusive(eval_errors, total_cases)

catastrophic: list[str] = []
if not inconclusive and summary["recall"] < args.catastrophic_recall:
catastrophic.append(
f"recall {summary['recall']:.3f} < catastrophic floor {args.catastrophic_recall}"
)
summary["catastrophic_breaches"] = catastrophic

report = {"summary": summary, "rows": rows}

if args.output:
Expand All @@ -322,13 +342,24 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
print(f" avg_turns : {summary['avg_turns']}")
print(f" tokens : {summary['tokens_in_total']} in / {summary['tokens_out_total']} out")
if breaches:
print(f" ⚠ gate breaches: {'; '.join(breaches)}")
mode_note = "advisory (warn)" if args.gate_mode != "hard" else "gate"
print(f" ⚠ {mode_note} breaches: {'; '.join(breaches)}")
else:
print(" ✓ all gates pass")

if breaches and args.gate_mode == "hard":
return report, 1
return report, 0
if catastrophic:
print(f" ✗ CATASTROPHIC floor breached (hard-fails any mode): {'; '.join(catastrophic)}")
if inconclusive:
print(
f" ⓘ inconclusive: {eval_errors}/{total_cases} cases errored "
f"(rate {error_rate:.2f}) — catastrophic floor abstained "
"(eval could not run; not a grounding collapse)"
)

return report, gate_exit_code(
quality_breaches=breaches,
catastrophic_breaches=catastrophic,
gate_mode=args.gate_mode,
)


def main() -> int:
Expand All @@ -341,6 +372,13 @@ def main() -> int:
p.add_argument("--min-recall", type=float, default=0.80)
p.add_argument("--min-precision", type=float, default=0.85)
p.add_argument("--max-abort-rate", type=float, default=0.30)
p.add_argument(
"--catastrophic-recall",
type=float,
default=0.50,
help="Hard floor (#537): recall below this hard-fails CI in any gate mode "
"(a collapsed grounding path, not LLM variance). Default 0.50.",
)
Comment on lines +375 to +381

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Validate catastrophic-floor ordering at parse time.

--catastrophic-recall is unconstrained here, so it can be set above --min-recall. That breaks the two-tier contract by making warn mode hard-fail scores that the quality gate treats as passing, and in this runner can even print ✓ all gates pass before the catastrophic failure line. Reject invalid threshold combinations (and out-of-range percentages) up front.

Suggested guard
     args = p.parse_args()
+    for flag in ("min_recall", "min_precision", "max_abort_rate", "catastrophic_recall"):
+        value = getattr(args, flag)
+        if not 0.0 <= value <= 1.0:
+            p.error(f"--{flag.replace('_', '-')} must be between 0 and 1")
+    if args.catastrophic_recall > args.min_recall:
+        p.error("--catastrophic-recall must be less than or equal to --min-recall")
 
     _, exit_code = asyncio.run(run(args))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/eval_grounding_recall.py` around lines 375 - 381, After parsing CLI
flags, validate the thresholds so --catastrophic-recall (parsed as
catastrophic_recall) is within [0.0,1.0], --min-recall (parsed as min_recall) is
within [0.0,1.0], and catastrophic_recall <= min_recall; if any check fails,
print a clear error and exit non‑zero. Locate the argument definitions added via
p.add_argument("--catastrophic-recall", ...) and the corresponding
"--min-recall" arg, perform this validation immediately after
parser.parse_args() (or in the function that handles args) and reject invalid
combinations up front to prevent the two-tier contract from being violated.

p.add_argument("--gate-mode", choices=("warn", "hard"), default="warn")
p.add_argument(
"--skip-missing-fixtures",
Expand Down
23 changes: 20 additions & 3 deletions tests/eval_preflight_skill_invocation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))

from _gate import gate_exit_code # noqa: E402
from _skill_invocation_judge import ( # noqa: E402
DEFAULT_MODEL,
classify_outcome,
Expand Down Expand Up @@ -85,6 +86,13 @@ def main() -> int:
"honor the negative controls."
),
)
parser.add_argument(
"--catastrophic-recall",
type=float,
default=0.25,
help="Hard floor (#537): should-invoke recall below this hard-fails CI in "
"any gate mode (a collapsed invocation path, not LLM variance). Default 0.25.",
)
Comment on lines +89 to +95

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Reject threshold combinations that invert the warn/hard policy.

This flag is not validated against --min-recall, so a caller can set a catastrophic floor above the advisory recall gate and turn warn mode back into a hard gate for non-catastrophic results. Please enforce 0 <= catastrophic_recall <= min_recall <= 1 (and keep max_fp_rate in [0,1]) before computing breaches.

Suggested guard
     args = parser.parse_args()
+    for flag in ("min_recall", "max_fp_rate", "catastrophic_recall"):
+        value = getattr(args, flag)
+        if not 0.0 <= value <= 1.0:
+            parser.error(f"--{flag.replace('_', '-')} must be between 0 and 1")
+    if args.catastrophic_recall > args.min_recall:
+        parser.error("--catastrophic-recall must be less than or equal to --min-recall")
 
     rows = [json.loads(line) for line in DATASET.read_text().splitlines() if line.strip()]
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/eval_preflight_skill_invocation.py` around lines 89 - 95, Reject
invalid threshold combinations by validating parsed args: after argument
parsing, check that 0 <= args.catastrophic_recall <= args.min_recall <= 1 and
that 0 <= args.max_fp_rate <= 1, and raise/exit with a clear error if violated;
update the validation logic around where thresholds are used (e.g., before
breach computation and in any function that reads catastrophic_recall,
min_recall, max_fp_rate) so downstream code (breach computation) never runs with
inverted or out-of-range thresholds.

parser.add_argument(
"--model",
default=os.getenv("BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL", DEFAULT_MODEL),
Expand Down Expand Up @@ -160,12 +168,18 @@ def main() -> int:
if fp_rate is not None and fp_rate > args.max_fp_rate:
breaches.append(f"fp_rate={fp_rate:.3f} > {args.max_fp_rate}")

# Catastrophic floor (#537): should-invoke recall collapse hard-fails any mode.
catastrophic: list[str] = []
if recall is not None and recall < args.catastrophic_recall:
catastrophic.append(f"recall={recall:.3f} < catastrophic floor {args.catastrophic_recall}")

payload = {
"step": "step0_invocation",
"model": args.model,
"gate_mode": args.gate_mode,
"min_recall": args.min_recall,
"max_fp_rate": args.max_fp_rate,
"catastrophic_recall": args.catastrophic_recall,
"summary": {
"total": len(rows),
"counts": counts,
Expand All @@ -177,6 +191,7 @@ def main() -> int:
"precision": precision,
"fp_rate": fp_rate,
"gate_breaches": breaches,
"catastrophic_breaches": catastrophic,
},
"rows": case_rows,
}
Expand All @@ -187,9 +202,11 @@ def main() -> int:
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

if args.gate_mode == "hard" and breaches:
return 1
return 0
return gate_exit_code(
quality_breaches=breaches,
catastrophic_breaches=catastrophic,
gate_mode=args.gate_mode,
)


if __name__ == "__main__":
Expand Down
25 changes: 22 additions & 3 deletions tests/eval_preflight_skill_step1.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))

from _gate import gate_exit_code # noqa: E402
from _skill_judge import DEFAULT_MODEL, fixture_exists, judge_relevance # noqa: E402

DATASET = REPO_ROOT / "tests" / "eval" / "preflight_skill_dataset.jsonl"
Expand Down Expand Up @@ -76,6 +77,13 @@ def main() -> int:
default=0.70,
help="Per-axis recall gate (default 0.70 per the wiki M6 signal threshold)",
)
parser.add_argument(
"--catastrophic-recall",
type=float,
default=0.40,
help="Hard floor (#537): overall recall below this hard-fails CI in any "
"gate mode (a collapsed preflight path, not LLM variance). Default 0.40.",
)
Comment on lines +80 to +86

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Enforce a sane catastrophic-floor range.

--catastrophic-recall is accepted without any bounds or ordering check. If someone tunes it above --min-recall, this runner will hard-fail warn-mode runs that only cleared the advisory gate, which defeats the two-tier policy. Validate both the [0,1] range and catastrophic_recall <= min_recall when parsing args.

Suggested guard
     args = parser.parse_args()
+    for flag in ("min_recall", "catastrophic_recall"):
+        value = getattr(args, flag)
+        if not 0.0 <= value <= 1.0:
+            parser.error(f"--{flag.replace('_', '-')} must be between 0 and 1")
+    if args.catastrophic_recall > args.min_recall:
+        parser.error("--catastrophic-recall must be less than or equal to --min-recall")
 
     rows = [json.loads(line) for line in DATASET.read_text().splitlines() if line.strip()]
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/eval_preflight_skill_step1.py` around lines 80 - 86, The CLI currently
accepts --catastrophic-recall without bounds or ordering checks; after parsing
args (using parser and the parsed names catastrophic_recall and min_recall),
validate that catastrophic_recall is within [0.0, 1.0] and that
catastrophic_recall <= min_recall; if a check fails call parser.error with a
clear message so arg parsing exits cleanly. Add these checks immediately after
args = parser.parse_args() (or equivalent) to enforce the range and the ordering
between --catastrophic-recall and --min-recall.

parser.add_argument(
"--model",
default=os.getenv("BICAMERAL_PREFLIGHT_EVAL_MODEL", DEFAULT_MODEL),
Expand Down Expand Up @@ -144,11 +152,19 @@ def main() -> int:
if recall is not None and recall < args.min_recall:
breaches.append(f"axis={axis} recall={recall:.3f} < {args.min_recall}")

# Catastrophic floor (#537): overall recall collapse hard-fails any mode.
catastrophic: list[str] = []
if overall_recall is not None and overall_recall < args.catastrophic_recall:
catastrophic.append(
f"overall recall={overall_recall:.3f} < catastrophic floor {args.catastrophic_recall}"
)

payload = {
"step": "step1_relevance",
"model": args.model,
"gate_mode": args.gate_mode,
"min_recall": args.min_recall,
"catastrophic_recall": args.catastrophic_recall,
"summary": {
"total": total,
"hits": hits,
Expand All @@ -158,6 +174,7 @@ def main() -> int:
"recall": overall_recall,
"per_axis": per_axis_summary,
"gate_breaches": breaches,
"catastrophic_breaches": catastrophic,
},
"rows": case_rows,
}
Expand All @@ -168,9 +185,11 @@ def main() -> int:
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

if args.gate_mode == "hard" and breaches:
return 1
return 0
return gate_exit_code(
quality_breaches=breaches,
catastrophic_breaches=catastrophic,
gate_mode=args.gate_mode,
)


if __name__ == "__main__":
Expand Down
Loading
Loading