diff --git a/.github/workflows/test-mcp-regression.yml b/.github/workflows/test-mcp-regression.yml
index 9ed2c63e..8b98bdca 100644
--- a/.github/workflows/test-mcp-regression.yml
+++ b/.github/workflows/test-mcp-regression.yml
@@ -244,14 +244,21 @@ jobs:
       # bind handler / skill / fixture must keep recall ≥ 0.80, precision
       # ≥ 0.85, abort_rate ≤ 0.30 — OR explicitly re-record the cache
       # after a deliberate skill-prompt change. "Deliberate not drift."
-      - name: M2 grounding-recall eval (hard gate)
+      # #537: this metric is produced by a non-deterministic caller LLM and
+      # flaked the hard gate on unrelated PRs (recall swung 0.783–0.957 on an
+      # identical `main` base across #534/#535/#536). Quality thresholds
+      # (recall ≥ 0.80 etc.) are now warn-only — emitted to the run summary
+      # below but advisory — while a catastrophic floor (--catastrophic-recall,
+      # default 0.50) still hard-fails CI in any mode when the grounding path
+      # genuinely collapses. Per the gating-as-observability doctrine.
+      - name: M2 grounding-recall eval (warn + catastrophic floor)
         if: matrix.os == 'ubuntu-latest'
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           BICAMERAL_GROUNDING_EVAL_MODEL: claude-haiku-4-5-20251001
         run: >
           python tests/eval_grounding_recall.py
-          --gate-mode hard
+          --gate-mode warn
           -o test-results/m2-grounding-recall.json
 
       # ── Surface M2 metrics on the GitHub run summary (#280 PR-3) ───
@@ -320,24 +327,30 @@ jobs:
       # BICAMERAL_PREFLIGHT_EVAL_RECORD=1 /
       # BICAMERAL_PREFLIGHT_INVOCATION_EVAL_RECORD=1 after a deliberate
       # skill-prompt change. "Deliberate not drift."
-      - name: M_skill_preflight Step-1 (hard gate)
+      # #537: both steps are caller-LLM driven (same nondeterminism that flaked
+      # M2). Quality thresholds (Step-1 per-axis recall ≥ 0.70; Step-0
+      # should-invoke recall ≥ 0.50, fp_rate ≤ 0.30) are warn-only — emitted to
+      # the run summary below — while catastrophic floors (--catastrophic-recall,
+      # defaults 0.40 / 0.25) still hard-fail CI in any mode on a genuine path
+      # collapse. Per the gating-as-observability doctrine.
+      - name: M_skill_preflight Step-1 (warn + catastrophic floor)
         if: matrix.os == 'ubuntu-latest'
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           BICAMERAL_PREFLIGHT_EVAL_MODEL: claude-haiku-4-5-20251001
         run: >
           python tests/eval_preflight_skill_step1.py
-          --gate-mode hard
+          --gate-mode warn
           -o test-results/skill-step1.json
 
-      - name: M_skill_preflight Step-0 (hard gate)
+      - name: M_skill_preflight Step-0 (warn + catastrophic floor)
         if: matrix.os == 'ubuntu-latest'
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL: claude-haiku-4-5-20251001
         run: >
           python tests/eval_preflight_skill_invocation.py
-          --gate-mode hard
+          --gate-mode warn
           -o test-results/skill-step0.json
 
       # ── Surface M_skill_preflight metrics on the GitHub run summary ──
diff --git a/tests/eval/_gate.py b/tests/eval/_gate.py
new file mode 100644
index 00000000..31c17f26
--- /dev/null
+++ b/tests/eval/_gate.py
@@ -0,0 +1,57 @@
+"""Two-tier CI gate policy shared by the LLM-driven eval runners (#537).
+
+Quality thresholds (recall / precision / abort-rate / fp-rate) are advisory in
+``warn`` mode because the underlying metric is produced by a non-deterministic
+caller LLM and flakes around the threshold. A separate *catastrophic floor*
+hard-fails CI in any mode — it fires only when the metric collapses far below
+the quality target, which signals a genuinely broken grounding / preflight path
+rather than ordinary run-to-run variance.
+
+Originating flake: M2 grounding-recall swung 0.783–0.957 on an identical ``main``
+base across PRs #534 / #535 / #536 (a docs-only diff tripped the 0.80 hard gate).
+See GitHub #537. This codifies the gating-as-observability doctrine: WARN + emit
+on the quality signal, hard-fail only on a real lower-layer break.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+
+def gate_exit_code(
+    *,
+    quality_breaches: Sequence[str],
+    catastrophic_breaches: Sequence[str],
+    gate_mode: str,
+) -> int:
+    """Decide a CI exit code from two breach tiers.
+
+    - A non-empty ``catastrophic_breaches`` always returns ``1`` (hard floor),
+      regardless of ``gate_mode`` — a collapsed metric is a real failure.
+    - ``quality_breaches`` returns ``1`` only when ``gate_mode == "hard"``,
+      preserving the legacy opt-in hard-gate contract.
+    - Otherwise returns ``0`` (clean, or a warn-only quality breach).
+    """
+    if catastrophic_breaches:
+        return 1
+    if quality_breaches and gate_mode == "hard":
+        return 1
+    return 0
+
+
+def is_inconclusive(error_count: int, total: int, *, max_error_rate: float = 0.5) -> bool:
+    """True when too many eval cases failed to *execute* (not failed to ground).
+
+    A run where most cases erred — e.g. a missing API key or network outage on a
+    CI re-run — zeroes the quality metric without producing any real signal. That
+    is inconclusive, not catastrophic: the catastrophic floor (#537) must abstain
+    on it rather than hard-fail, otherwise an auth/infra hiccup masquerades as a
+    grounding collapse. A genuine collapse shows as low recall with the eval
+    actually running (a low error rate).
+
+    Returns True when ``total <= 0`` (nothing ran) or the error rate reaches
+    ``max_error_rate`` (default 0.5).
+    """
+    if total <= 0:
+        return True
+    return (error_count / total) >= max_error_rate
diff --git a/tests/eval_grounding_recall.py b/tests/eval_grounding_recall.py
index d2d7f3d3..be5cdd9a 100644
--- a/tests/eval_grounding_recall.py
+++ b/tests/eval_grounding_recall.py
@@ -51,6 +51,7 @@
 sys.path.insert(0, str(REPO_ROOT / "tests" / "fixtures" / "grounding_recall"))
 
 from _bind_judge import BindJudgment, fixture_exists, run_bind_judgment  # type: ignore[import-not-found]  # noqa: E402, I001
+from _gate import gate_exit_code, is_inconclusive  # type: ignore[import-not-found]  # noqa: E402, I001
 from dataset import ALL_CASES, GENERATOR_VERSION, GroundingCase, cases_by_type  # type: ignore[import-not-found]  # noqa: E402, I001
 
 FIXTURE_REPO = REPO_ROOT / "tests" / "fixtures" / "grounding_recall" / "repo"
@@ -306,6 +307,25 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
         breaches.append(f"abort_rate {summary['abort_rate']:.3f} > {args.max_abort_rate}")
     summary["gate_breaches"] = breaches
 
+    # Catastrophic floor (#537): hard-fails CI in any mode — but ONLY on a
+    # genuine grounding collapse, never on an eval that couldn't run. A high
+    # eval_error rate (e.g. missing API key / network on a CI re-run) zeroes
+    # recall without any grounding signal; that is inconclusive, not
+    # catastrophic, so we abstain and warn. A real collapse shows as low recall
+    # with the eval actually running (a low error rate).
+    total_cases = summary["total_cases"]
+    eval_errors = sum(1 for r in rows if r.get("outcome") == "eval_error")
+    error_rate = (eval_errors / total_cases) if total_cases else 0.0
+    summary["eval_error_rate"] = error_rate
+    inconclusive = is_inconclusive(eval_errors, total_cases)
+
+    catastrophic: list[str] = []
+    if not inconclusive and summary["recall"] < args.catastrophic_recall:
+        catastrophic.append(
+            f"recall {summary['recall']:.3f} < catastrophic floor {args.catastrophic_recall}"
+        )
+    summary["catastrophic_breaches"] = catastrophic
+
     report = {"summary": summary, "rows": rows}
 
     if args.output:
@@ -322,13 +342,24 @@ async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
     print(f"  avg_turns  : {summary['avg_turns']}")
     print(f"  tokens     : {summary['tokens_in_total']} in / {summary['tokens_out_total']} out")
     if breaches:
-        print(f"  ⚠ gate breaches: {'; '.join(breaches)}")
+        mode_note = "advisory (warn)" if args.gate_mode != "hard" else "gate"
+        print(f"  ⚠ {mode_note} breaches: {'; '.join(breaches)}")
     else:
         print("  ✓ all gates pass")
-
-    if breaches and args.gate_mode == "hard":
-        return report, 1
-    return report, 0
+    if catastrophic:
+        print(f"  ✗ CATASTROPHIC floor breached (hard-fails any mode): {'; '.join(catastrophic)}")
+    if inconclusive:
+        print(
+            f"  ⓘ inconclusive: {eval_errors}/{total_cases} cases errored "
+            f"(rate {error_rate:.2f}) — catastrophic floor abstained "
+            "(eval could not run; not a grounding collapse)"
+        )
+
+    return report, gate_exit_code(
+        quality_breaches=breaches,
+        catastrophic_breaches=catastrophic,
+        gate_mode=args.gate_mode,
+    )
 
 
 def main() -> int:
@@ -341,6 +372,13 @@ def main() -> int:
     p.add_argument("--min-recall", type=float, default=0.80)
     p.add_argument("--min-precision", type=float, default=0.85)
     p.add_argument("--max-abort-rate", type=float, default=0.30)
+    p.add_argument(
+        "--catastrophic-recall",
+        type=float,
+        default=0.50,
+        help="Hard floor (#537): recall below this hard-fails CI in any gate mode "
+        "(a collapsed grounding path, not LLM variance). Default 0.50.",
+    )
     p.add_argument("--gate-mode", choices=("warn", "hard"), default="warn")
     p.add_argument(
         "--skip-missing-fixtures",
diff --git a/tests/eval_preflight_skill_invocation.py b/tests/eval_preflight_skill_invocation.py
index 86a30a77..1637989c 100644
--- a/tests/eval_preflight_skill_invocation.py
+++ b/tests/eval_preflight_skill_invocation.py
@@ -46,6 +46,7 @@
 REPO_ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))
 
+from _gate import gate_exit_code  # noqa: E402
 from _skill_invocation_judge import (  # noqa: E402
     DEFAULT_MODEL,
     classify_outcome,
@@ -85,6 +86,13 @@ def main() -> int:
             "honor the negative controls."
         ),
     )
+    parser.add_argument(
+        "--catastrophic-recall",
+        type=float,
+        default=0.25,
+        help="Hard floor (#537): should-invoke recall below this hard-fails CI in "
+        "any gate mode (a collapsed invocation path, not LLM variance). Default 0.25.",
+    )
     parser.add_argument(
         "--model",
         default=os.getenv("BICAMERAL_PREFLIGHT_INVOCATION_EVAL_MODEL", DEFAULT_MODEL),
@@ -160,12 +168,18 @@ def main() -> int:
     if fp_rate is not None and fp_rate > args.max_fp_rate:
         breaches.append(f"fp_rate={fp_rate:.3f} > {args.max_fp_rate}")
 
+    # Catastrophic floor (#537): should-invoke recall collapse hard-fails any mode.
+    catastrophic: list[str] = []
+    if recall is not None and recall < args.catastrophic_recall:
+        catastrophic.append(f"recall={recall:.3f} < catastrophic floor {args.catastrophic_recall}")
+
     payload = {
         "step": "step0_invocation",
         "model": args.model,
         "gate_mode": args.gate_mode,
         "min_recall": args.min_recall,
         "max_fp_rate": args.max_fp_rate,
+        "catastrophic_recall": args.catastrophic_recall,
         "summary": {
             "total": len(rows),
             "counts": counts,
@@ -177,6 +191,7 @@ def main() -> int:
             "precision": precision,
             "fp_rate": fp_rate,
             "gate_breaches": breaches,
+            "catastrophic_breaches": catastrophic,
         },
         "rows": case_rows,
     }
@@ -187,9 +202,11 @@ def main() -> int:
         args.output.parent.mkdir(parents=True, exist_ok=True)
         args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
 
-    if args.gate_mode == "hard" and breaches:
-        return 1
-    return 0
+    return gate_exit_code(
+        quality_breaches=breaches,
+        catastrophic_breaches=catastrophic,
+        gate_mode=args.gate_mode,
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/eval_preflight_skill_step1.py b/tests/eval_preflight_skill_step1.py
index 5585bc9a..9c262ded 100644
--- a/tests/eval_preflight_skill_step1.py
+++ b/tests/eval_preflight_skill_step1.py
@@ -33,6 +33,7 @@
 REPO_ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))
 
+from _gate import gate_exit_code  # noqa: E402
 from _skill_judge import DEFAULT_MODEL, fixture_exists, judge_relevance  # noqa: E402
 
 DATASET = REPO_ROOT / "tests" / "eval" / "preflight_skill_dataset.jsonl"
@@ -76,6 +77,13 @@ def main() -> int:
         default=0.70,
         help="Per-axis recall gate (default 0.70 per the wiki M6 signal threshold)",
     )
+    parser.add_argument(
+        "--catastrophic-recall",
+        type=float,
+        default=0.40,
+        help="Hard floor (#537): overall recall below this hard-fails CI in any "
+        "gate mode (a collapsed preflight path, not LLM variance). Default 0.40.",
+    )
     parser.add_argument(
         "--model",
         default=os.getenv("BICAMERAL_PREFLIGHT_EVAL_MODEL", DEFAULT_MODEL),
@@ -144,11 +152,19 @@ def main() -> int:
         if recall is not None and recall < args.min_recall:
             breaches.append(f"axis={axis} recall={recall:.3f} < {args.min_recall}")
 
+    # Catastrophic floor (#537): overall recall collapse hard-fails any mode.
+    catastrophic: list[str] = []
+    if overall_recall is not None and overall_recall < args.catastrophic_recall:
+        catastrophic.append(
+            f"overall recall={overall_recall:.3f} < catastrophic floor {args.catastrophic_recall}"
+        )
+
     payload = {
         "step": "step1_relevance",
         "model": args.model,
         "gate_mode": args.gate_mode,
         "min_recall": args.min_recall,
+        "catastrophic_recall": args.catastrophic_recall,
         "summary": {
             "total": total,
             "hits": hits,
@@ -158,6 +174,7 @@ def main() -> int:
             "recall": overall_recall,
             "per_axis": per_axis_summary,
             "gate_breaches": breaches,
+            "catastrophic_breaches": catastrophic,
         },
         "rows": case_rows,
     }
@@ -168,9 +185,11 @@ def main() -> int:
         args.output.parent.mkdir(parents=True, exist_ok=True)
         args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
 
-    if args.gate_mode == "hard" and breaches:
-        return 1
-    return 0
+    return gate_exit_code(
+        quality_breaches=breaches,
+        catastrophic_breaches=catastrophic,
+        gate_mode=args.gate_mode,
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/test_eval_gate_two_tier.py b/tests/test_eval_gate_two_tier.py
new file mode 100644
index 00000000..04d79b26
--- /dev/null
+++ b/tests/test_eval_gate_two_tier.py
@@ -0,0 +1,99 @@
+"""Behavioral tests for the two-tier eval gate policy (#537).
+
+The helper is a pure decision function with no shipped collaborators, so a
+solitary test that invokes it directly and asserts the returned exit code is
+the correct shape (per the project's "solitary is correct for pure helpers"
+exception). Each case asserts the observable output (exit code) for a given
+(quality_breaches, catastrophic_breaches, gate_mode) input — not presence.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent / "eval"))
+
+from _gate import gate_exit_code, is_inconclusive  # noqa: E402
+
+
+def test_clean_passes_in_warn_mode():
+    assert gate_exit_code(quality_breaches=[], catastrophic_breaches=[], gate_mode="warn") == 0
+
+
+def test_clean_passes_in_hard_mode():
+    assert gate_exit_code(quality_breaches=[], catastrophic_breaches=[], gate_mode="hard") == 0
+
+
+def test_quality_breach_is_advisory_in_warn_mode():
+    # The #537 fix: a quality-threshold breach (LLM variance) must NOT fail CI
+    # in warn mode.
+    assert (
+        gate_exit_code(
+            quality_breaches=["recall 0.783 < 0.8"],
+            catastrophic_breaches=[],
+            gate_mode="warn",
+        )
+        == 0
+    )
+
+
+def test_quality_breach_still_hard_fails_in_hard_mode():
+    # Legacy opt-in contract is preserved for callers that pass --gate-mode hard.
+    assert (
+        gate_exit_code(
+            quality_breaches=["recall 0.783 < 0.8"],
+            catastrophic_breaches=[],
+            gate_mode="hard",
+        )
+        == 1
+    )
+
+
+def test_catastrophic_breach_hard_fails_in_warn_mode():
+    # The catastrophic floor fires even in warn mode — a collapsed metric is a
+    # genuine break, not variance.
+    assert (
+        gate_exit_code(
+            quality_breaches=["recall 0.40 < 0.8"],
+            catastrophic_breaches=["recall 0.40 < catastrophic floor 0.50"],
+            gate_mode="warn",
+        )
+        == 1
+    )
+
+
+def test_catastrophic_breach_hard_fails_in_hard_mode():
+    assert (
+        gate_exit_code(
+            quality_breaches=[],
+            catastrophic_breaches=["recall 0.10 < catastrophic floor 0.50"],
+            gate_mode="hard",
+        )
+        == 1
+    )
+
+
+def test_all_cases_errored_is_inconclusive_not_catastrophic():
+    # The #536 re-run scored recall 0.000 because every case errored (no API key
+    # on rerun). That must read as inconclusive, not a grounding collapse.
+    assert is_inconclusive(error_count=23, total=23) is True
+
+
+def test_no_cases_run_is_inconclusive():
+    assert is_inconclusive(error_count=0, total=0) is True
+
+
+def test_low_error_rate_is_conclusive():
+    # A real collapse: the eval ran (only 1/23 errored) but recall is genuinely
+    # low — the catastrophic floor should be allowed to fire.
+    assert is_inconclusive(error_count=1, total=23) is False
+
+
+def test_error_rate_at_threshold_is_inconclusive():
+    # Exactly half errored — at/above the default 0.5 ceiling → abstain.
+    assert is_inconclusive(error_count=5, total=10) is True
+
+
+def test_error_rate_just_below_threshold_is_conclusive():
+    assert is_inconclusive(error_count=4, total=10) is False