671 changes: 671 additions & 0 deletions .agents/issue-1427-ledger.yml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions docs/workflow-integration-guide.md
@@ -32,3 +32,9 @@ Note: The pip cache snippet uses `matrix.python-version`. If a job does not alre
2. Insert the cache step from `docs/workflow-snippets/pip-cache-step.yml` immediately after line 96.
3. Replace the `Install dependencies` step at lines 98-101 with the contents of `docs/workflow-snippets/agents-verify-to-new-pr-install.yml`.
4. Insert the pip freeze step from `docs/workflow-snippets/pip-freeze-step.yml` immediately after the install step you added in step 3.

**Deterministic verdict extraction**
1. In `.github/workflows/agents-verify-to-new-pr.yml`, replace the regex-based verdict parsing step with a Python step that runs `python scripts/langchain/verdict_extract.py --summary-path <path-to-provider-summary>`.
2. Ensure the step has an `id` (for example `verdict-policy`) so downstream steps can read the outputs.
3. The script writes step outputs to `GITHUB_OUTPUT` with keys: `verdict`, `needs_human`, `needs_human_reason`, `policy`, `verdict_kind`, `selected_provider`, `selected_model`, `selected_confidence`, `split_verdict`, `concerns_confidence`, and `verdict_metadata`.
4. Update downstream steps to consume `steps.<id>.outputs.verdict` and `steps.<id>.outputs.needs_human` instead of re-parsing the markdown table.
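For reference, the step outputs are written as plain `key=value` lines. A hypothetical run on a split verdict might append entries like the following to `GITHUB_OUTPUT` (all values illustrative only; provider and model names are placeholders, and the `verdict_metadata` JSON keys are assumed from the result fields shown in this PR):

```
verdict=Concerns
needs_human=true
needs_human_reason=Provider verdicts split with low-confidence concerns; requires human review.
policy=worst
verdict_kind=concerns
selected_provider=provider-b
selected_model=model-b
selected_confidence=60.0000
split_verdict=true
concerns_confidence=60.0000
verdict_metadata={"verdict": "Concerns", "policy": "worst", "needs_human": true}
```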
114 changes: 22 additions & 92 deletions scripts/langchain/followup_issue_generator.py
@@ -32,6 +32,8 @@
from pathlib import Path
from typing import Any

from scripts.langchain import verdict_policy

# Section alias handling aligned with issue_formatter/issue_optimizer.
SECTION_ALIASES = {
"why": ["why", "motivation", "summary", "goals"],
@@ -104,13 +106,6 @@ def _parse_confidence_value(text: str) -> int:
return int(round(value))


VERDICT_SEVERITY = {
"pass": 0,
"unknown": 1,
"concerns": 2,
"fail": 3,
}

ADVISORY_PATTERNS = [
r"\bnit\b",
r"\bnitpick\b",
@@ -153,49 +148,6 @@ def _parse_confidence_value(text: str) -> int:
]


def _classify_verdict(text: str) -> str:
"""Normalize verdict text into pass/concerns/fail/unknown."""
normalized = (text or "").strip().lower()
if not normalized:
return "unknown"
if "pass" in normalized:
return "pass"
if "fail" in normalized:
return "fail"
if "concern" in normalized or "needs work" in normalized or "not ready" in normalized:
return "concerns"
if "unknown" in normalized:
return "unknown"
return "concerns"


def _select_primary_verdict(provider_verdicts: dict[str, dict[str, Any]]) -> str:
"""Select the worst-case verdict with deterministic tie-breaking."""
if not provider_verdicts:
return "Unknown"

best_provider = None
best_severity = -1
best_confidence = -1
for provider, payload in provider_verdicts.items():
verdict_text = payload.get("verdict", "") or ""
verdict_kind = _classify_verdict(verdict_text)
severity = VERDICT_SEVERITY.get(verdict_kind, 1)
confidence = payload.get("confidence", 0) or 0
if severity > best_severity:
best_provider = provider
best_severity = severity
best_confidence = confidence
continue
if severity == best_severity and confidence > best_confidence:
best_provider = provider
best_confidence = confidence

if best_provider is None:
return "Unknown"
return provider_verdicts[best_provider].get("verdict", "Unknown") or "Unknown"


def _is_advisory_concern(concern: str) -> bool:
text = (concern or "").strip().lower()
if not text or concern == MISSING_CONCERNS_MESSAGE:
@@ -218,40 +170,20 @@ def _split_concerns(concerns: list[str]) -> tuple[list[str], list[str]]:
return blocking, advisory


def _split_verdicts(provider_verdicts: dict[str, dict[str, Any]]) -> tuple[bool, int]:
"""Return (is_split, max_non_pass_confidence)."""
if not provider_verdicts:
return False, 0
verdict_kinds = [
_classify_verdict(payload.get("verdict", "")) for payload in provider_verdicts.values()
]
has_pass = any(kind == "pass" for kind in verdict_kinds)
has_non_pass = any(kind in {"concerns", "fail"} for kind in verdict_kinds)
if not (has_pass and has_non_pass):
return False, 0
max_confidence = 0
for payload in provider_verdicts.values():
if _classify_verdict(payload.get("verdict", "")) in {"concerns", "fail"}:
max_confidence = max(max_confidence, payload.get("confidence", 0) or 0)
return True, max_confidence


def _needs_human_due_to_split(
provider_verdicts: dict[str, dict[str, Any]],
*,
confidence_threshold: int = 85,
) -> tuple[bool, str]:
is_split, max_confidence = _split_verdicts(provider_verdicts)
if not is_split:
return False, ""
if max_confidence >= confidence_threshold:
return False, ""
return (
True,
"Provider verdicts split with low-confidence concerns; "
f"dissenting confidence {max_confidence}% < {confidence_threshold}%. "
"Requires human review before starting another automated follow-up.",
)
def _resolve_verdict_policy(
verification_data: VerificationData,
) -> verdict_policy.VerdictPolicyResult:
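    """Map raw provider verdict payloads onto the shared verdict policy."""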
verdicts: list[verdict_policy.ProviderVerdict] = []
for provider, payload in verification_data.provider_verdicts.items():
verdicts.append(
verdict_policy.ProviderVerdict(
provider=provider,
model=payload.get("model", "") or "",
verdict=payload.get("verdict", "") or "",
confidence=float(payload.get("confidence", 0) or 0),
)
)
return verdict_policy.evaluate_verdict_policy(verdicts, policy="worst")


# Pre-computed normalized aliases for efficient section resolution.
@@ -1057,8 +989,10 @@ def generate_followup_issue(
4. Format the final issue
"""
blocking_concerns, advisory_concerns = _split_concerns(verification_data.concerns)
needs_human, needs_human_reason = _needs_human_due_to_split(verification_data.provider_verdicts)
verdict = _get_primary_verdict(verification_data)
policy_result = _resolve_verdict_policy(verification_data)
needs_human = policy_result.needs_human
needs_human_reason = policy_result.needs_human_reason
verdict = policy_result.verdict

if needs_human:
return _generate_without_llm(
@@ -1450,7 +1384,8 @@ def _build_why_section(
needs_human_reason: str | None = None,
) -> str:
"""Build the Why section explaining the follow-up context."""
verdict = verdict or _get_primary_verdict(verification_data)
if verdict is None:
verdict = _resolve_verdict_policy(verification_data).verdict

parts = [
f"PR #{pr_number} addressed issue #{original_issue.number} but verification "
@@ -1480,11 +1415,6 @@
return " ".join(parts)


def _get_primary_verdict(verification_data: VerificationData) -> str:
"""Get the primary verdict from verification data."""
return _select_primary_verdict(verification_data.provider_verdicts)


def main() -> int:
parser = argparse.ArgumentParser(
description="Generate follow-up issue from verification feedback."
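For orientation, here is a minimal sketch of the shared policy API this refactor delegates to, assuming `verdict_policy` exposes `ProviderVerdict` and `evaluate_verdict_policy` exactly as used in the diff above; the expectation that a low-confidence split escalates to a human mirrors the deleted `_needs_human_due_to_split` helper and is an assumption about the shared module's behavior:

```python
# Sketch only: exercise the shared verdict policy on a hypothetical
# split verdict (one pass, one low-confidence concerns).
from scripts.langchain import verdict_policy

verdicts = [
    verdict_policy.ProviderVerdict(
        provider="provider-a", model="model-a", verdict="Pass", confidence=92.0
    ),
    verdict_policy.ProviderVerdict(
        provider="provider-b", model="model-b", verdict="Concerns", confidence=60.0
    ),
]

result = verdict_policy.evaluate_verdict_policy(verdicts, policy="worst")

# Under the "worst" policy the dissenting Concerns verdict should win, and a
# low-confidence split is expected to request human review.
print(result.verdict)            # e.g. "Concerns"
print(result.needs_human)        # e.g. True
print(result.needs_human_reason)
```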
112 changes: 112 additions & 0 deletions scripts/langchain/verdict_extract.py
@@ -0,0 +1,112 @@
"""Entrypoint for deterministic verdict extraction with structured outputs."""

from __future__ import annotations

import argparse
import json
import os
import sys
from typing import TypedDict

from scripts.langchain import verdict_policy


def build_verdict_result(
summary: str,
*,
policy: str = "worst",
) -> verdict_policy.VerdictPolicyResult:
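    """Evaluate the markdown summary under the given verdict policy."""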
return verdict_policy.evaluate_summary(summary, policy=policy)


class VerdictGithubOutputs(TypedDict):
verdict: str
needs_human: str
needs_human_reason: str
policy: str
verdict_kind: str
selected_provider: str
selected_model: str
selected_confidence: str
split_verdict: str
concerns_confidence: str
verdict_metadata: str


def _build_github_outputs(result: verdict_policy.VerdictPolicyResult) -> VerdictGithubOutputs:
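    """Flatten a VerdictPolicyResult into string-valued step outputs."""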
return {
"verdict": result.verdict,
"needs_human": str(result.needs_human).lower(),
"needs_human_reason": result.needs_human_reason,
"policy": result.policy,
"verdict_kind": result.verdict_kind,
"selected_provider": result.selected_provider or "",
"selected_model": result.selected_model or "",
"selected_confidence": (
f"{result.selected_confidence:.4f}" if result.selected_confidence is not None else ""
),
"split_verdict": str(result.split_verdict).lower(),
"concerns_confidence": (
f"{result.concerns_confidence:.4f}" if result.concerns_confidence is not None else ""
),
"verdict_metadata": json.dumps(result.as_dict()),
}


def _write_github_outputs(result: verdict_policy.VerdictPolicyResult, output_path: str) -> None:
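    """Append the computed outputs as key=value lines to GITHUB_OUTPUT."""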
outputs = _build_github_outputs(result)

with open(output_path, "a", encoding="utf-8") as handle:
for key, value in outputs.items():
handle.write(f"{key}={value}\n")


def main(argv: list[str] | None = None) -> int:
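    """CLI entrypoint: read the summary, apply the policy, emit results."""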
parser = argparse.ArgumentParser(
description="Extract a deterministic verdict and emit structured outputs."
)
parser.add_argument(
"--summary-path",
required=True,
help="Path to the markdown summary (use '-' for stdin).",
)
parser.add_argument(
"--policy",
choices=["worst", "majority"],
default="worst",
help="Policy used to resolve split provider verdicts.",
)
parser.add_argument(
"--emit",
choices=["github", "json", "verdict"],
default="github",
help="Output format for results.",
)
args = parser.parse_args(argv)

summary = verdict_policy._read_summary(args.summary_path)
result = build_verdict_result(summary, policy=args.policy)

if args.emit == "verdict":
print(result.verdict)
return 0

if args.emit == "json":
print(json.dumps(result.as_dict(), indent=2))
return 0

github_output = os.environ.get("GITHUB_OUTPUT", "")
if not github_output:
print(
"GITHUB_OUTPUT is not set; falling back to JSON on stdout.",
file=sys.stderr,
)
print(json.dumps(result.as_dict(), indent=2))
return 0

_write_github_outputs(result, github_output)
return 0


if __name__ == "__main__":
raise SystemExit(main())
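A quick local usage sketch, assuming the repository root is on `PYTHONPATH`; `summary.md` is a hypothetical path to a provider summary, and `--emit json` prints the structured result to stdout, which is convenient outside GitHub Actions:

```python
# Sketch only: invoke the extractor the way the workflow step would,
# but emit JSON locally instead of writing GitHub step outputs.
from scripts.langchain import verdict_extract

exit_code = verdict_extract.main(
    ["--summary-path", "summary.md", "--emit", "json"]  # summary.md is hypothetical
)
assert exit_code == 0
```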