671 changes: 671 additions & 0 deletions .agents/issue-1427-ledger.yml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions docs/workflow-integration-guide.md
@@ -32,3 +32,9 @@ Note: The pip cache snippet uses `matrix.python-version`. If a job does not alre
2. Insert the cache step from `docs/workflow-snippets/pip-cache-step.yml` immediately after line 96.
3. Replace the `Install dependencies` step at lines 98-101 with the contents of `docs/workflow-snippets/agents-verify-to-new-pr-install.yml`.
4. Insert the pip freeze step from `docs/workflow-snippets/pip-freeze-step.yml` immediately after the install step you added in step 3.

**Deterministic verdict extraction**
1. In `.github/workflows/agents-verify-to-new-pr.yml`, replace the regex-based verdict parsing step with a Python step that runs `python scripts/langchain/verdict_extract.py --summary-path <path-to-provider-summary>`.
2. Ensure the step has an `id` (for example `verdict-policy`) so downstream steps can read the outputs.
3. The script writes step outputs to `GITHUB_OUTPUT` with keys: `verdict`, `needs_human`, `needs_human_reason`, `policy`, `verdict_kind`, `selected_provider`, `selected_model`, `selected_confidence`, `split_verdict`, `concerns_confidence`, and `verdict_metadata`.
4. Update downstream steps to consume `steps.<id>.outputs.verdict` and `steps.<id>.outputs.needs_human` instead of re-parsing the markdown table.
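For reference, the step outputs are written as plain `key=value` lines. A hypothetical run on a split verdict might append entries like the following to `GITHUB_OUTPUT` (all values illustrative only; provider and model names are placeholders, and the `verdict_metadata` JSON keys are assumed from the result fields shown in this PR):

```
verdict=Concerns
needs_human=true
needs_human_reason=Provider verdicts split with low-confidence concerns; requires human review.
policy=worst
verdict_kind=concerns
selected_provider=provider-b
selected_model=model-b
selected_confidence=60.0000
split_verdict=true
concerns_confidence=60.0000
verdict_metadata={"verdict": "Concerns", "policy": "worst", "needs_human": true}
```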
114 changes: 22 additions & 92 deletions scripts/langchain/followup_issue_generator.py
@@ -32,6 +32,8 @@
from pathlib import Path
from typing import Any

from scripts.langchain import verdict_policy

# Section alias handling aligned with issue_formatter/issue_optimizer.
SECTION_ALIASES = {
"why": ["why", "motivation", "summary", "goals"],
@@ -104,13 +106,6 @@ def _parse_confidence_value(text: str) -> int:
return int(round(value))


VERDICT_SEVERITY = {
"pass": 0,
"unknown": 1,
"concerns": 2,
"fail": 3,
}

ADVISORY_PATTERNS = [
r"\bnit\b",
r"\bnitpick\b",
@@ -153,49 +148,6 @@ def _parse_confidence_value(text: str) -> int:
]


def _classify_verdict(text: str) -> str:
"""Normalize verdict text into pass/concerns/fail/unknown."""
normalized = (text or "").strip().lower()
if not normalized:
return "unknown"
if "pass" in normalized:
return "pass"
if "fail" in normalized:
return "fail"
if "concern" in normalized or "needs work" in normalized or "not ready" in normalized:
return "concerns"
if "unknown" in normalized:
return "unknown"
return "concerns"


def _select_primary_verdict(provider_verdicts: dict[str, dict[str, Any]]) -> str:
"""Select the worst-case verdict with deterministic tie-breaking."""
if not provider_verdicts:
return "Unknown"

best_provider = None
best_severity = -1
best_confidence = -1
for provider, payload in provider_verdicts.items():
verdict_text = payload.get("verdict", "") or ""
verdict_kind = _classify_verdict(verdict_text)
severity = VERDICT_SEVERITY.get(verdict_kind, 1)
confidence = payload.get("confidence", 0) or 0
if severity > best_severity:
best_provider = provider
best_severity = severity
best_confidence = confidence
continue
if severity == best_severity and confidence > best_confidence:
best_provider = provider
best_confidence = confidence

if best_provider is None:
return "Unknown"
return provider_verdicts[best_provider].get("verdict", "Unknown") or "Unknown"


def _is_advisory_concern(concern: str) -> bool:
text = (concern or "").strip().lower()
if not text or concern == MISSING_CONCERNS_MESSAGE:
@@ -218,40 +170,20 @@ def _split_concerns(concerns: list[str]) -> tuple[list[str], list[str]]:
return blocking, advisory


def _split_verdicts(provider_verdicts: dict[str, dict[str, Any]]) -> tuple[bool, int]:
"""Return (is_split, max_non_pass_confidence)."""
if not provider_verdicts:
return False, 0
verdict_kinds = [
_classify_verdict(payload.get("verdict", "")) for payload in provider_verdicts.values()
]
has_pass = any(kind == "pass" for kind in verdict_kinds)
has_non_pass = any(kind in {"concerns", "fail"} for kind in verdict_kinds)
if not (has_pass and has_non_pass):
return False, 0
max_confidence = 0
for payload in provider_verdicts.values():
if _classify_verdict(payload.get("verdict", "")) in {"concerns", "fail"}:
max_confidence = max(max_confidence, payload.get("confidence", 0) or 0)
return True, max_confidence


def _needs_human_due_to_split(
provider_verdicts: dict[str, dict[str, Any]],
*,
confidence_threshold: int = 85,
) -> tuple[bool, str]:
is_split, max_confidence = _split_verdicts(provider_verdicts)
if not is_split:
return False, ""
if max_confidence >= confidence_threshold:
return False, ""
return (
True,
"Provider verdicts split with low-confidence concerns; "
f"dissenting confidence {max_confidence}% < {confidence_threshold}%. "
"Requires human review before starting another automated follow-up.",
)
def _resolve_verdict_policy(
verification_data: VerificationData,
) -> verdict_policy.VerdictPolicyResult:
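    """Map raw provider verdict payloads onto the shared verdict policy."""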
verdicts: list[verdict_policy.ProviderVerdict] = []
for provider, payload in verification_data.provider_verdicts.items():
verdicts.append(
verdict_policy.ProviderVerdict(
provider=provider,
model=payload.get("model", "") or "",
verdict=payload.get("verdict", "") or "",
confidence=float(payload.get("confidence", 0) or 0),
)
)
return verdict_policy.evaluate_verdict_policy(verdicts, policy="worst")


# Pre-computed normalized aliases for efficient section resolution.
@@ -1057,8 +989,10 @@ def generate_followup_issue(
4. Format the final issue
"""
blocking_concerns, advisory_concerns = _split_concerns(verification_data.concerns)
needs_human, needs_human_reason = _needs_human_due_to_split(verification_data.provider_verdicts)
verdict = _get_primary_verdict(verification_data)
policy_result = _resolve_verdict_policy(verification_data)
needs_human = policy_result.needs_human
needs_human_reason = policy_result.needs_human_reason
verdict = policy_result.verdict

if needs_human:
return _generate_without_llm(
@@ -1450,7 +1384,8 @@ def _build_why_section(
needs_human_reason: str | None = None,
) -> str:
"""Build the Why section explaining the follow-up context."""
verdict = verdict or _get_primary_verdict(verification_data)
if verdict is None:
verdict = _resolve_verdict_policy(verification_data).verdict

parts = [
f"PR #{pr_number} addressed issue #{original_issue.number} but verification "
@@ -1480,11 +1415,6 @@
return " ".join(parts)


def _get_primary_verdict(verification_data: VerificationData) -> str:
"""Get the primary verdict from verification data."""
return _select_primary_verdict(verification_data.provider_verdicts)


def main() -> int:
parser = argparse.ArgumentParser(
description="Generate follow-up issue from verification feedback."
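For orientation, here is a minimal sketch of the shared policy API this refactor delegates to, assuming `verdict_policy` exposes `ProviderVerdict` and `evaluate_verdict_policy` exactly as used in the diff above; the expectation that a low-confidence split escalates to a human mirrors the deleted `_needs_human_due_to_split` helper and is an assumption about the shared module's behavior:

```python
# Sketch only: exercise the shared verdict policy on a hypothetical
# split verdict (one pass, one low-confidence concerns).
from scripts.langchain import verdict_policy

verdicts = [
    verdict_policy.ProviderVerdict(
        provider="provider-a", model="model-a", verdict="Pass", confidence=92.0
    ),
    verdict_policy.ProviderVerdict(
        provider="provider-b", model="model-b", verdict="Concerns", confidence=60.0
    ),
]

result = verdict_policy.evaluate_verdict_policy(verdicts, policy="worst")

# Under the "worst" policy the dissenting Concerns verdict should win, and a
# low-confidence split is expected to request human review.
print(result.verdict)            # e.g. "Concerns"
print(result.needs_human)        # e.g. True
print(result.needs_human_reason)
```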
112 changes: 112 additions & 0 deletions scripts/langchain/verdict_extract.py
@@ -0,0 +1,112 @@
"""Entrypoint for deterministic verdict extraction with structured outputs."""

from __future__ import annotations

import argparse
import json
import os
import sys
from typing import TypedDict

from scripts.langchain import verdict_policy


def build_verdict_result(
summary: str,
*,
policy: str = "worst",
) -> verdict_policy.VerdictPolicyResult:
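    """Evaluate the markdown summary under the given verdict policy."""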
return verdict_policy.evaluate_summary(summary, policy=policy)


class VerdictGithubOutputs(TypedDict):
verdict: str
needs_human: str
needs_human_reason: str
policy: str
verdict_kind: str
selected_provider: str
selected_model: str
selected_confidence: str
split_verdict: str
concerns_confidence: str
verdict_metadata: str


def _build_github_outputs(result: verdict_policy.VerdictPolicyResult) -> VerdictGithubOutputs:
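    """Flatten a VerdictPolicyResult into string-valued step outputs."""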
return {
"verdict": result.verdict,
"needs_human": str(result.needs_human).lower(),
"needs_human_reason": result.needs_human_reason,
"policy": result.policy,
"verdict_kind": result.verdict_kind,
"selected_provider": result.selected_provider or "",
"selected_model": result.selected_model or "",
"selected_confidence": (
f"{result.selected_confidence:.4f}" if result.selected_confidence is not None else ""
),
"split_verdict": str(result.split_verdict).lower(),
"concerns_confidence": (
f"{result.concerns_confidence:.4f}" if result.concerns_confidence is not None else ""
),
"verdict_metadata": json.dumps(result.as_dict()),
}


def _write_github_outputs(result: verdict_policy.VerdictPolicyResult, output_path: str) -> None:
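    """Append the computed outputs as key=value lines to GITHUB_OUTPUT."""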
outputs = _build_github_outputs(result)

with open(output_path, "a", encoding="utf-8") as handle:
for key, value in outputs.items():
handle.write(f"{key}={value}\n")


def main(argv: list[str] | None = None) -> int:
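    """CLI entrypoint: read the summary, apply the policy, emit results."""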
parser = argparse.ArgumentParser(
description="Extract a deterministic verdict and emit structured outputs."
)
parser.add_argument(
"--summary-path",
required=True,
help="Path to the markdown summary (use '-' for stdin).",
)
parser.add_argument(
"--policy",
choices=["worst", "majority"],
default="worst",
help="Policy used to resolve split provider verdicts.",
)
parser.add_argument(
"--emit",
choices=["github", "json", "verdict"],
default="github",
help="Output format for results.",
)
args = parser.parse_args(argv)

summary = verdict_policy._read_summary(args.summary_path)
result = build_verdict_result(summary, policy=args.policy)

if args.emit == "verdict":
print(result.verdict)
return 0

if args.emit == "json":
print(json.dumps(result.as_dict(), indent=2))
return 0

github_output = os.environ.get("GITHUB_OUTPUT", "")
if not github_output:
print(
"GITHUB_OUTPUT is not set; falling back to JSON on stdout.",
file=sys.stderr,
)
print(json.dumps(result.as_dict(), indent=2))
return 0

_write_github_outputs(result, github_output)
return 0


if __name__ == "__main__":
raise SystemExit(main())
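A quick local usage sketch, assuming the repository root is on `PYTHONPATH`; `summary.md` is a hypothetical path to a provider summary, and `--emit json` prints the structured result to stdout, which is convenient outside GitHub Actions:

```python
# Sketch only: invoke the extractor the way the workflow step would,
# but emit JSON locally instead of writing GitHub step outputs.
from scripts.langchain import verdict_extract

exit_code = verdict_extract.main(
    ["--summary-path", "summary.md", "--emit", "json"]  # summary.md is hypothetical
)
assert exit_code == 0
```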