diff --git a/.github/actions/setup-api-client/action.yml b/.github/actions/setup-api-client/action.yml index 3343ebfb5..109cc33a7 100644 --- a/.github/actions/setup-api-client/action.yml +++ b/.github/actions/setup-api-client/action.yml @@ -102,6 +102,22 @@ runs: if [ -d "node_modules/@octokit/rest" ]; then echo "✅ @octokit/rest already installed" else + # Snapshot vendored package metadata before npm install. + # npm may overwrite transitive deps (e.g. minimatch) that are + # committed as vendored packages with intentional version pins. + VENDORED_SNAPSHOT="" + if [ -f "node_modules/minimatch/package.json" ]; then + VENDORED_SNAPSHOT=$(mktemp -d) + for pkg_dir in node_modules/*/; do + if [ -f "${pkg_dir}package.json" ]; then + pkg_name=$(basename "$pkg_dir") + mkdir -p "${VENDORED_SNAPSHOT}/${pkg_name}" + cp "${pkg_dir}package.json" "${VENDORED_SNAPSHOT}/${pkg_name}/package.json" + fi + done + echo "📸 Snapshotted vendored package metadata" + fi + # Install with pinned versions for consistency # Capture stderr for debugging if the command fails npm_output=$(mktemp) @@ -122,6 +138,20 @@ runs: @octokit/plugin-paginate-rest@9.1.5 \ @octokit/auth-app@6.0.3 fi + + # Restore vendored package metadata that npm may have overwritten + if [ -n "${VENDORED_SNAPSHOT:-}" ] && [ -d "${VENDORED_SNAPSHOT}" ]; then + for pkg_backup in "${VENDORED_SNAPSHOT}"/*/; do + pkg_name=$(basename "$pkg_backup") + if [ -f "node_modules/${pkg_name}/package.json" ] && \ + [ -f "${pkg_backup}package.json" ]; then + cp "${pkg_backup}package.json" "node_modules/${pkg_name}/package.json" + fi + done + rm -rf "${VENDORED_SNAPSHOT}" + echo "📸 Restored vendored package metadata" + fi + echo "✅ @octokit dependencies installed" fi diff --git a/.github/workflows/reusable-18-autofix.yml b/.github/workflows/reusable-18-autofix.yml index 564e38157..b83a1f0f8 100644 --- a/.github/workflows/reusable-18-autofix.yml +++ b/.github/workflows/reusable-18-autofix.yml @@ -794,6 +794,8 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add -A + # Unstage vendored node_modules that may have been modified by npm install + git reset HEAD -- .github/scripts/node_modules node_modules .workflows-lib/.github/scripts/node_modules 2>/dev/null || true git commit -m "${AUTOFIX_COMMIT_PREFIX} formatting/lint" echo "AUTOFIX_COMMIT_SHA=$(git rev-parse HEAD)" >> "$GITHUB_ENV" @@ -865,6 +867,8 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add -A + # Unstage vendored node_modules that may have been modified by npm install + git reset HEAD -- .github/scripts/node_modules node_modules .workflows-lib/.github/scripts/node_modules 2>/dev/null || true git commit -m "${AUTOFIX_COMMIT_PREFIX} formatting/lint (patch)" || true git format-patch -1 --stdout > autofix.patch diff --git a/.github/workflows/reusable-agents-issue-bridge.yml b/.github/workflows/reusable-agents-issue-bridge.yml index d193a16ce..16dea6053 100644 --- a/.github/workflows/reusable-agents-issue-bridge.yml +++ b/.github/workflows/reusable-agents-issue-bridge.yml @@ -580,7 +580,9 @@ jobs: git checkout -B "$HEAD_BRANCH" "origin/${BASE_BRANCH}" mkdir -p agents printf "\n" "$AGENT" "$ISSUE_NUM" > "agents/${AGENT}-${ISSUE_NUM}.md" - git add -A || true + # Stage only the intended bootstrap file — 'git add -A' would capture + # vendored node_modules changes made by setup-api-client npm install. + git add "agents/${AGENT}-${ISSUE_NUM}.md" || true if ! git diff --cached --quiet; then git commit -m "chore(${AGENT}): bootstrap PR for issue #${ISSUE_NUM}" else diff --git a/scripts/check_issue_consistency.py b/scripts/check_issue_consistency.py index 5e00ec95e..f69507e1d 100755 --- a/scripts/check_issue_consistency.py +++ b/scripts/check_issue_consistency.py @@ -260,6 +260,8 @@ def _is_fallback_error(message: str) -> bool: for snippet in ( "no merge base", "bad object", + "bad revision", + "invalid revision range", "invalid object name", "not a valid object name", "ambiguous argument", diff --git a/scripts/langchain/capability_check.py b/scripts/langchain/capability_check.py index 0ff398491..8ef5db729 100755 --- a/scripts/langchain/capability_check.py +++ b/scripts/langchain/capability_check.py @@ -161,8 +161,8 @@ def _is_multi_action_task(task: str) -> bool: def _requires_admin_access(task: str) -> bool: patterns = [ r"\bgithub\s+secrets?\b", - r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b", - r"\bsecrets?\s+(?:management|configuration|rotation)\b", + r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\b.{0,30}\bsecrets?\b", + r"\bsecrets?\b.{0,30}\b(?:management|configuration|rotation)\b", r"\brepository\s+settings\b", r"\brepo\s+settings\b", r"\bbranch\s+protection\b", diff --git a/scripts/langchain/verdict_policy.py b/scripts/langchain/verdict_policy.py index 809bdcadb..86d422350 100644 --- a/scripts/langchain/verdict_policy.py +++ b/scripts/langchain/verdict_policy.py @@ -16,7 +16,7 @@ "fail": 3, } -CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.50 +CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.85 @dataclass(frozen=True) @@ -193,11 +193,11 @@ def evaluate_verdict_policy( needs_human_reason = "" if split_verdict: confidence_value = concerns_confidence or 0.0 - if confidence_value < CONCERNS_NEEDS_HUMAN_THRESHOLD: + if confidence_value >= CONCERNS_NEEDS_HUMAN_THRESHOLD: needs_human = True needs_human_reason = ( - "Provider verdicts split with low-confidence concerns; " - f"dissenting confidence {confidence_value:.2f} < " + "Provider verdicts split with high-confidence concerns; " + f"dissenting confidence {confidence_value:.2f} >= " f"{CONCERNS_NEEDS_HUMAN_THRESHOLD:.2f}. " "Requires human review before starting another automated follow-up." ) diff --git a/templates/consumer-repo/.github/actions/setup-api-client/action.yml b/templates/consumer-repo/.github/actions/setup-api-client/action.yml index 3343ebfb5..109cc33a7 100644 --- a/templates/consumer-repo/.github/actions/setup-api-client/action.yml +++ b/templates/consumer-repo/.github/actions/setup-api-client/action.yml @@ -102,6 +102,22 @@ runs: if [ -d "node_modules/@octokit/rest" ]; then echo "✅ @octokit/rest already installed" else + # Snapshot vendored package metadata before npm install. + # npm may overwrite transitive deps (e.g. minimatch) that are + # committed as vendored packages with intentional version pins. + VENDORED_SNAPSHOT="" + if [ -f "node_modules/minimatch/package.json" ]; then + VENDORED_SNAPSHOT=$(mktemp -d) + for pkg_dir in node_modules/*/; do + if [ -f "${pkg_dir}package.json" ]; then + pkg_name=$(basename "$pkg_dir") + mkdir -p "${VENDORED_SNAPSHOT}/${pkg_name}" + cp "${pkg_dir}package.json" "${VENDORED_SNAPSHOT}/${pkg_name}/package.json" + fi + done + echo "📸 Snapshotted vendored package metadata" + fi + # Install with pinned versions for consistency # Capture stderr for debugging if the command fails npm_output=$(mktemp) @@ -122,6 +138,20 @@ runs: @octokit/plugin-paginate-rest@9.1.5 \ @octokit/auth-app@6.0.3 fi + + # Restore vendored package metadata that npm may have overwritten + if [ -n "${VENDORED_SNAPSHOT:-}" ] && [ -d "${VENDORED_SNAPSHOT}" ]; then + for pkg_backup in "${VENDORED_SNAPSHOT}"/*/; do + pkg_name=$(basename "$pkg_backup") + if [ -f "node_modules/${pkg_name}/package.json" ] && \ + [ -f "${pkg_backup}package.json" ]; then + cp "${pkg_backup}package.json" "node_modules/${pkg_name}/package.json" + fi + done + rm -rf "${VENDORED_SNAPSHOT}" + echo "📸 Restored vendored package metadata" + fi + echo "✅ @octokit dependencies installed" fi diff --git a/templates/consumer-repo/scripts/langchain/capability_check.py b/templates/consumer-repo/scripts/langchain/capability_check.py index b53886821..0bd8ab361 100755 --- a/templates/consumer-repo/scripts/langchain/capability_check.py +++ b/templates/consumer-repo/scripts/langchain/capability_check.py @@ -160,8 +160,8 @@ def _is_multi_action_task(task: str) -> bool: def _requires_admin_access(task: str) -> bool: patterns = [ r"\bgithub\s+secrets?\b", - r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b", - r"\bsecrets?\s+(?:management|configuration|rotation)\b", + r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\b.{0,30}\bsecrets?\b", + r"\bsecrets?\b.{0,30}\b(?:management|configuration|rotation)\b", r"\brepository\s+settings\b", r"\brepo\s+settings\b", r"\bbranch\s+protection\b", diff --git a/tests/scripts/test_capability_check.py b/tests/scripts/test_capability_check.py index e35788188..d462e057e 100644 --- a/tests/scripts/test_capability_check.py +++ b/tests/scripts/test_capability_check.py @@ -434,6 +434,21 @@ def test_fallback_flags_manage_secrets(self) -> None: assert result.recommendation == "BLOCKED" assert "admin" in result.blocked_tasks[0]["reason"].lower() + def test_fallback_flags_set_repository_secret(self) -> None: + """Regression: 'Set repository secret TOKEN' must be blocked even with + intervening words between the verb and 'secret'.""" + with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): + result = classify_capabilities(["Set repository secret TOKEN"], "") + assert result.recommendation == "BLOCKED" + assert "admin" in result.blocked_tasks[0]["reason"].lower() + + def test_fallback_flags_update_actions_secret(self) -> None: + """Regression: 'Update GitHub Actions secret FOO' must be blocked.""" + with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): + result = classify_capabilities(["Update GitHub Actions secret FOO"], "") + assert result.recommendation == "BLOCKED" + assert "admin" in result.blocked_tasks[0]["reason"].lower() + def test_fallback_suggests_decomposition(self) -> None: with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): result = classify_capabilities(["Refactor auth + add tests + update docs"], "") diff --git a/tests/scripts/test_issue_consistency_check.py b/tests/scripts/test_issue_consistency_check.py index 7c85b6263..020c4c54d 100644 --- a/tests/scripts/test_issue_consistency_check.py +++ b/tests/scripts/test_issue_consistency_check.py @@ -174,6 +174,27 @@ def fake_run_git(args: list[str]) -> str: assert calls == [["log"], ["log", "-n", "1"]] +def test_run_git_with_fallback_handles_bad_revision(monkeypatch) -> None: + calls = [] + + def fake_run_git(args: list[str]) -> str: + calls.append(args) + if args == ["log"]: + raise RuntimeError("fatal: bad revision 'deadbeef..HEAD'") + return "ok" + + monkeypatch.setattr(check_issue_consistency, "_run_git", fake_run_git) + + output, used_fallback = check_issue_consistency._run_git_with_fallback_and_flag( + ["log"], + ["log", "-n", "1"], + ) + + assert output == "ok" + assert used_fallback is True + assert calls == [["log"], ["log", "-n", "1"]] + + def test_resolve_base_sha_prefers_merge_base(monkeypatch) -> None: calls = [] diff --git a/tests/test_followup_issue_generator.py b/tests/test_followup_issue_generator.py index 7570cd103..c3f613be6 100755 --- a/tests/test_followup_issue_generator.py +++ b/tests/test_followup_issue_generator.py @@ -528,12 +528,12 @@ def test_advisory_concerns_are_notes(self): assert "- [ ] Address: Could add a clarifying comment" not in followup.body assert "## Notes" in followup.body - def test_split_low_confidence_requires_needs_human(self): - """Low-confidence split verdicts should trigger needs-human labeling.""" + def test_split_high_confidence_requires_needs_human(self): + """High-confidence CONCERNS in a split verdict should trigger needs-human labeling.""" verification_data = VerificationData( provider_verdicts={ "openai": {"verdict": "PASS", "confidence": 90}, - "anthropic": {"verdict": "CONCERNS", "confidence": 49}, + "anthropic": {"verdict": "CONCERNS", "confidence": 92}, }, concerns=["Missing test coverage"], ) diff --git a/tests/test_verdict_extract.py b/tests/test_verdict_extract.py index 4e7c70c7f..38d43456a 100644 --- a/tests/test_verdict_extract.py +++ b/tests/test_verdict_extract.py @@ -25,7 +25,7 @@ def _parse_github_output(raw: str) -> dict[str, str]: def test_verdict_extract_emits_structured_github_outputs(tmp_path): summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.90 | Missing edge case. |", ) result = verdict_extract.build_verdict_result(summary, policy="worst") output_path = tmp_path / "github_output.txt" diff --git a/tests/test_verdict_policy.py b/tests/test_verdict_policy.py index 7e134a5d5..bfe631e05 100755 --- a/tests/test_verdict_policy.py +++ b/tests/test_verdict_policy.py @@ -51,6 +51,7 @@ def test_select_verdict_majority_policy(): def test_needs_human_threshold_boundary(): + """At exactly the threshold, needs_human should fire (>= comparison).""" verdicts = [ ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92), ProviderVerdict( @@ -60,30 +61,30 @@ def test_needs_human_threshold_boundary(): result = evaluate_verdict_policy(verdicts, policy="worst") - assert result.needs_human is False + assert result.needs_human is True -def test_needs_human_true_below_threshold(): - """Concerns below the threshold should trigger needs_human.""" +def test_needs_human_true_above_threshold(): + """Concerns above the threshold should trigger needs_human.""" verdicts = [ ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92), - ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.40), + ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.90), ] result = evaluate_verdict_policy(verdicts, policy="worst") assert result.needs_human is True assert result.split_verdict is True - assert "low-confidence" in result.needs_human_reason + assert "high-confidence" in result.needs_human_reason def test_moderate_confidence_concerns_do_not_block(): - """Regression: 72% concerns in a split verdict should not trigger needs_human. + """Moderate-confidence concerns in a split verdict should not trigger needs_human. - Previously CONCERNS_NEEDS_HUMAN_THRESHOLD was 0.85, which caused any - split verdict with <85% concerns to be flagged. The lowered threshold - (0.50) allows moderate-confidence concerns to proceed with automatic - follow-up creation. + needs_human only fires when the CONCERNS provider is highly confident + (>= 0.85), indicating the LLM is quite sure there are real problems. + Moderate confidence means the LLM is uncertain — that's a weaker signal + and shouldn't block follow-up automation. """ verdicts = [ ProviderVerdict("openai", "gpt-5.2", "CONCERNS", 72), diff --git a/tests/test_verdict_policy_integration.py b/tests/test_verdict_policy_integration.py index c5b29fde3..40562e48f 100755 --- a/tests/test_verdict_policy_integration.py +++ b/tests/test_verdict_policy_integration.py @@ -23,7 +23,8 @@ def _build_summary(*rows: str) -> str: return f"## Provider Summary\n\n{header}{body}\n" -def test_split_verdict_confidence_boundary_needs_human_false(): +def test_split_verdict_confidence_boundary_needs_human_true(): + """At exactly the threshold (0.85), needs_human should be True.""" summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.85 | Missing edge case. |", @@ -33,22 +34,23 @@ def test_split_verdict_confidence_boundary_needs_human_false(): followup_result = _followup_result(summary) assert workflow_result.verdict == followup_result.verdict == "CONCERNS" - assert workflow_result.needs_human is False - assert followup_result.needs_human is False + assert workflow_result.needs_human is True + assert followup_result.needs_human is True -def test_split_verdict_low_confidence_needs_human_true(): +def test_split_verdict_low_confidence_needs_human_false(): + """Below threshold, low-confidence concerns should NOT trigger needs_human.""" summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.40 | Missing edge case. |", ) workflow_result = _workflow_result(summary) followup_result = _followup_result(summary) assert workflow_result.verdict == followup_result.verdict == "CONCERNS" - assert workflow_result.needs_human is True - assert followup_result.needs_human is True + assert workflow_result.needs_human is False + assert followup_result.needs_human is False def test_split_verdict_row_order_invariance():