Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/actions/setup-api-client/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,22 @@ runs:
if [ -d "node_modules/@octokit/rest" ]; then
echo "✅ @octokit/rest already installed"
else
# Snapshot vendored package metadata before npm install.
# npm may overwrite transitive deps (e.g. minimatch) that are
# committed as vendored packages with intentional version pins.
VENDORED_SNAPSHOT=""
if [ -f "node_modules/minimatch/package.json" ]; then
VENDORED_SNAPSHOT=$(mktemp -d)
for pkg_dir in node_modules/*/; do
if [ -f "${pkg_dir}package.json" ]; then
pkg_name=$(basename "$pkg_dir")
mkdir -p "${VENDORED_SNAPSHOT}/${pkg_name}"
cp "${pkg_dir}package.json" "${VENDORED_SNAPSHOT}/${pkg_name}/package.json"
fi
done
echo "📸 Snapshotted vendored package metadata"
fi

# Install with pinned versions for consistency
# Capture stderr for debugging if the command fails
npm_output=$(mktemp)
Expand All @@ -122,6 +138,20 @@ runs:
@octokit/plugin-paginate-rest@9.1.5 \
@octokit/auth-app@6.0.3
fi

# Restore vendored package metadata that npm may have overwritten
if [ -n "${VENDORED_SNAPSHOT:-}" ] && [ -d "${VENDORED_SNAPSHOT}" ]; then
for pkg_backup in "${VENDORED_SNAPSHOT}"/*/; do
pkg_name=$(basename "$pkg_backup")
if [ -f "node_modules/${pkg_name}/package.json" ] && \
[ -f "${pkg_backup}package.json" ]; then
cp "${pkg_backup}package.json" "node_modules/${pkg_name}/package.json"
fi
done
rm -rf "${VENDORED_SNAPSHOT}"
echo "📸 Restored vendored package metadata"
fi

echo "✅ @octokit dependencies installed"
fi

Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/reusable-18-autofix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,8 @@ jobs:
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git add -A
# Unstage vendored node_modules that may have been modified by npm install
git reset HEAD -- .github/scripts/node_modules node_modules .workflows-lib/.github/scripts/node_modules 2>/dev/null || true
git commit -m "${AUTOFIX_COMMIT_PREFIX} formatting/lint"
echo "AUTOFIX_COMMIT_SHA=$(git rev-parse HEAD)" >> "$GITHUB_ENV"

Expand Down Expand Up @@ -865,6 +867,8 @@ jobs:
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git add -A
# Unstage vendored node_modules that may have been modified by npm install
git reset HEAD -- .github/scripts/node_modules node_modules .workflows-lib/.github/scripts/node_modules 2>/dev/null || true
git commit -m "${AUTOFIX_COMMIT_PREFIX} formatting/lint (patch)" || true
git format-patch -1 --stdout > autofix.patch

Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/reusable-agents-issue-bridge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,9 @@ jobs:
git checkout -B "$HEAD_BRANCH" "origin/${BASE_BRANCH}"
mkdir -p agents
printf "<!-- bootstrap for %s on issue #%s -->\n" "$AGENT" "$ISSUE_NUM" > "agents/${AGENT}-${ISSUE_NUM}.md"
git add -A || true
# Stage only the intended bootstrap file — 'git add -A' would capture
# vendored node_modules changes made by setup-api-client npm install.
git add "agents/${AGENT}-${ISSUE_NUM}.md" || true
if ! git diff --cached --quiet; then
git commit -m "chore(${AGENT}): bootstrap PR for issue #${ISSUE_NUM}"
else
Expand Down
2 changes: 2 additions & 0 deletions scripts/check_issue_consistency.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ def _is_fallback_error(message: str) -> bool:
for snippet in (
"no merge base",
"bad object",
"bad revision",
"invalid revision range",
"invalid object name",
"not a valid object name",
"ambiguous argument",
Expand Down
4 changes: 2 additions & 2 deletions scripts/langchain/capability_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ def _is_multi_action_task(task: str) -> bool:
def _requires_admin_access(task: str) -> bool:
patterns = [
r"\bgithub\s+secrets?\b",
r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b",
r"\bsecrets?\s+(?:management|configuration|rotation)\b",
r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\b.{0,30}\bsecrets?\b",
r"\bsecrets?\b.{0,30}\b(?:management|configuration|rotation)\b",
r"\brepository\s+settings\b",
r"\brepo\s+settings\b",
r"\bbranch\s+protection\b",
Expand Down
8 changes: 4 additions & 4 deletions scripts/langchain/verdict_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"fail": 3,
}

CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.50
CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.85


@dataclass(frozen=True)
Expand Down Expand Up @@ -193,11 +193,11 @@ def evaluate_verdict_policy(
needs_human_reason = ""
if split_verdict:
confidence_value = concerns_confidence or 0.0
if confidence_value < CONCERNS_NEEDS_HUMAN_THRESHOLD:
if confidence_value >= CONCERNS_NEEDS_HUMAN_THRESHOLD:
needs_human = True
needs_human_reason = (
"Provider verdicts split with low-confidence concerns; "
f"dissenting confidence {confidence_value:.2f} < "
"Provider verdicts split with high-confidence concerns; "
f"dissenting confidence {confidence_value:.2f} >= "
f"{CONCERNS_NEEDS_HUMAN_THRESHOLD:.2f}. "
"Requires human review before starting another automated follow-up."
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,22 @@ runs:
if [ -d "node_modules/@octokit/rest" ]; then
echo "✅ @octokit/rest already installed"
else
# Snapshot vendored package metadata before npm install.
# npm may overwrite transitive deps (e.g. minimatch) that are
# committed as vendored packages with intentional version pins.
VENDORED_SNAPSHOT=""
if [ -f "node_modules/minimatch/package.json" ]; then
VENDORED_SNAPSHOT=$(mktemp -d)
for pkg_dir in node_modules/*/; do
if [ -f "${pkg_dir}package.json" ]; then
pkg_name=$(basename "$pkg_dir")
mkdir -p "${VENDORED_SNAPSHOT}/${pkg_name}"
cp "${pkg_dir}package.json" "${VENDORED_SNAPSHOT}/${pkg_name}/package.json"
fi
done
echo "📸 Snapshotted vendored package metadata"
fi

# Install with pinned versions for consistency
# Capture stderr for debugging if the command fails
npm_output=$(mktemp)
Expand All @@ -122,6 +138,20 @@ runs:
@octokit/plugin-paginate-rest@9.1.5 \
@octokit/auth-app@6.0.3
fi

# Restore vendored package metadata that npm may have overwritten
if [ -n "${VENDORED_SNAPSHOT:-}" ] && [ -d "${VENDORED_SNAPSHOT}" ]; then
for pkg_backup in "${VENDORED_SNAPSHOT}"/*/; do
pkg_name=$(basename "$pkg_backup")
if [ -f "node_modules/${pkg_name}/package.json" ] && \
[ -f "${pkg_backup}package.json" ]; then
cp "${pkg_backup}package.json" "node_modules/${pkg_name}/package.json"
fi
done
rm -rf "${VENDORED_SNAPSHOT}"
echo "📸 Restored vendored package metadata"
fi

echo "✅ @octokit dependencies installed"
fi

Expand Down
4 changes: 2 additions & 2 deletions templates/consumer-repo/scripts/langchain/capability_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ def _is_multi_action_task(task: str) -> bool:
def _requires_admin_access(task: str) -> bool:
patterns = [
r"\bgithub\s+secrets?\b",
r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b",
r"\bsecrets?\s+(?:management|configuration|rotation)\b",
r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\b.{0,30}\bsecrets?\b",
r"\bsecrets?\b.{0,30}\b(?:management|configuration|rotation)\b",
r"\brepository\s+settings\b",
r"\brepo\s+settings\b",
r"\bbranch\s+protection\b",
Expand Down
15 changes: 15 additions & 0 deletions tests/scripts/test_capability_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,21 @@ def test_fallback_flags_manage_secrets(self) -> None:
assert result.recommendation == "BLOCKED"
assert "admin" in result.blocked_tasks[0]["reason"].lower()

def test_fallback_flags_set_repository_secret(self) -> None:
"""Regression: 'Set repository secret TOKEN' must be blocked even with
intervening words between the verb and 'secret'."""
with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None):
result = classify_capabilities(["Set repository secret TOKEN"], "")
assert result.recommendation == "BLOCKED"
assert "admin" in result.blocked_tasks[0]["reason"].lower()

def test_fallback_flags_update_actions_secret(self) -> None:
"""Regression: 'Update GitHub Actions secret FOO' must be blocked."""
with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None):
result = classify_capabilities(["Update GitHub Actions secret FOO"], "")
assert result.recommendation == "BLOCKED"
assert "admin" in result.blocked_tasks[0]["reason"].lower()

def test_fallback_suggests_decomposition(self) -> None:
with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None):
result = classify_capabilities(["Refactor auth + add tests + update docs"], "")
Expand Down
21 changes: 21 additions & 0 deletions tests/scripts/test_issue_consistency_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,27 @@ def fake_run_git(args: list[str]) -> str:
assert calls == [["log"], ["log", "-n", "1"]]


def test_run_git_with_fallback_handles_bad_revision(monkeypatch) -> None:
calls = []

def fake_run_git(args: list[str]) -> str:
calls.append(args)
if args == ["log"]:
raise RuntimeError("fatal: bad revision 'deadbeef..HEAD'")
return "ok"

monkeypatch.setattr(check_issue_consistency, "_run_git", fake_run_git)

output, used_fallback = check_issue_consistency._run_git_with_fallback_and_flag(
["log"],
["log", "-n", "1"],
)

assert output == "ok"
assert used_fallback is True
assert calls == [["log"], ["log", "-n", "1"]]


def test_resolve_base_sha_prefers_merge_base(monkeypatch) -> None:
calls = []

Expand Down
6 changes: 3 additions & 3 deletions tests/test_followup_issue_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,12 +528,12 @@ def test_advisory_concerns_are_notes(self):
assert "- [ ] Address: Could add a clarifying comment" not in followup.body
assert "## Notes" in followup.body

def test_split_low_confidence_requires_needs_human(self):
"""Low-confidence split verdicts should trigger needs-human labeling."""
def test_split_high_confidence_requires_needs_human(self):
"""High-confidence CONCERNS in a split verdict should trigger needs-human labeling."""
verification_data = VerificationData(
provider_verdicts={
"openai": {"verdict": "PASS", "confidence": 90},
"anthropic": {"verdict": "CONCERNS", "confidence": 49},
"anthropic": {"verdict": "CONCERNS", "confidence": 92},
},
concerns=["Missing test coverage"],
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_verdict_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def _parse_github_output(raw: str) -> dict[str, str]:
def test_verdict_extract_emits_structured_github_outputs(tmp_path):
summary = _build_summary(
"| openai | gpt-5.2 | PASS | 0.92 | Looks good. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.90 | Missing edge case. |",
)
result = verdict_extract.build_verdict_result(summary, policy="worst")
output_path = tmp_path / "github_output.txt"
Expand Down
21 changes: 11 additions & 10 deletions tests/test_verdict_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_select_verdict_majority_policy():


def test_needs_human_threshold_boundary():
"""At exactly the threshold, needs_human should fire (>= comparison)."""
verdicts = [
ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92),
ProviderVerdict(
Expand All @@ -60,30 +61,30 @@ def test_needs_human_threshold_boundary():

result = evaluate_verdict_policy(verdicts, policy="worst")

assert result.needs_human is False
assert result.needs_human is True


def test_needs_human_true_below_threshold():
"""Concerns below the threshold should trigger needs_human."""
def test_needs_human_true_above_threshold():
"""Concerns above the threshold should trigger needs_human."""
verdicts = [
ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92),
ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.40),
ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.90),
]

result = evaluate_verdict_policy(verdicts, policy="worst")

assert result.needs_human is True
assert result.split_verdict is True
assert "low-confidence" in result.needs_human_reason
assert "high-confidence" in result.needs_human_reason


def test_moderate_confidence_concerns_do_not_block():
"""Regression: 72% concerns in a split verdict should not trigger needs_human.
"""Moderate-confidence concerns in a split verdict should not trigger needs_human.

Previously CONCERNS_NEEDS_HUMAN_THRESHOLD was 0.85, which caused any
split verdict with <85% concerns to be flagged. The lowered threshold
(0.50) allows moderate-confidence concerns to proceed with automatic
follow-up creation.
needs_human only fires when the CONCERNS provider is highly confident
(>= 0.85), indicating the LLM is quite sure there are real problems.
Moderate confidence means the LLM is uncertain — that's a weaker signal
and shouldn't block follow-up automation.
"""
verdicts = [
ProviderVerdict("openai", "gpt-5.2", "CONCERNS", 72),
Expand Down
16 changes: 9 additions & 7 deletions tests/test_verdict_policy_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def _build_summary(*rows: str) -> str:
return f"## Provider Summary\n\n{header}{body}\n"


def test_split_verdict_confidence_boundary_needs_human_false():
def test_split_verdict_confidence_boundary_needs_human_true():
"""At exactly the threshold (0.85), needs_human should be True."""
summary = _build_summary(
"| openai | gpt-5.2 | PASS | 0.92 | Looks good. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.85 | Missing edge case. |",
Expand All @@ -33,22 +34,23 @@ def test_split_verdict_confidence_boundary_needs_human_false():
followup_result = _followup_result(summary)

assert workflow_result.verdict == followup_result.verdict == "CONCERNS"
assert workflow_result.needs_human is False
assert followup_result.needs_human is False
assert workflow_result.needs_human is True
assert followup_result.needs_human is True


def test_split_verdict_low_confidence_needs_human_true():
def test_split_verdict_low_confidence_needs_human_false():
"""Below threshold, low-confidence concerns should NOT trigger needs_human."""
summary = _build_summary(
"| openai | gpt-5.2 | PASS | 0.92 | Looks good. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.40 | Missing edge case. |",
)

workflow_result = _workflow_result(summary)
followup_result = _followup_result(summary)

assert workflow_result.verdict == followup_result.verdict == "CONCERNS"
assert workflow_result.needs_human is True
assert followup_result.needs_human is True
assert workflow_result.needs_human is False
assert followup_result.needs_human is False


def test_split_verdict_row_order_invariance():
Expand Down
Loading