diff --git a/.github/scripts/error_classifier.js b/.github/scripts/error_classifier.js index 18f89c2c3..85709c43f 100644 --- a/.github/scripts/error_classifier.js +++ b/.github/scripts/error_classifier.js @@ -189,8 +189,10 @@ function classifyByMessage(message) { function classifyError(error) { const message = normaliseMessage(error); const preview = message ? message.slice(0, 50) : 'unknown'; - // eslint-disable-next-line no-console - console.log(`[error_classifier] Classifying error: ${preview}`); + if (process.env.RUNNER_DEBUG === '1') { + // eslint-disable-next-line no-console + console.log(`[error_classifier] Classifying error: ${preview}`); + } const status = getStatusCode(error); const statusCategory = status ? classifyByStatus(status, message) : null; diff --git a/.github/scripts/keepalive_loop.js b/.github/scripts/keepalive_loop.js index 26938a680..96cdf80e4 100644 --- a/.github/scripts/keepalive_loop.js +++ b/.github/scripts/keepalive_loop.js @@ -374,7 +374,7 @@ async function fetchRepoVariables({ github, context, core, names = [] }) { } } catch (error) { if (core) { - core.info(`Failed to fetch repository variables for timeout config: ${error.message}`); + core.debug(`Repository variables not accessible for timeout config (using defaults): ${error.message}`); } } diff --git a/.github/workflows/agents-72-codex-belt-worker.yml b/.github/workflows/agents-72-codex-belt-worker.yml index 28dc732b5..ec87d6d2f 100644 --- a/.github/workflows/agents-72-codex-belt-worker.yml +++ b/.github/workflows/agents-72-codex-belt-worker.yml @@ -591,6 +591,13 @@ jobs: fetch-depth: 1 path: .belt-tools + - name: Re-install API client after branch checkout + if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} + uses: ./.belt-tools/.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ env.GH_BELT_TOKEN || github.token }} + - name: Validate ledger base branch if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} env: @@ -764,7 +771,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ["ISSUE"].strip() branch = os.environ["BRANCH"].strip() @@ -1272,7 +1279,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ.get('ISSUE', '').strip() branch = os.environ.get('BRANCH', '').strip() diff --git a/.github/workflows/agents-keepalive-loop-reporter.yml b/.github/workflows/agents-keepalive-loop-reporter.yml index feeae1868..f9c8890d0 100644 --- a/.github/workflows/agents-keepalive-loop-reporter.yml +++ b/.github/workflows/agents-keepalive-loop-reporter.yml @@ -40,6 +40,7 @@ jobs: token: ${{ steps.app_token.outputs.token || github.token }} sparse-checkout: | .github/scripts + .github/actions sparse-checkout-cone-mode: false fetch-depth: 1 diff --git a/.github/workflows/health-75-api-rate-diagnostic.yml b/.github/workflows/health-75-api-rate-diagnostic.yml index 0e28dae86..5dd070666 100644 --- a/.github/workflows/health-75-api-rate-diagnostic.yml +++ b/.github/workflows/health-75-api-rate-diagnostic.yml @@ -87,6 +87,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Check GITHUB_TOKEN rate limits id: github_token @@ -1457,6 +1460,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Check for critical utilization id: check run: | @@ -1570,6 +1576,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Create issue on repeated failures env: GH_TOKEN: ${{ github.token }} @@ -1692,6 +1701,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Parse date range id: dates run: | diff --git a/.github/workflows/reusable-codex-run.yml b/.github/workflows/reusable-codex-run.yml index 3a9e8c18c..e2221e6b6 100644 --- a/.github/workflows/reusable-codex-run.yml +++ b/.github/workflows/reusable-codex-run.yml @@ -1018,16 +1018,16 @@ jobs: # Extract key fields for downstream use if [ -f "$ANALYSIS_FILE" ]; then - COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])))") - PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'))") - MODEL=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('model', 'unknown'))") - CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0))") + COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])), end='')") + PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'), end='')") + MODEL=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('model', 'unknown'), end='')") + CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0), end='')") # Quality metrics for keepalive integration - RAW_CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('raw_confidence', d.get('confidence', 0)))") - EFFORT_SCORE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('effort_score', 0))") - DATA_QUALITY=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('data_quality', 'unknown'))") - ANALYSIS_LENGTH=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('analysis_text_length', 0))") - QUALITY_WARNINGS=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('quality_warnings', [])))") + RAW_CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('raw_confidence', d.get('confidence', 0)), end='')") + EFFORT_SCORE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('effort_score', 0), end='')") + DATA_QUALITY=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('data_quality', 'unknown'), end='')") + ANALYSIS_LENGTH=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('analysis_text_length', 0), end='')") + QUALITY_WARNINGS=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('quality_warnings', [])), end='')") { echo "completed-tasks=$COMPLETED" echo "provider=$PROVIDER" @@ -1185,7 +1185,7 @@ jobs: # Check if there are any staged changes remaining after reset if git diff --cached --quiet; then - echo "::warning::No non-artifact changes to commit after filtering. All changes were workflow artifacts." + echo "::notice::No non-artifact changes to commit after filtering. All changes were workflow artifacts." # Even if nothing to commit, check for unpushed commits that need pushing git fetch "${REMOTE_URL}" "${TARGET_BRANCH}" 2>/dev/null || true UNPUSHED=$(git rev-list FETCH_HEAD..HEAD --count 2>/dev/null || echo "0") diff --git a/scripts/langchain/capability_check.py b/scripts/langchain/capability_check.py index eb904c753..0ff398491 100755 --- a/scripts/langchain/capability_check.py +++ b/scripts/langchain/capability_check.py @@ -161,7 +161,8 @@ def _is_multi_action_task(task: str) -> bool: def _requires_admin_access(task: str) -> bool: patterns = [ r"\bgithub\s+secrets?\b", - r"\bsecrets?\b", + r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b", + r"\bsecrets?\s+(?:management|configuration|rotation)\b", r"\brepository\s+settings\b", r"\brepo\s+settings\b", r"\bbranch\s+protection\b", diff --git a/scripts/langchain/verdict_policy.py b/scripts/langchain/verdict_policy.py index 8fdcd9925..809bdcadb 100644 --- a/scripts/langchain/verdict_policy.py +++ b/scripts/langchain/verdict_policy.py @@ -16,7 +16,7 @@ "fail": 3, } -CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.85 +CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.50 @dataclass(frozen=True) diff --git a/templates/consumer-repo/.github/scripts/error_classifier.js b/templates/consumer-repo/.github/scripts/error_classifier.js index 18f89c2c3..85709c43f 100644 --- a/templates/consumer-repo/.github/scripts/error_classifier.js +++ b/templates/consumer-repo/.github/scripts/error_classifier.js @@ -189,8 +189,10 @@ function classifyByMessage(message) { function classifyError(error) { const message = normaliseMessage(error); const preview = message ? message.slice(0, 50) : 'unknown'; - // eslint-disable-next-line no-console - console.log(`[error_classifier] Classifying error: ${preview}`); + if (process.env.RUNNER_DEBUG === '1') { + // eslint-disable-next-line no-console + console.log(`[error_classifier] Classifying error: ${preview}`); + } const status = getStatusCode(error); const statusCategory = status ? classifyByStatus(status, message) : null; diff --git a/templates/consumer-repo/.github/scripts/keepalive_loop.js b/templates/consumer-repo/.github/scripts/keepalive_loop.js index 26938a680..96cdf80e4 100644 --- a/templates/consumer-repo/.github/scripts/keepalive_loop.js +++ b/templates/consumer-repo/.github/scripts/keepalive_loop.js @@ -374,7 +374,7 @@ async function fetchRepoVariables({ github, context, core, names = [] }) { } } catch (error) { if (core) { - core.info(`Failed to fetch repository variables for timeout config: ${error.message}`); + core.debug(`Repository variables not accessible for timeout config (using defaults): ${error.message}`); } } diff --git a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml index 1c4dcd0a3..14e6f2bfb 100644 --- a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml +++ b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml @@ -591,6 +591,13 @@ jobs: fetch-depth: 1 path: .belt-tools + - name: Re-install API client after branch checkout + if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} + uses: ./.belt-tools/.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ env.GH_BELT_TOKEN || github.token }} + - name: Validate ledger base branch if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} env: @@ -764,7 +771,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ["ISSUE"].strip() branch = os.environ["BRANCH"].strip() @@ -1272,7 +1279,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ.get('ISSUE', '').strip() branch = os.environ.get('BRANCH', '').strip() diff --git a/templates/consumer-repo/scripts/langchain/capability_check.py b/templates/consumer-repo/scripts/langchain/capability_check.py index 0632d893d..b53886821 100755 --- a/templates/consumer-repo/scripts/langchain/capability_check.py +++ b/templates/consumer-repo/scripts/langchain/capability_check.py @@ -160,7 +160,8 @@ def _is_multi_action_task(task: str) -> bool: def _requires_admin_access(task: str) -> bool: patterns = [ r"\bgithub\s+secrets?\b", - r"\bsecrets?\b", + r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b", + r"\bsecrets?\s+(?:management|configuration|rotation)\b", r"\brepository\s+settings\b", r"\brepo\s+settings\b", r"\bbranch\s+protection\b", diff --git a/tests/scripts/test_capability_check.py b/tests/scripts/test_capability_check.py index 28f5233c6..e35788188 100644 --- a/tests/scripts/test_capability_check.py +++ b/tests/scripts/test_capability_check.py @@ -419,6 +419,21 @@ def test_fallback_flags_admin_requirement(self) -> None: assert result.blocked_tasks[0]["task"] == "Update GitHub secrets" assert "admin" in result.blocked_tasks[0]["reason"].lower() + def test_fallback_does_not_flag_negated_secrets_mention(self) -> None: + """Regression: 'no secrets' in constraint text must not trigger admin block.""" + task = "safety rules (no secrets, no workflow edits, no file writes)" + with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): + result = classify_capabilities([task], "") + assert result.recommendation != "BLOCKED" + assert all(item["task"] != task for item in result.blocked_tasks) + + def test_fallback_flags_manage_secrets(self) -> None: + """Specific secrets-management verbs should still be blocked.""" + with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): + result = classify_capabilities(["manage secrets for deployment"], "") + assert result.recommendation == "BLOCKED" + assert "admin" in result.blocked_tasks[0]["reason"].lower() + def test_fallback_suggests_decomposition(self) -> None: with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): result = classify_capabilities(["Refactor auth + add tests + update docs"], "") diff --git a/tests/test_followup_issue_generator.py b/tests/test_followup_issue_generator.py index 572099608..7570cd103 100755 --- a/tests/test_followup_issue_generator.py +++ b/tests/test_followup_issue_generator.py @@ -533,7 +533,7 @@ def test_split_low_confidence_requires_needs_human(self): verification_data = VerificationData( provider_verdicts={ "openai": {"verdict": "PASS", "confidence": 90}, - "anthropic": {"verdict": "CONCERNS", "confidence": 70}, + "anthropic": {"verdict": "CONCERNS", "confidence": 49}, }, concerns=["Missing test coverage"], ) diff --git a/tests/test_verdict_extract.py b/tests/test_verdict_extract.py index aa3a207cc..4e7c70c7f 100644 --- a/tests/test_verdict_extract.py +++ b/tests/test_verdict_extract.py @@ -25,7 +25,7 @@ def _parse_github_output(raw: str) -> dict[str, str]: def test_verdict_extract_emits_structured_github_outputs(tmp_path): summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.84 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |", ) result = verdict_extract.build_verdict_result(summary, policy="worst") output_path = tmp_path / "github_output.txt" diff --git a/tests/test_verdict_policy.py b/tests/test_verdict_policy.py index 9b5075406..7e134a5d5 100755 --- a/tests/test_verdict_policy.py +++ b/tests/test_verdict_policy.py @@ -61,3 +61,37 @@ def test_needs_human_threshold_boundary(): result = evaluate_verdict_policy(verdicts, policy="worst") assert result.needs_human is False + + +def test_needs_human_true_below_threshold(): + """Concerns below the threshold should trigger needs_human.""" + verdicts = [ + ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92), + ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.40), + ] + + result = evaluate_verdict_policy(verdicts, policy="worst") + + assert result.needs_human is True + assert result.split_verdict is True + assert "low-confidence" in result.needs_human_reason + + +def test_moderate_confidence_concerns_do_not_block(): + """Regression: 72% concerns in a split verdict should not trigger needs_human. + + Previously CONCERNS_NEEDS_HUMAN_THRESHOLD was 0.85, which caused any + split verdict with <85% concerns to be flagged. The lowered threshold + (0.50) allows moderate-confidence concerns to proceed with automatic + follow-up creation. + """ + verdicts = [ + ProviderVerdict("openai", "gpt-5.2", "CONCERNS", 72), + ProviderVerdict("anthropic", "claude-sonnet-4-5", "PASS", 85), + ] + + result = evaluate_verdict_policy(verdicts, policy="worst") + + assert result.split_verdict is True + assert result.needs_human is False + assert result.verdict == "CONCERNS" diff --git a/tests/test_verdict_policy_integration.py b/tests/test_verdict_policy_integration.py index 67afa4ee5..c5b29fde3 100755 --- a/tests/test_verdict_policy_integration.py +++ b/tests/test_verdict_policy_integration.py @@ -40,7 +40,7 @@ def test_split_verdict_confidence_boundary_needs_human_false(): def test_split_verdict_low_confidence_needs_human_true(): summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.84 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |", ) workflow_result = _workflow_result(summary)