From b8121fffd31e6415740d3f52d19e12defd9d28f9 Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 00:59:57 +0000 Subject: [PATCH 01/11] fix: resolve 8 issues found in Codex run log audit Essential fixes: - Reporter sparse-checkout: add .github/actions to checkout so setup-api-client action is available (was failing 100% on Workflows repo) - Belt Worker: re-install API client after branch checkout wipes node_modules (was causing @octokit/rest import failures and degraded token rotation) High-value fixes: - LLM analysis outputs: use print(..., end='') to strip trailing newlines from python extraction (confidence values had '\n' suffix e.g. '0.63\n') - Repo variables fetch: downgrade from core.info to core.debug since the token permission limitation is known and the fallback to defaults works correctly Medium fixes: - Health 75 API Rate Diagnostic: pass secrets to 4 setup-api-client calls that were missing the input, causing 'No tokens were exported' warnings - datetime.utcnow(): replace deprecated calls with timezone-aware alternative in both Belt Worker ledger functions Low-salience fixes: - error_classifier: gate entry log behind RUNNER_DEBUG to reduce log noise - Non-artifact commit warning: downgrade from warning to notice since it is expected behavior when Codex produces only workflow artifacts --- .github/scripts/error_classifier.js | 6 ++++-- .github/scripts/keepalive_loop.js | 2 +- .../workflows/agents-72-codex-belt-worker.yml | 11 ++++++++-- .../agents-keepalive-loop-reporter.yml | 1 + .../health-75-api-rate-diagnostic.yml | 12 +++++++++++ .github/workflows/reusable-codex-run.yml | 20 +++++++++---------- .../.github/scripts/error_classifier.js | 6 ++++-- .../.github/scripts/keepalive_loop.js | 2 +- .../workflows/agents-72-codex-belt-worker.yml | 11 ++++++++-- 9 files changed, 51 insertions(+), 20 deletions(-) diff --git a/.github/scripts/error_classifier.js b/.github/scripts/error_classifier.js index 18f89c2c3..85709c43f 100644 --- a/.github/scripts/error_classifier.js +++ b/.github/scripts/error_classifier.js @@ -189,8 +189,10 @@ function classifyByMessage(message) { function classifyError(error) { const message = normaliseMessage(error); const preview = message ? message.slice(0, 50) : 'unknown'; - // eslint-disable-next-line no-console - console.log(`[error_classifier] Classifying error: ${preview}`); + if (process.env.RUNNER_DEBUG === '1') { + // eslint-disable-next-line no-console + console.log(`[error_classifier] Classifying error: ${preview}`); + } const status = getStatusCode(error); const statusCategory = status ? classifyByStatus(status, message) : null; diff --git a/.github/scripts/keepalive_loop.js b/.github/scripts/keepalive_loop.js index 26938a680..96cdf80e4 100644 --- a/.github/scripts/keepalive_loop.js +++ b/.github/scripts/keepalive_loop.js @@ -374,7 +374,7 @@ async function fetchRepoVariables({ github, context, core, names = [] }) { } } catch (error) { if (core) { - core.info(`Failed to fetch repository variables for timeout config: ${error.message}`); + core.debug(`Repository variables not accessible for timeout config (using defaults): ${error.message}`); } } diff --git a/.github/workflows/agents-72-codex-belt-worker.yml b/.github/workflows/agents-72-codex-belt-worker.yml index 28dc732b5..95530873d 100644 --- a/.github/workflows/agents-72-codex-belt-worker.yml +++ b/.github/workflows/agents-72-codex-belt-worker.yml @@ -591,6 +591,13 @@ jobs: fetch-depth: 1 path: .belt-tools + - name: Re-install API client after branch checkout + if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} + uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} + - name: Validate ledger base branch if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} env: @@ -764,7 +771,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ["ISSUE"].strip() branch = os.environ["BRANCH"].strip() @@ -1272,7 +1279,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ.get('ISSUE', '').strip() branch = os.environ.get('BRANCH', '').strip() diff --git a/.github/workflows/agents-keepalive-loop-reporter.yml b/.github/workflows/agents-keepalive-loop-reporter.yml index feeae1868..f9c8890d0 100644 --- a/.github/workflows/agents-keepalive-loop-reporter.yml +++ b/.github/workflows/agents-keepalive-loop-reporter.yml @@ -40,6 +40,7 @@ jobs: token: ${{ steps.app_token.outputs.token || github.token }} sparse-checkout: | .github/scripts + .github/actions sparse-checkout-cone-mode: false fetch-depth: 1 diff --git a/.github/workflows/health-75-api-rate-diagnostic.yml b/.github/workflows/health-75-api-rate-diagnostic.yml index 0e28dae86..5dd070666 100644 --- a/.github/workflows/health-75-api-rate-diagnostic.yml +++ b/.github/workflows/health-75-api-rate-diagnostic.yml @@ -87,6 +87,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Check GITHUB_TOKEN rate limits id: github_token @@ -1457,6 +1460,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Check for critical utilization id: check run: | @@ -1570,6 +1576,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Create issue on repeated failures env: GH_TOKEN: ${{ github.token }} @@ -1692,6 +1701,9 @@ jobs: uses: actions/checkout@v6 - name: Export load-balancer tokens uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} - name: Parse date range id: dates run: | diff --git a/.github/workflows/reusable-codex-run.yml b/.github/workflows/reusable-codex-run.yml index 3a9e8c18c..e2221e6b6 100644 --- a/.github/workflows/reusable-codex-run.yml +++ b/.github/workflows/reusable-codex-run.yml @@ -1018,16 +1018,16 @@ jobs: # Extract key fields for downstream use if [ -f "$ANALYSIS_FILE" ]; then - COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])))") - PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'))") - MODEL=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('model', 'unknown'))") - CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0))") + COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])), end='')") + PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'), end='')") + MODEL=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('model', 'unknown'), end='')") + CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0), end='')") # Quality metrics for keepalive integration - RAW_CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('raw_confidence', d.get('confidence', 0)))") - EFFORT_SCORE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('effort_score', 0))") - DATA_QUALITY=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('data_quality', 'unknown'))") - ANALYSIS_LENGTH=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('analysis_text_length', 0))") - QUALITY_WARNINGS=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('quality_warnings', [])))") + RAW_CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('raw_confidence', d.get('confidence', 0)), end='')") + EFFORT_SCORE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('effort_score', 0), end='')") + DATA_QUALITY=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('data_quality', 'unknown'), end='')") + ANALYSIS_LENGTH=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('analysis_text_length', 0), end='')") + QUALITY_WARNINGS=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('quality_warnings', [])), end='')") { echo "completed-tasks=$COMPLETED" echo "provider=$PROVIDER" @@ -1185,7 +1185,7 @@ jobs: # Check if there are any staged changes remaining after reset if git diff --cached --quiet; then - echo "::warning::No non-artifact changes to commit after filtering. All changes were workflow artifacts." + echo "::notice::No non-artifact changes to commit after filtering. All changes were workflow artifacts." # Even if nothing to commit, check for unpushed commits that need pushing git fetch "${REMOTE_URL}" "${TARGET_BRANCH}" 2>/dev/null || true UNPUSHED=$(git rev-list FETCH_HEAD..HEAD --count 2>/dev/null || echo "0") diff --git a/templates/consumer-repo/.github/scripts/error_classifier.js b/templates/consumer-repo/.github/scripts/error_classifier.js index 18f89c2c3..85709c43f 100644 --- a/templates/consumer-repo/.github/scripts/error_classifier.js +++ b/templates/consumer-repo/.github/scripts/error_classifier.js @@ -189,8 +189,10 @@ function classifyByMessage(message) { function classifyError(error) { const message = normaliseMessage(error); const preview = message ? message.slice(0, 50) : 'unknown'; - // eslint-disable-next-line no-console - console.log(`[error_classifier] Classifying error: ${preview}`); + if (process.env.RUNNER_DEBUG === '1') { + // eslint-disable-next-line no-console + console.log(`[error_classifier] Classifying error: ${preview}`); + } const status = getStatusCode(error); const statusCategory = status ? classifyByStatus(status, message) : null; diff --git a/templates/consumer-repo/.github/scripts/keepalive_loop.js b/templates/consumer-repo/.github/scripts/keepalive_loop.js index 26938a680..96cdf80e4 100644 --- a/templates/consumer-repo/.github/scripts/keepalive_loop.js +++ b/templates/consumer-repo/.github/scripts/keepalive_loop.js @@ -374,7 +374,7 @@ async function fetchRepoVariables({ github, context, core, names = [] }) { } } catch (error) { if (core) { - core.info(`Failed to fetch repository variables for timeout config: ${error.message}`); + core.debug(`Repository variables not accessible for timeout config (using defaults): ${error.message}`); } } diff --git a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml index 1c4dcd0a3..87c148c82 100644 --- a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml +++ b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml @@ -591,6 +591,13 @@ jobs: fetch-depth: 1 path: .belt-tools + - name: Re-install API client after branch checkout + if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} + uses: ./.github/actions/setup-api-client + with: + secrets: ${{ toJSON(secrets) }} + github_token: ${{ github.token }} + - name: Validate ledger base branch if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} env: @@ -764,7 +771,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ["ISSUE"].strip() branch = os.environ["BRANCH"].strip() @@ -1272,7 +1279,7 @@ jobs: return super().increase_indent(flow, False) def iso_now() -> str: - return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') issue = os.environ.get('ISSUE', '').strip() branch = os.environ.get('BRANCH', '').strip() From 926a8dcf8a1af13304f5e8cf4ab1f20a5802401a Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 01:26:13 +0000 Subject: [PATCH 02/11] fix: address review comments on belt worker re-install step 1. Use .belt-tools action path instead of ./ for setup-api-client after branch checkout, so the action runs from trusted Workflows code rather than the untrusted issue branch (security fix). 2. Pass GH_BELT_TOKEN || github.token as github_token input to preserve the belt token selection instead of overriding GITHUB_TOKEN/GH_TOKEN with the default workflow token. --- .github/workflows/agents-72-codex-belt-worker.yml | 4 ++-- .../.github/workflows/agents-72-codex-belt-worker.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/agents-72-codex-belt-worker.yml b/.github/workflows/agents-72-codex-belt-worker.yml index 95530873d..ec87d6d2f 100644 --- a/.github/workflows/agents-72-codex-belt-worker.yml +++ b/.github/workflows/agents-72-codex-belt-worker.yml @@ -593,10 +593,10 @@ jobs: - name: Re-install API client after branch checkout if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} - uses: ./.github/actions/setup-api-client + uses: ./.belt-tools/.github/actions/setup-api-client with: secrets: ${{ toJSON(secrets) }} - github_token: ${{ github.token }} + github_token: ${{ env.GH_BELT_TOKEN || github.token }} - name: Validate ledger base branch if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} diff --git a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml index 87c148c82..14e6f2bfb 100644 --- a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml +++ b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml @@ -593,10 +593,10 @@ jobs: - name: Re-install API client after branch checkout if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} - uses: ./.github/actions/setup-api-client + uses: ./.belt-tools/.github/actions/setup-api-client with: secrets: ${{ toJSON(secrets) }} - github_token: ${{ github.token }} + github_token: ${{ env.GH_BELT_TOKEN || github.token }} - name: Validate ledger base branch if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }} From 24d84ec89f9038bfa511f4e2af0ea1f3d105e3ec Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 02:41:46 +0000 Subject: [PATCH 03/11] fix: capability_check false-positive on 'secrets' + lower verdict threshold Two independent fixes for broken automation flows: 1. capability_check.py: The bare \bsecrets?\b regex matched negative mentions like 'no secrets' in issue constraint text, causing _requires_admin_access() to return true and the fallback classifier to BLOCK tasks that merely *describe* a no-secrets constraint. Replace with specific verb+secrets patterns (manage/configure/set/ create/update/delete/add/modify/rotate secrets). Root cause of PAEM #1403 false-positive BLOCKED. 2. verdict_policy.py: CONCERNS_NEEDS_HUMAN_THRESHOLD lowered from 0.85 to 0.50. The old threshold meant any split verdict (PASS + CONCERNS) with <85% confidence on the concerns side triggered needs_human, blocking automatic follow-up issue creation. A 72% confidence concerns verdict (TMP #4894) is well above chance and should produce a follow-up rather than require manual triage. Both template and main copies updated; new regression tests added. --- scripts/langchain/capability_check.py | 3 +- scripts/langchain/verdict_policy.py | 2 +- .../scripts/langchain/capability_check.py | 3 +- tests/scripts/test_capability_check.py | 15 ++++++++ tests/test_verdict_policy.py | 34 +++++++++++++++++++ 5 files changed, 54 insertions(+), 3 deletions(-) diff --git a/scripts/langchain/capability_check.py b/scripts/langchain/capability_check.py index eb904c753..0ff398491 100755 --- a/scripts/langchain/capability_check.py +++ b/scripts/langchain/capability_check.py @@ -161,7 +161,8 @@ def _is_multi_action_task(task: str) -> bool: def _requires_admin_access(task: str) -> bool: patterns = [ r"\bgithub\s+secrets?\b", - r"\bsecrets?\b", + r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b", + r"\bsecrets?\s+(?:management|configuration|rotation)\b", r"\brepository\s+settings\b", r"\brepo\s+settings\b", r"\bbranch\s+protection\b", diff --git a/scripts/langchain/verdict_policy.py b/scripts/langchain/verdict_policy.py index 8fdcd9925..809bdcadb 100644 --- a/scripts/langchain/verdict_policy.py +++ b/scripts/langchain/verdict_policy.py @@ -16,7 +16,7 @@ "fail": 3, } -CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.85 +CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.50 @dataclass(frozen=True) diff --git a/templates/consumer-repo/scripts/langchain/capability_check.py b/templates/consumer-repo/scripts/langchain/capability_check.py index 0632d893d..b53886821 100755 --- a/templates/consumer-repo/scripts/langchain/capability_check.py +++ b/templates/consumer-repo/scripts/langchain/capability_check.py @@ -160,7 +160,8 @@ def _is_multi_action_task(task: str) -> bool: def _requires_admin_access(task: str) -> bool: patterns = [ r"\bgithub\s+secrets?\b", - r"\bsecrets?\b", + r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b", + r"\bsecrets?\s+(?:management|configuration|rotation)\b", r"\brepository\s+settings\b", r"\brepo\s+settings\b", r"\bbranch\s+protection\b", diff --git a/tests/scripts/test_capability_check.py b/tests/scripts/test_capability_check.py index 28f5233c6..e35788188 100644 --- a/tests/scripts/test_capability_check.py +++ b/tests/scripts/test_capability_check.py @@ -419,6 +419,21 @@ def test_fallback_flags_admin_requirement(self) -> None: assert result.blocked_tasks[0]["task"] == "Update GitHub secrets" assert "admin" in result.blocked_tasks[0]["reason"].lower() + def test_fallback_does_not_flag_negated_secrets_mention(self) -> None: + """Regression: 'no secrets' in constraint text must not trigger admin block.""" + task = "safety rules (no secrets, no workflow edits, no file writes)" + with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): + result = classify_capabilities([task], "") + assert result.recommendation != "BLOCKED" + assert all(item["task"] != task for item in result.blocked_tasks) + + def test_fallback_flags_manage_secrets(self) -> None: + """Specific secrets-management verbs should still be blocked.""" + with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): + result = classify_capabilities(["manage secrets for deployment"], "") + assert result.recommendation == "BLOCKED" + assert "admin" in result.blocked_tasks[0]["reason"].lower() + def test_fallback_suggests_decomposition(self) -> None: with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None): result = classify_capabilities(["Refactor auth + add tests + update docs"], "") diff --git a/tests/test_verdict_policy.py b/tests/test_verdict_policy.py index 9b5075406..7e134a5d5 100755 --- a/tests/test_verdict_policy.py +++ b/tests/test_verdict_policy.py @@ -61,3 +61,37 @@ def test_needs_human_threshold_boundary(): result = evaluate_verdict_policy(verdicts, policy="worst") assert result.needs_human is False + + +def test_needs_human_true_below_threshold(): + """Concerns below the threshold should trigger needs_human.""" + verdicts = [ + ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92), + ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.40), + ] + + result = evaluate_verdict_policy(verdicts, policy="worst") + + assert result.needs_human is True + assert result.split_verdict is True + assert "low-confidence" in result.needs_human_reason + + +def test_moderate_confidence_concerns_do_not_block(): + """Regression: 72% concerns in a split verdict should not trigger needs_human. + + Previously CONCERNS_NEEDS_HUMAN_THRESHOLD was 0.85, which caused any + split verdict with <85% concerns to be flagged. The lowered threshold + (0.50) allows moderate-confidence concerns to proceed with automatic + follow-up creation. + """ + verdicts = [ + ProviderVerdict("openai", "gpt-5.2", "CONCERNS", 72), + ProviderVerdict("anthropic", "claude-sonnet-4-5", "PASS", 85), + ] + + result = evaluate_verdict_policy(verdicts, policy="worst") + + assert result.split_verdict is True + assert result.needs_human is False + assert result.verdict == "CONCERNS" From 00365f43e3f010729548560f4b6b196afa48b4ef Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 12:48:51 +0000 Subject: [PATCH 04/11] fix: prevent Codex bootstrap from overwriting vendored node_modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three-layer fix for the systemic issue where setup-api-client's npm install overwrites vendored minimatch package.json, and git add -A captures the modification into bootstrap/autofix commits. Layer 1 (source fix): setup-api-client/action.yml - Snapshot vendored package.json files before npm install - Restore them after npm install completes - Applied to both .github/actions/ and templates/consumer-repo/ Layer 2 (targeted staging): reusable-agents-issue-bridge.yml - Replace 'git add -A' with targeted 'git add agents/${AGENT}-${ISSUE}.md' - Only the bootstrap file gets staged, not npm side-effects Layer 3 (safety net): reusable-18-autofix.yml - Add 'git reset HEAD -- .github/scripts/node_modules ...' after git add -A - Matches existing pattern in reusable-codex-run.yml line 1184 - Applied to both push-commit and patch-commit paths Also fixes test assertions that referenced the old CONCERNS_NEEDS_HUMAN_THRESHOLD (was 0.85, now 0.50) — confidence values in tests updated accordingly. Fixes: Copilot review finding on PAEM PR #1417 (minimatch vendoring cycle) --- .github/actions/setup-api-client/action.yml | 30 +++++++++++++++++++ .github/workflows/reusable-18-autofix.yml | 4 +++ .../reusable-agents-issue-bridge.yml | 4 ++- .../actions/setup-api-client/action.yml | 30 +++++++++++++++++++ tests/test_followup_issue_generator.py | 2 +- tests/test_verdict_extract.py | 2 +- tests/test_verdict_policy_integration.py | 2 +- 7 files changed, 70 insertions(+), 4 deletions(-) diff --git a/.github/actions/setup-api-client/action.yml b/.github/actions/setup-api-client/action.yml index 3343ebfb5..109cc33a7 100644 --- a/.github/actions/setup-api-client/action.yml +++ b/.github/actions/setup-api-client/action.yml @@ -102,6 +102,22 @@ runs: if [ -d "node_modules/@octokit/rest" ]; then echo "✅ @octokit/rest already installed" else + # Snapshot vendored package metadata before npm install. + # npm may overwrite transitive deps (e.g. minimatch) that are + # committed as vendored packages with intentional version pins. + VENDORED_SNAPSHOT="" + if [ -f "node_modules/minimatch/package.json" ]; then + VENDORED_SNAPSHOT=$(mktemp -d) + for pkg_dir in node_modules/*/; do + if [ -f "${pkg_dir}package.json" ]; then + pkg_name=$(basename "$pkg_dir") + mkdir -p "${VENDORED_SNAPSHOT}/${pkg_name}" + cp "${pkg_dir}package.json" "${VENDORED_SNAPSHOT}/${pkg_name}/package.json" + fi + done + echo "📸 Snapshotted vendored package metadata" + fi + # Install with pinned versions for consistency # Capture stderr for debugging if the command fails npm_output=$(mktemp) @@ -122,6 +138,20 @@ runs: @octokit/plugin-paginate-rest@9.1.5 \ @octokit/auth-app@6.0.3 fi + + # Restore vendored package metadata that npm may have overwritten + if [ -n "${VENDORED_SNAPSHOT:-}" ] && [ -d "${VENDORED_SNAPSHOT}" ]; then + for pkg_backup in "${VENDORED_SNAPSHOT}"/*/; do + pkg_name=$(basename "$pkg_backup") + if [ -f "node_modules/${pkg_name}/package.json" ] && \ + [ -f "${pkg_backup}package.json" ]; then + cp "${pkg_backup}package.json" "node_modules/${pkg_name}/package.json" + fi + done + rm -rf "${VENDORED_SNAPSHOT}" + echo "📸 Restored vendored package metadata" + fi + echo "✅ @octokit dependencies installed" fi diff --git a/.github/workflows/reusable-18-autofix.yml b/.github/workflows/reusable-18-autofix.yml index d4d64636a..f24a6f975 100644 --- a/.github/workflows/reusable-18-autofix.yml +++ b/.github/workflows/reusable-18-autofix.yml @@ -780,6 +780,8 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add -A + # Unstage vendored node_modules that may have been modified by npm install + git reset HEAD -- .github/scripts/node_modules node_modules .workflows-lib/.github/scripts/node_modules 2>/dev/null || true git commit -m "${AUTOFIX_COMMIT_PREFIX} formatting/lint" echo "AUTOFIX_COMMIT_SHA=$(git rev-parse HEAD)" >> "$GITHUB_ENV" @@ -851,6 +853,8 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add -A + # Unstage vendored node_modules that may have been modified by npm install + git reset HEAD -- .github/scripts/node_modules node_modules .workflows-lib/.github/scripts/node_modules 2>/dev/null || true git commit -m "${AUTOFIX_COMMIT_PREFIX} formatting/lint (patch)" || true git format-patch -1 --stdout > autofix.patch diff --git a/.github/workflows/reusable-agents-issue-bridge.yml b/.github/workflows/reusable-agents-issue-bridge.yml index d193a16ce..16dea6053 100644 --- a/.github/workflows/reusable-agents-issue-bridge.yml +++ b/.github/workflows/reusable-agents-issue-bridge.yml @@ -580,7 +580,9 @@ jobs: git checkout -B "$HEAD_BRANCH" "origin/${BASE_BRANCH}" mkdir -p agents printf "\n" "$AGENT" "$ISSUE_NUM" > "agents/${AGENT}-${ISSUE_NUM}.md" - git add -A || true + # Stage only the intended bootstrap file — 'git add -A' would capture + # vendored node_modules changes made by setup-api-client npm install. + git add "agents/${AGENT}-${ISSUE_NUM}.md" || true if ! git diff --cached --quiet; then git commit -m "chore(${AGENT}): bootstrap PR for issue #${ISSUE_NUM}" else diff --git a/templates/consumer-repo/.github/actions/setup-api-client/action.yml b/templates/consumer-repo/.github/actions/setup-api-client/action.yml index 3343ebfb5..109cc33a7 100644 --- a/templates/consumer-repo/.github/actions/setup-api-client/action.yml +++ b/templates/consumer-repo/.github/actions/setup-api-client/action.yml @@ -102,6 +102,22 @@ runs: if [ -d "node_modules/@octokit/rest" ]; then echo "✅ @octokit/rest already installed" else + # Snapshot vendored package metadata before npm install. + # npm may overwrite transitive deps (e.g. minimatch) that are + # committed as vendored packages with intentional version pins. + VENDORED_SNAPSHOT="" + if [ -f "node_modules/minimatch/package.json" ]; then + VENDORED_SNAPSHOT=$(mktemp -d) + for pkg_dir in node_modules/*/; do + if [ -f "${pkg_dir}package.json" ]; then + pkg_name=$(basename "$pkg_dir") + mkdir -p "${VENDORED_SNAPSHOT}/${pkg_name}" + cp "${pkg_dir}package.json" "${VENDORED_SNAPSHOT}/${pkg_name}/package.json" + fi + done + echo "📸 Snapshotted vendored package metadata" + fi + # Install with pinned versions for consistency # Capture stderr for debugging if the command fails npm_output=$(mktemp) @@ -122,6 +138,20 @@ runs: @octokit/plugin-paginate-rest@9.1.5 \ @octokit/auth-app@6.0.3 fi + + # Restore vendored package metadata that npm may have overwritten + if [ -n "${VENDORED_SNAPSHOT:-}" ] && [ -d "${VENDORED_SNAPSHOT}" ]; then + for pkg_backup in "${VENDORED_SNAPSHOT}"/*/; do + pkg_name=$(basename "$pkg_backup") + if [ -f "node_modules/${pkg_name}/package.json" ] && \ + [ -f "${pkg_backup}package.json" ]; then + cp "${pkg_backup}package.json" "node_modules/${pkg_name}/package.json" + fi + done + rm -rf "${VENDORED_SNAPSHOT}" + echo "📸 Restored vendored package metadata" + fi + echo "✅ @octokit dependencies installed" fi diff --git a/tests/test_followup_issue_generator.py b/tests/test_followup_issue_generator.py index 572099608..aeb9d1d38 100755 --- a/tests/test_followup_issue_generator.py +++ b/tests/test_followup_issue_generator.py @@ -533,7 +533,7 @@ def test_split_low_confidence_requires_needs_human(self): verification_data = VerificationData( provider_verdicts={ "openai": {"verdict": "PASS", "confidence": 90}, - "anthropic": {"verdict": "CONCERNS", "confidence": 70}, + "anthropic": {"verdict": "CONCERNS", "confidence": 40}, }, concerns=["Missing test coverage"], ) diff --git a/tests/test_verdict_extract.py b/tests/test_verdict_extract.py index aa3a207cc..938df898a 100644 --- a/tests/test_verdict_extract.py +++ b/tests/test_verdict_extract.py @@ -25,7 +25,7 @@ def _parse_github_output(raw: str) -> dict[str, str]: def test_verdict_extract_emits_structured_github_outputs(tmp_path): summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.84 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.40 | Missing edge case. |", ) result = verdict_extract.build_verdict_result(summary, policy="worst") output_path = tmp_path / "github_output.txt" diff --git a/tests/test_verdict_policy_integration.py b/tests/test_verdict_policy_integration.py index 67afa4ee5..1cca00d00 100755 --- a/tests/test_verdict_policy_integration.py +++ b/tests/test_verdict_policy_integration.py @@ -40,7 +40,7 @@ def test_split_verdict_confidence_boundary_needs_human_false(): def test_split_verdict_low_confidence_needs_human_true(): summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.84 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.40 | Missing edge case. |", ) workflow_result = _workflow_result(summary) From 089bc23fa925bb7dc47e9655bacf4d7335f6a8a5 Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 12:58:14 +0000 Subject: [PATCH 05/11] fix: flip needs_human to trigger on high-confidence CONCERNS, not low MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The needs_human gate was backwards: it fired when the CONCERNS provider had LOW confidence (LLM unsure there's a problem) instead of HIGH confidence (LLM confident there's a real problem). Confidence reflects the LLM's certainty in its own evaluation, not a measure of code quality. Low-confidence CONCERNS is a weak signal that shouldn't block follow-up automation. High-confidence CONCERNS is the stronger signal warranting human review. Changed: confidence_value < threshold → confidence_value >= threshold Threshold set to 0.85 (high bar — a human is already in the loop and depth-of-rounds provides an independent guard against runaway automation). --- scripts/langchain/verdict_policy.py | 8 ++++---- tests/test_followup_issue_generator.py | 6 +++--- tests/test_verdict_extract.py | 2 +- tests/test_verdict_policy.py | 21 +++++++++++---------- tests/test_verdict_policy_integration.py | 14 ++++++++------ 5 files changed, 27 insertions(+), 24 deletions(-) diff --git a/scripts/langchain/verdict_policy.py b/scripts/langchain/verdict_policy.py index 809bdcadb..86d422350 100644 --- a/scripts/langchain/verdict_policy.py +++ b/scripts/langchain/verdict_policy.py @@ -16,7 +16,7 @@ "fail": 3, } -CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.50 +CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.85 @dataclass(frozen=True) @@ -193,11 +193,11 @@ def evaluate_verdict_policy( needs_human_reason = "" if split_verdict: confidence_value = concerns_confidence or 0.0 - if confidence_value < CONCERNS_NEEDS_HUMAN_THRESHOLD: + if confidence_value >= CONCERNS_NEEDS_HUMAN_THRESHOLD: needs_human = True needs_human_reason = ( - "Provider verdicts split with low-confidence concerns; " - f"dissenting confidence {confidence_value:.2f} < " + "Provider verdicts split with high-confidence concerns; " + f"dissenting confidence {confidence_value:.2f} >= " f"{CONCERNS_NEEDS_HUMAN_THRESHOLD:.2f}. " "Requires human review before starting another automated follow-up." ) diff --git a/tests/test_followup_issue_generator.py b/tests/test_followup_issue_generator.py index aeb9d1d38..c3f613be6 100755 --- a/tests/test_followup_issue_generator.py +++ b/tests/test_followup_issue_generator.py @@ -528,12 +528,12 @@ def test_advisory_concerns_are_notes(self): assert "- [ ] Address: Could add a clarifying comment" not in followup.body assert "## Notes" in followup.body - def test_split_low_confidence_requires_needs_human(self): - """Low-confidence split verdicts should trigger needs-human labeling.""" + def test_split_high_confidence_requires_needs_human(self): + """High-confidence CONCERNS in a split verdict should trigger needs-human labeling.""" verification_data = VerificationData( provider_verdicts={ "openai": {"verdict": "PASS", "confidence": 90}, - "anthropic": {"verdict": "CONCERNS", "confidence": 40}, + "anthropic": {"verdict": "CONCERNS", "confidence": 92}, }, concerns=["Missing test coverage"], ) diff --git a/tests/test_verdict_extract.py b/tests/test_verdict_extract.py index 938df898a..38d43456a 100644 --- a/tests/test_verdict_extract.py +++ b/tests/test_verdict_extract.py @@ -25,7 +25,7 @@ def _parse_github_output(raw: str) -> dict[str, str]: def test_verdict_extract_emits_structured_github_outputs(tmp_path): summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", - "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.40 | Missing edge case. |", + "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.90 | Missing edge case. |", ) result = verdict_extract.build_verdict_result(summary, policy="worst") output_path = tmp_path / "github_output.txt" diff --git a/tests/test_verdict_policy.py b/tests/test_verdict_policy.py index 7e134a5d5..bfe631e05 100755 --- a/tests/test_verdict_policy.py +++ b/tests/test_verdict_policy.py @@ -51,6 +51,7 @@ def test_select_verdict_majority_policy(): def test_needs_human_threshold_boundary(): + """At exactly the threshold, needs_human should fire (>= comparison).""" verdicts = [ ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92), ProviderVerdict( @@ -60,30 +61,30 @@ def test_needs_human_threshold_boundary(): result = evaluate_verdict_policy(verdicts, policy="worst") - assert result.needs_human is False + assert result.needs_human is True -def test_needs_human_true_below_threshold(): - """Concerns below the threshold should trigger needs_human.""" +def test_needs_human_true_above_threshold(): + """Concerns above the threshold should trigger needs_human.""" verdicts = [ ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92), - ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.40), + ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.90), ] result = evaluate_verdict_policy(verdicts, policy="worst") assert result.needs_human is True assert result.split_verdict is True - assert "low-confidence" in result.needs_human_reason + assert "high-confidence" in result.needs_human_reason def test_moderate_confidence_concerns_do_not_block(): - """Regression: 72% concerns in a split verdict should not trigger needs_human. + """Moderate-confidence concerns in a split verdict should not trigger needs_human. - Previously CONCERNS_NEEDS_HUMAN_THRESHOLD was 0.85, which caused any - split verdict with <85% concerns to be flagged. The lowered threshold - (0.50) allows moderate-confidence concerns to proceed with automatic - follow-up creation. + needs_human only fires when the CONCERNS provider is highly confident + (>= 0.85), indicating the LLM is quite sure there are real problems. + Moderate confidence means the LLM is uncertain — that's a weaker signal + and shouldn't block follow-up automation. """ verdicts = [ ProviderVerdict("openai", "gpt-5.2", "CONCERNS", 72), diff --git a/tests/test_verdict_policy_integration.py b/tests/test_verdict_policy_integration.py index 1cca00d00..40562e48f 100755 --- a/tests/test_verdict_policy_integration.py +++ b/tests/test_verdict_policy_integration.py @@ -23,7 +23,8 @@ def _build_summary(*rows: str) -> str: return f"## Provider Summary\n\n{header}{body}\n" -def test_split_verdict_confidence_boundary_needs_human_false(): +def test_split_verdict_confidence_boundary_needs_human_true(): + """At exactly the threshold (0.85), needs_human should be True.""" summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.85 | Missing edge case. |", @@ -33,11 +34,12 @@ def test_split_verdict_confidence_boundary_needs_human_false(): followup_result = _followup_result(summary) assert workflow_result.verdict == followup_result.verdict == "CONCERNS" - assert workflow_result.needs_human is False - assert followup_result.needs_human is False + assert workflow_result.needs_human is True + assert followup_result.needs_human is True -def test_split_verdict_low_confidence_needs_human_true(): +def test_split_verdict_low_confidence_needs_human_false(): + """Below threshold, low-confidence concerns should NOT trigger needs_human.""" summary = _build_summary( "| openai | gpt-5.2 | PASS | 0.92 | Looks good. |", "| anthropic | claude-sonnet-4-5 | CONCERNS | 0.40 | Missing edge case. |", @@ -47,8 +49,8 @@ def test_split_verdict_low_confidence_needs_human_true(): followup_result = _followup_result(summary) assert workflow_result.verdict == followup_result.verdict == "CONCERNS" - assert workflow_result.needs_human is True - assert followup_result.needs_human is True + assert workflow_result.needs_human is False + assert followup_result.needs_human is False def test_split_verdict_row_order_invariance(): From 371370e04b2f027f3b451ed6ba589e9d82f07ee4 Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 15:23:28 +0000 Subject: [PATCH 06/11] =?UTF-8?q?fix:=20harden=20Codex=20pipeline=20?= =?UTF-8?q?=E2=80=94=20corrupt=20ledger=20resilience,=20autofix=20limits,?= =?UTF-8?q?=20task-focused=20prompts,=20PR=20meta=20debounce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ledger_migrate_base.py: skip corrupt YAML files instead of blocking all belt worker runs (root cause of issue #1418 stall) - agents-autofix-loop: reduce max_attempts 3→2 (standard) and 2→1 (escalated) to cut autofix churn observed in PR #4906 - agents-72-codex-belt-worker: emit task_title output and include task-focused directive in activation comment for higher first-commit success rate - agents-pr-meta: add PR-number concurrency grouping with cancel-in-progress for pull_request events to debounce redundant runs - All template counterparts updated in sync - 2 new tests for corrupt ledger handling --- .../workflows/agents-72-codex-belt-worker.yml | 19 ++++++- .github/workflows/agents-autofix-loop.yml | 4 +- .github/workflows/agents-pr-meta-v4.yml | 4 +- scripts/ledger_migrate_base.py | 14 ++++- .../workflows/agents-72-codex-belt-worker.yml | 19 ++++++- .../.github/workflows/agents-autofix-loop.yml | 4 +- .../.github/workflows/agents-pr-meta.yml | 4 +- tests/scripts/test_ledger_migrate_base.py | 51 ++++++++++++++++++- 8 files changed, 108 insertions(+), 11 deletions(-) diff --git a/.github/workflows/agents-72-codex-belt-worker.yml b/.github/workflows/agents-72-codex-belt-worker.yml index ec87d6d2f..29a4dbefc 100644 --- a/.github/workflows/agents-72-codex-belt-worker.yml +++ b/.github/workflows/agents-72-codex-belt-worker.yml @@ -905,6 +905,7 @@ jobs: if gh_output: with open(gh_output, 'a', encoding='utf-8') as handle: handle.write(f"task_id={start_info['task']['id'] if start_info['task'] else ''}\n") + handle.write(f"task_title={start_info['task']['title'] if start_info['task'] else ''}\n") handle.write(f"task_status={start_info['task']['current_status'] if start_info['task'] else ''}\n") handle.write(f"ledger_changed={'true' if changed else 'false'}\n") handle.write(f"ledger_created={'true' if start_info['created'] else 'false'}\n") @@ -1187,12 +1188,28 @@ jobs: const prNumber = Number('${{ steps.pr.outputs.number }}'); const branch = ('${{ steps.ctx.outputs.branch }}' || '').trim() || '(unknown branch)'; const dryRun = '${{ steps.mode.outputs.dry_run }}' === 'true'; + const taskId = ('${{ steps.ledger_start.outputs.task_id }}' || '').trim(); + const taskTitle = ('${{ steps.ledger_start.outputs.task_title }}' || '').trim(); const { owner, repo } = context.repo; const marker = ''; const summary = dryRun ? `Codex Worker activated for branch \`${branch}\` (dry run preview).` : `Codex Worker activated for branch \`${branch}\`.`; - const body = `${marker}\n${summary}\n\n@codex start\n\nAutomated belt worker prepared this PR. Please continue implementing the requested changes.`; + // Direct Codex to focus on the single next ledger task for higher + // first-commit success probability. Full issue context is in the + // PR body; this comment narrows the immediate scope. + let taskDirective = ''; + if (taskId && taskTitle) { + taskDirective = [ + '', + `**Focus on this task first:** \`${taskId}\` — ${taskTitle}`, + '', + 'Implement **only** this task in your first commit.', + 'Ensure the code compiles and existing tests pass before moving on.', + 'The keepalive loop will assign subsequent tasks after this one is complete.', + ].join('\n'); + } + const body = `${marker}\n${summary}\n\n@codex start${taskDirective}`; try { const comments = await paginateWithRetry( diff --git a/.github/workflows/agents-autofix-loop.yml b/.github/workflows/agents-autofix-loop.yml index f88518eb5..a989768c3 100644 --- a/.github/workflows/agents-autofix-loop.yml +++ b/.github/workflows/agents-autofix-loop.yml @@ -162,7 +162,7 @@ jobs: appendix: '', stop_reason: '', attempts: '0', - max_attempts: '3', + max_attempts: '2', trigger_reason: 'unknown', trigger_job: '', trigger_step: '', @@ -287,7 +287,7 @@ jobs: // Reduce attempts for auto-escalated PRs (they weren't agent-initiated) const isEscalated = labels.includes('autofix:escalated'); const maxAttempts = isEscalated - ? Math.min(2, Number(outputs.max_attempts)) + ? 1 : Number(outputs.max_attempts); const previousRuns = await paginateWithRetry( github, diff --git a/.github/workflows/agents-pr-meta-v4.yml b/.github/workflows/agents-pr-meta-v4.yml index 117b658f2..a7b4a7390 100644 --- a/.github/workflows/agents-pr-meta-v4.yml +++ b/.github/workflows/agents-pr-meta-v4.yml @@ -37,9 +37,11 @@ concurrency: && github.event.comment && github.event.comment.id && format('agents-pr-meta-comment-{0}', github.event.comment.id) + || github.event_name == 'pull_request' + && format('agents-pr-meta-pr-{0}', github.event.pull_request.number) || format('agents-pr-meta-run-{0}', github.run_id) }} - cancel-in-progress: false + cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: comment_event_context: diff --git a/scripts/ledger_migrate_base.py b/scripts/ledger_migrate_base.py index 2a551ecac..a7fcadc8d 100644 --- a/scripts/ledger_migrate_base.py +++ b/scripts/ledger_migrate_base.py @@ -174,14 +174,26 @@ def main(argv: Iterable[str] | None = None) -> int: mismatches: list[LedgerResult] = [] updated: list[LedgerResult] = [] + skipped: list[tuple[Path, str]] = [] for ledger_path in ledgers: - result = migrate_ledger(ledger_path, default_branch, check=args.check) + try: + result = migrate_ledger(ledger_path, default_branch, check=args.check) + except (MigrationError, yaml.YAMLError) as exc: + # One corrupt ledger must not block processing of the remaining files. + print(f"::warning::Skipping {ledger_path.name}: {exc}") + skipped.append((ledger_path, str(exc))) + continue if args.check: if result.previous != default_branch: mismatches.append(result) elif result.changed: updated.append(result) + if skipped: + print(f"Skipped {len(skipped)} corrupt ledger(s):") + for path, reason in skipped: + print(f" - {path.name}: {reason}") + if args.check: if mismatches: print("Found ledgers with stale base values:") diff --git a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml index 14e6f2bfb..3695d5226 100644 --- a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml +++ b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml @@ -905,6 +905,7 @@ jobs: if gh_output: with open(gh_output, 'a', encoding='utf-8') as handle: handle.write(f"task_id={start_info['task']['id'] if start_info['task'] else ''}\n") + handle.write(f"task_title={start_info['task']['title'] if start_info['task'] else ''}\n") handle.write(f"task_status={start_info['task']['current_status'] if start_info['task'] else ''}\n") handle.write(f"ledger_changed={'true' if changed else 'false'}\n") handle.write(f"ledger_created={'true' if start_info['created'] else 'false'}\n") @@ -1187,12 +1188,28 @@ jobs: const prNumber = Number('${{ steps.pr.outputs.number }}'); const branch = ('${{ steps.ctx.outputs.branch }}' || '').trim() || '(unknown branch)'; const dryRun = '${{ steps.mode.outputs.dry_run }}' === 'true'; + const taskId = ('${{ steps.ledger_start.outputs.task_id }}' || '').trim(); + const taskTitle = ('${{ steps.ledger_start.outputs.task_title }}' || '').trim(); const { owner, repo } = context.repo; const marker = ''; const summary = dryRun ? `Codex Worker activated for branch \`${branch}\` (dry run preview).` : `Codex Worker activated for branch \`${branch}\`.`; - const body = `${marker}\n${summary}\n\n@codex start\n\nAutomated belt worker prepared this PR. Please continue implementing the requested changes.`; + // Direct Codex to focus on the single next ledger task for higher + // first-commit success probability. Full issue context is in the + // PR body; this comment narrows the immediate scope. + let taskDirective = ''; + if (taskId && taskTitle) { + taskDirective = [ + '', + `**Focus on this task first:** \`${taskId}\` — ${taskTitle}`, + '', + 'Implement **only** this task in your first commit.', + 'Ensure the code compiles and existing tests pass before moving on.', + 'The keepalive loop will assign subsequent tasks after this one is complete.', + ].join('\n'); + } + const body = `${marker}\n${summary}\n\n@codex start${taskDirective}`; try { const comments = await paginateWithRetry( diff --git a/templates/consumer-repo/.github/workflows/agents-autofix-loop.yml b/templates/consumer-repo/.github/workflows/agents-autofix-loop.yml index 677da056a..902cdf831 100644 --- a/templates/consumer-repo/.github/workflows/agents-autofix-loop.yml +++ b/templates/consumer-repo/.github/workflows/agents-autofix-loop.yml @@ -155,7 +155,7 @@ jobs: appendix: '', stop_reason: '', attempts: '0', - max_attempts: '3', + max_attempts: '2', trigger_reason: 'unknown', trigger_job: '', trigger_step: '', @@ -280,7 +280,7 @@ jobs: // Reduce attempts for auto-escalated PRs (they weren't agent-initiated) const isEscalated = labels.includes('autofix:escalated'); const maxAttempts = isEscalated - ? Math.min(2, Number(outputs.max_attempts)) + ? 1 : Number(outputs.max_attempts); const previousRuns = await paginateWithRetry( github, diff --git a/templates/consumer-repo/.github/workflows/agents-pr-meta.yml b/templates/consumer-repo/.github/workflows/agents-pr-meta.yml index f07a33ee7..24a480544 100644 --- a/templates/consumer-repo/.github/workflows/agents-pr-meta.yml +++ b/templates/consumer-repo/.github/workflows/agents-pr-meta.yml @@ -47,8 +47,10 @@ concurrency: group: >- ${{ github.event_name == 'issue_comment' && format('agents-pr-meta-comment-{0}', github.event.comment.id) || + github.event_name == 'pull_request' && + format('agents-pr-meta-pr-{0}', github.event.pull_request.number) || format('agents-pr-meta-run-{0}', github.run_id) }} - cancel-in-progress: false + cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: # Resolve PR context for issue_comment events diff --git a/tests/scripts/test_ledger_migrate_base.py b/tests/scripts/test_ledger_migrate_base.py index 0fe7bac40..89cb4fc9f 100644 --- a/tests/scripts/test_ledger_migrate_base.py +++ b/tests/scripts/test_ledger_migrate_base.py @@ -248,10 +248,12 @@ def test_main_check_reports_mismatches(monkeypatch, capsys, tmp_path) -> None: agents_dir.mkdir() ledger_path = agents_dir / "issue-9-ledger.yml" ledger_path.write_text( - textwrap.dedent("""\ + textwrap.dedent( + """\ base: develop items: [] - """), + """ + ), encoding="utf-8", ) @@ -318,3 +320,48 @@ def test_main_reports_no_updates(monkeypatch, capsys, tmp_path) -> None: assert exit_code == 0 out = capsys.readouterr().out assert "Ledgers already matched the default branch; no updates written." in out + + +def test_main_skips_corrupt_ledger_and_continues(monkeypatch, capsys, tmp_path) -> None: + """A corrupt YAML ledger must not block processing of other ledgers.""" + agents_dir = tmp_path / ".agents" + agents_dir.mkdir() + # Write a corrupt ledger with invalid YAML + corrupt = agents_dir / "issue-10-ledger.yml" + corrupt.write_text("base: main\ntasks:\n - title: `backtick breaks yaml`\n", encoding="utf-8") + # Write a valid ledger that should still be processed + valid = agents_dir / "issue-20-ledger.yml" + _write_ledger(valid, {"base": "develop", "items": []}) + + monkeypatch.setattr(ledger_migrate_base, "find_repo_root", lambda: tmp_path) + monkeypatch.setattr(ledger_migrate_base, "detect_default_branch", lambda _=None: "main") + + exit_code = ledger_migrate_base.main([]) + + assert exit_code == 0 + out = capsys.readouterr().out + # The corrupt ledger was skipped with a warning + assert "Skipping issue-10-ledger.yml" in out + assert "Skipped 1 corrupt ledger(s):" in out + # The valid ledger was still updated + assert yaml.safe_load(valid.read_text(encoding="utf-8"))["base"] == "main" + + +def test_main_check_skips_corrupt_ledger(monkeypatch, capsys, tmp_path) -> None: + """--check mode also skips corrupt ledgers without failing.""" + agents_dir = tmp_path / ".agents" + agents_dir.mkdir() + corrupt = agents_dir / "issue-5-ledger.yml" + corrupt.write_text("not: valid: yaml: `oops`\n", encoding="utf-8") + valid = agents_dir / "issue-6-ledger.yml" + valid.write_text("base: main\n", encoding="utf-8") + + monkeypatch.setattr(ledger_migrate_base, "find_repo_root", lambda: tmp_path) + monkeypatch.setattr(ledger_migrate_base, "detect_default_branch", lambda _=None: "main") + + exit_code = ledger_migrate_base.main(["--check"]) + + assert exit_code == 0 + out = capsys.readouterr().out + assert "Skipping issue-5-ledger.yml" in out + assert "All ledgers already track the default branch." in out From 6f5e110beb7aecd8bde358f08b0f2831ae8c307a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 12 Feb 2026 15:31:30 +0000 Subject: [PATCH 07/11] chore(autofix): formatting/lint --- tests/scripts/test_ledger_migrate_base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/scripts/test_ledger_migrate_base.py b/tests/scripts/test_ledger_migrate_base.py index 89cb4fc9f..180eaac5a 100644 --- a/tests/scripts/test_ledger_migrate_base.py +++ b/tests/scripts/test_ledger_migrate_base.py @@ -248,12 +248,10 @@ def test_main_check_reports_mismatches(monkeypatch, capsys, tmp_path) -> None: agents_dir.mkdir() ledger_path = agents_dir / "issue-9-ledger.yml" ledger_path.write_text( - textwrap.dedent( - """\ + textwrap.dedent("""\ base: develop items: [] - """ - ), + """), encoding="utf-8", ) From 99b3d5207305a442de453c47cac3a767a91fc22a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 15:35:54 +0000 Subject: [PATCH 08/11] chore(codex-autofix): apply updates (PR #1484) --- tests/scripts/test_ledger_migrate_base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/scripts/test_ledger_migrate_base.py b/tests/scripts/test_ledger_migrate_base.py index 89cb4fc9f..180eaac5a 100644 --- a/tests/scripts/test_ledger_migrate_base.py +++ b/tests/scripts/test_ledger_migrate_base.py @@ -248,12 +248,10 @@ def test_main_check_reports_mismatches(monkeypatch, capsys, tmp_path) -> None: agents_dir.mkdir() ledger_path = agents_dir / "issue-9-ledger.yml" ledger_path.write_text( - textwrap.dedent( - """\ + textwrap.dedent("""\ base: develop items: [] - """ - ), + """), encoding="utf-8", ) From 52bd3542ab1c01a4160e9999de5e3219bc53b819 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 15:40:51 +0000 Subject: [PATCH 09/11] chore(codex-autofix): apply updates (PR #1484) --- .../scripts/__tests__/detect-changes.test.js | 62 +++++++++++++++++++ .github/scripts/detect-changes.js | 57 ++++++++++++++--- 2 files changed, 110 insertions(+), 9 deletions(-) diff --git a/.github/scripts/__tests__/detect-changes.test.js b/.github/scripts/__tests__/detect-changes.test.js index bd7b8c119..09fe2604c 100644 --- a/.github/scripts/__tests__/detect-changes.test.js +++ b/.github/scripts/__tests__/detect-changes.test.js @@ -82,3 +82,65 @@ test('detectChanges fetches files via callback', async () => { assert.equal(result.outputs.run_core, 'true'); assert.equal(result.outputs.workflow_changed, 'false'); }); + +test('detectChanges falls back to conservative defaults when listFiles is inaccessible', async () => { + const warnings = []; + const result = await detectChanges({ + core: { + warning(message) { + warnings.push(String(message)); + }, + setOutput() {}, + }, + context: { + eventName: 'pull_request', + repo: { owner: 'octo', repo: 'demo' }, + payload: { pull_request: { number: 42 } }, + }, + github: { + rest: { + pulls: { + listFiles: async () => ({ data: [] }), + }, + }, + paginate: { + iterator: () => { + const error = new Error('Resource not accessible by integration'); + error.status = 403; + throw error; + }, + }, + }, + }); + + assert.equal(result.outputs.doc_only, 'false'); + assert.equal(result.outputs.run_core, 'true'); + assert.equal(result.outputs.reason, 'rate_limited'); + assert.equal(result.outputs.docker_changed, 'false'); + assert.equal(result.outputs.workflow_changed, 'true'); + assert.equal(warnings.length, 1); + assert.match(warnings[0], /Unable to determine changed files via API/); +}); + +test('detectChanges supports clients without paginate.iterator', async () => { + const result = await detectChanges({ + context: { + eventName: 'pull_request', + repo: { owner: 'octo', repo: 'demo' }, + payload: { pull_request: { number: 1 } }, + }, + github: { + rest: { + pulls: { + listFiles: async () => ({ data: [] }), + }, + }, + paginate: async () => [{ filename: 'docs/README.md' }], + }, + }); + + assert.equal(result.outputs.doc_only, 'true'); + assert.equal(result.outputs.run_core, 'false'); + assert.equal(result.outputs.reason, 'docs_only'); + assert.equal(result.outputs.workflow_changed, 'false'); +}); diff --git a/.github/scripts/detect-changes.js b/.github/scripts/detect-changes.js index 45c31c4f5..64d34cb68 100644 --- a/.github/scripts/detect-changes.js +++ b/.github/scripts/detect-changes.js @@ -168,6 +168,28 @@ function isRateLimitError(error) { return message.includes('rate limit') || message.includes('ratelimit'); } +function isNonFatalListFilesError(error) { + if (!error) { + return false; + } + if (isRateLimitError(error)) { + return true; + } + const status = error.status || error?.response?.status; + if ([401, 403, 404, 422].includes(status)) { + return true; + } + const message = String(error.message || error?.response?.data?.message || '').toLowerCase(); + return ( + message.includes('resource not accessible by integration') || + message.includes('insufficient permission') || + message.includes('requires higher permissions') || + message.includes('not found') || + message.includes('unprocessable') || + message.includes('validation failed') + ); +} + async function listChangedFiles({ github, context }) { const pull = context?.payload?.pull_request; const number = pull?.number; @@ -175,25 +197,42 @@ async function listChangedFiles({ github, context }) { return []; } try { - const iterator = github.paginate.iterator(github.rest.pulls.listFiles, { + const files = []; + const params = { owner: context.repo.owner, repo: context.repo.repo, pull_number: number, per_page: 100, - }); - const files = []; - for await (const page of iterator) { - if (Array.isArray(page.data)) { - for (const item of page.data) { + }; + if (typeof github?.paginate?.iterator === 'function') { + const iterator = github.paginate.iterator(github.rest.pulls.listFiles, params); + for await (const page of iterator) { + if (Array.isArray(page.data)) { + for (const item of page.data) { + if (item && typeof item.filename === 'string') { + files.push(item.filename); + } + } + } + } + return files; + } + + if (typeof github?.paginate === 'function') { + const items = await github.paginate(github.rest.pulls.listFiles, params); + if (Array.isArray(items)) { + for (const item of items) { if (item && typeof item.filename === 'string') { files.push(item.filename); } } } + return files; } - return files; + + throw new Error('GitHub paginate API is unavailable'); } catch (error) { - if (isRateLimitError(error)) { + if (isNonFatalListFilesError(error)) { return null; } throw error; @@ -261,7 +300,7 @@ async function detectChanges({ github, context, core, files, fetchFiles } = {}) workflow_changed: 'true', }; const warn = core?.warning ? core.warning.bind(core) : console.warn.bind(console); - warn('Rate limit reached while determining changed files; assuming code changes (but not docker).'); + warn('Unable to determine changed files via API; assuming code changes (but not docker).'); if (core) { for (const [key, value] of Object.entries(outputs)) { core.setOutput(key, value); From ffda55206332984f4163eba4e20e07c602c0dbc1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 15:41:08 +0000 Subject: [PATCH 10/11] chore: sync template scripts --- .../.github/scripts/detect-changes.js | 57 ++++++++++++++++--- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/templates/consumer-repo/.github/scripts/detect-changes.js b/templates/consumer-repo/.github/scripts/detect-changes.js index 45c31c4f5..64d34cb68 100644 --- a/templates/consumer-repo/.github/scripts/detect-changes.js +++ b/templates/consumer-repo/.github/scripts/detect-changes.js @@ -168,6 +168,28 @@ function isRateLimitError(error) { return message.includes('rate limit') || message.includes('ratelimit'); } +function isNonFatalListFilesError(error) { + if (!error) { + return false; + } + if (isRateLimitError(error)) { + return true; + } + const status = error.status || error?.response?.status; + if ([401, 403, 404, 422].includes(status)) { + return true; + } + const message = String(error.message || error?.response?.data?.message || '').toLowerCase(); + return ( + message.includes('resource not accessible by integration') || + message.includes('insufficient permission') || + message.includes('requires higher permissions') || + message.includes('not found') || + message.includes('unprocessable') || + message.includes('validation failed') + ); +} + async function listChangedFiles({ github, context }) { const pull = context?.payload?.pull_request; const number = pull?.number; @@ -175,25 +197,42 @@ async function listChangedFiles({ github, context }) { return []; } try { - const iterator = github.paginate.iterator(github.rest.pulls.listFiles, { + const files = []; + const params = { owner: context.repo.owner, repo: context.repo.repo, pull_number: number, per_page: 100, - }); - const files = []; - for await (const page of iterator) { - if (Array.isArray(page.data)) { - for (const item of page.data) { + }; + if (typeof github?.paginate?.iterator === 'function') { + const iterator = github.paginate.iterator(github.rest.pulls.listFiles, params); + for await (const page of iterator) { + if (Array.isArray(page.data)) { + for (const item of page.data) { + if (item && typeof item.filename === 'string') { + files.push(item.filename); + } + } + } + } + return files; + } + + if (typeof github?.paginate === 'function') { + const items = await github.paginate(github.rest.pulls.listFiles, params); + if (Array.isArray(items)) { + for (const item of items) { if (item && typeof item.filename === 'string') { files.push(item.filename); } } } + return files; } - return files; + + throw new Error('GitHub paginate API is unavailable'); } catch (error) { - if (isRateLimitError(error)) { + if (isNonFatalListFilesError(error)) { return null; } throw error; @@ -261,7 +300,7 @@ async function detectChanges({ github, context, core, files, fetchFiles } = {}) workflow_changed: 'true', }; const warn = core?.warning ? core.warning.bind(core) : console.warn.bind(console); - warn('Rate limit reached while determining changed files; assuming code changes (but not docker).'); + warn('Unable to determine changed files via API; assuming code changes (but not docker).'); if (core) { for (const [key, value] of Object.entries(outputs)) { core.setOutput(key, value); From e053e6d0728bd3908da64d625e0c3ce10fb4dfde Mon Sep 17 00:00:00 2001 From: stranske Date: Thu, 12 Feb 2026 16:00:49 +0000 Subject: [PATCH 11/11] fix: sanitize task_title for GITHUB_OUTPUT and normalize warning annotations Address inline review feedback on PR #1484: - Sanitize task_title by replacing newlines/carriage returns with spaces before writing to $GITHUB_OUTPUT (prevents broken output parsing) - Normalize yaml.YAMLError messages to single-line in ::warning:: annotations (prevents malformed GitHub Actions annotations) - Both belt-worker copies updated in sync --- .github/workflows/agents-72-codex-belt-worker.yml | 8 +++++--- scripts/ledger_migrate_base.py | 5 +++-- .../.github/workflows/agents-72-codex-belt-worker.yml | 8 +++++--- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/agents-72-codex-belt-worker.yml b/.github/workflows/agents-72-codex-belt-worker.yml index 29a4dbefc..15f8271f4 100644 --- a/.github/workflows/agents-72-codex-belt-worker.yml +++ b/.github/workflows/agents-72-codex-belt-worker.yml @@ -903,10 +903,12 @@ jobs: gh_output = os.environ.get('GITHUB_OUTPUT') if gh_output: + task = start_info['task'] or {} + task_title = (task.get('title') or '').replace('\r', ' ').replace('\n', ' ') with open(gh_output, 'a', encoding='utf-8') as handle: - handle.write(f"task_id={start_info['task']['id'] if start_info['task'] else ''}\n") - handle.write(f"task_title={start_info['task']['title'] if start_info['task'] else ''}\n") - handle.write(f"task_status={start_info['task']['current_status'] if start_info['task'] else ''}\n") + handle.write(f"task_id={task.get('id', '')}\n") + handle.write(f"task_title={task_title}\n") + handle.write(f"task_status={task.get('current_status', '')}\n") handle.write(f"ledger_changed={'true' if changed else 'false'}\n") handle.write(f"ledger_created={'true' if start_info['created'] else 'false'}\n") handle.write(f"ledger_base_aligned={'true' if base_aligned else 'false'}\n") diff --git a/scripts/ledger_migrate_base.py b/scripts/ledger_migrate_base.py index a7fcadc8d..9f87285b0 100644 --- a/scripts/ledger_migrate_base.py +++ b/scripts/ledger_migrate_base.py @@ -180,8 +180,9 @@ def main(argv: Iterable[str] | None = None) -> int: result = migrate_ledger(ledger_path, default_branch, check=args.check) except (MigrationError, yaml.YAMLError) as exc: # One corrupt ledger must not block processing of the remaining files. - print(f"::warning::Skipping {ledger_path.name}: {exc}") - skipped.append((ledger_path, str(exc))) + reason = str(exc).replace("\n", " ").replace("\r", " ") + print(f"::warning::Skipping {ledger_path.name}: {reason}") + skipped.append((ledger_path, reason)) continue if args.check: if result.previous != default_branch: diff --git a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml index 3695d5226..ab0801413 100644 --- a/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml +++ b/templates/consumer-repo/.github/workflows/agents-72-codex-belt-worker.yml @@ -903,10 +903,12 @@ jobs: gh_output = os.environ.get('GITHUB_OUTPUT') if gh_output: + task = start_info['task'] or {} + task_title = (task.get('title') or '').replace('\r', ' ').replace('\n', ' ') with open(gh_output, 'a', encoding='utf-8') as handle: - handle.write(f"task_id={start_info['task']['id'] if start_info['task'] else ''}\n") - handle.write(f"task_title={start_info['task']['title'] if start_info['task'] else ''}\n") - handle.write(f"task_status={start_info['task']['current_status'] if start_info['task'] else ''}\n") + handle.write(f"task_id={task.get('id', '')}\n") + handle.write(f"task_title={task_title}\n") + handle.write(f"task_status={task.get('current_status', '')}\n") handle.write(f"ledger_changed={'true' if changed else 'false'}\n") handle.write(f"ledger_created={'true' if start_info['created'] else 'false'}\n") handle.write(f"ledger_base_aligned={'true' if base_aligned else 'false'}\n")