Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/scripts/error_classifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,10 @@ function classifyByMessage(message) {
function classifyError(error) {
const message = normaliseMessage(error);
const preview = message ? message.slice(0, 50) : 'unknown';
// eslint-disable-next-line no-console
console.log(`[error_classifier] Classifying error: ${preview}`);
if (process.env.RUNNER_DEBUG === '1') {
// eslint-disable-next-line no-console
console.log(`[error_classifier] Classifying error: ${preview}`);
}
const status = getStatusCode(error);

const statusCategory = status ? classifyByStatus(status, message) : null;
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/keepalive_loop.js
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ async function fetchRepoVariables({ github, context, core, names = [] }) {
}
} catch (error) {
if (core) {
core.info(`Failed to fetch repository variables for timeout config: ${error.message}`);
core.debug(`Repository variables not accessible for timeout config (using defaults): ${error.message}`);
}
}

Expand Down
11 changes: 9 additions & 2 deletions .github/workflows/agents-72-codex-belt-worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,13 @@ jobs:
fetch-depth: 1
path: .belt-tools

- name: Re-install API client after branch checkout
if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }}
uses: ./.belt-tools/.github/actions/setup-api-client
with:
secrets: ${{ toJSON(secrets) }}
github_token: ${{ env.GH_BELT_TOKEN || github.token }}

- name: Validate ledger base branch
if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }}
env:
Expand Down Expand Up @@ -764,7 +771,7 @@ jobs:
return super().increase_indent(flow, False)

def iso_now() -> str:
return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z'
return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')

issue = os.environ["ISSUE"].strip()
branch = os.environ["BRANCH"].strip()
Expand Down Expand Up @@ -1272,7 +1279,7 @@ jobs:
return super().increase_indent(flow, False)

def iso_now() -> str:
return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z'
return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')

issue = os.environ.get('ISSUE', '').strip()
branch = os.environ.get('BRANCH', '').strip()
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/agents-keepalive-loop-reporter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ jobs:
token: ${{ steps.app_token.outputs.token || github.token }}
sparse-checkout: |
.github/scripts
.github/actions
sparse-checkout-cone-mode: false
fetch-depth: 1

Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/health-75-api-rate-diagnostic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ jobs:
uses: actions/checkout@v6
- name: Export load-balancer tokens
uses: ./.github/actions/setup-api-client
with:
secrets: ${{ toJSON(secrets) }}
github_token: ${{ github.token }}

- name: Check GITHUB_TOKEN rate limits
id: github_token
Expand Down Expand Up @@ -1457,6 +1460,9 @@ jobs:
uses: actions/checkout@v6
- name: Export load-balancer tokens
uses: ./.github/actions/setup-api-client
with:
secrets: ${{ toJSON(secrets) }}
github_token: ${{ github.token }}
- name: Check for critical utilization
id: check
run: |
Expand Down Expand Up @@ -1570,6 +1576,9 @@ jobs:
uses: actions/checkout@v6
- name: Export load-balancer tokens
uses: ./.github/actions/setup-api-client
with:
secrets: ${{ toJSON(secrets) }}
github_token: ${{ github.token }}
- name: Create issue on repeated failures
env:
GH_TOKEN: ${{ github.token }}
Expand Down Expand Up @@ -1692,6 +1701,9 @@ jobs:
uses: actions/checkout@v6
- name: Export load-balancer tokens
uses: ./.github/actions/setup-api-client
with:
secrets: ${{ toJSON(secrets) }}
github_token: ${{ github.token }}
- name: Parse date range
id: dates
run: |
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/reusable-codex-run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1018,16 +1018,16 @@ jobs:

# Extract key fields for downstream use
if [ -f "$ANALYSIS_FILE" ]; then
COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])))")
PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'))")
MODEL=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('model', 'unknown'))")
CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0))")
COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])), end='')")
PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'), end='')")
MODEL=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('model', 'unknown'), end='')")
CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0), end='')")
# Quality metrics for keepalive integration
RAW_CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('raw_confidence', d.get('confidence', 0)))")
EFFORT_SCORE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('effort_score', 0))")
DATA_QUALITY=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('data_quality', 'unknown'))")
ANALYSIS_LENGTH=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('analysis_text_length', 0))")
QUALITY_WARNINGS=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('quality_warnings', [])))")
RAW_CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('raw_confidence', d.get('confidence', 0)), end='')")
EFFORT_SCORE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('effort_score', 0), end='')")
DATA_QUALITY=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('data_quality', 'unknown'), end='')")
ANALYSIS_LENGTH=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('analysis_text_length', 0), end='')")
QUALITY_WARNINGS=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('quality_warnings', [])), end='')")
{
echo "completed-tasks=$COMPLETED"
echo "provider=$PROVIDER"
Expand Down Expand Up @@ -1185,7 +1185,7 @@ jobs:

# Check if there are any staged changes remaining after reset
if git diff --cached --quiet; then
echo "::warning::No non-artifact changes to commit after filtering. All changes were workflow artifacts."
echo "::notice::No non-artifact changes to commit after filtering. All changes were workflow artifacts."
# Even if nothing to commit, check for unpushed commits that need pushing
git fetch "${REMOTE_URL}" "${TARGET_BRANCH}" 2>/dev/null || true
UNPUSHED=$(git rev-list FETCH_HEAD..HEAD --count 2>/dev/null || echo "0")
Expand Down
3 changes: 2 additions & 1 deletion scripts/langchain/capability_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def _is_multi_action_task(task: str) -> bool:
def _requires_admin_access(task: str) -> bool:
patterns = [
r"\bgithub\s+secrets?\b",
r"\bsecrets?\b",
r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b",
r"\bsecrets?\s+(?:management|configuration|rotation)\b",
r"\brepository\s+settings\b",
r"\brepo\s+settings\b",
r"\bbranch\s+protection\b",
Expand Down
2 changes: 1 addition & 1 deletion scripts/langchain/verdict_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"fail": 3,
}

CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.85
CONCERNS_NEEDS_HUMAN_THRESHOLD = 0.50


@dataclass(frozen=True)
Expand Down
6 changes: 4 additions & 2 deletions templates/consumer-repo/.github/scripts/error_classifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,10 @@ function classifyByMessage(message) {
function classifyError(error) {
const message = normaliseMessage(error);
const preview = message ? message.slice(0, 50) : 'unknown';
// eslint-disable-next-line no-console
console.log(`[error_classifier] Classifying error: ${preview}`);
if (process.env.RUNNER_DEBUG === '1') {
// eslint-disable-next-line no-console
console.log(`[error_classifier] Classifying error: ${preview}`);
}
const status = getStatusCode(error);

const statusCategory = status ? classifyByStatus(status, message) : null;
Expand Down
2 changes: 1 addition & 1 deletion templates/consumer-repo/.github/scripts/keepalive_loop.js
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ async function fetchRepoVariables({ github, context, core, names = [] }) {
}
} catch (error) {
if (core) {
core.info(`Failed to fetch repository variables for timeout config: ${error.message}`);
core.debug(`Repository variables not accessible for timeout config (using defaults): ${error.message}`);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,13 @@ jobs:
fetch-depth: 1
path: .belt-tools

- name: Re-install API client after branch checkout
if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }}
uses: ./.belt-tools/.github/actions/setup-api-client
with:
secrets: ${{ toJSON(secrets) }}
github_token: ${{ env.GH_BELT_TOKEN || github.token }}

- name: Validate ledger base branch
if: ${{ steps.parallel.outputs.allowed == 'true' && (inputs.keepalive != true || steps.keepalive_gate.outputs.action != 'skip') }}
env:
Expand Down Expand Up @@ -764,7 +771,7 @@ jobs:
return super().increase_indent(flow, False)

def iso_now() -> str:
return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z'
return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')

issue = os.environ["ISSUE"].strip()
branch = os.environ["BRANCH"].strip()
Expand Down Expand Up @@ -1272,7 +1279,7 @@ jobs:
return super().increase_indent(flow, False)

def iso_now() -> str:
return dt.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z'
return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')

issue = os.environ.get('ISSUE', '').strip()
branch = os.environ.get('BRANCH', '').strip()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ def _is_multi_action_task(task: str) -> bool:
def _requires_admin_access(task: str) -> bool:
patterns = [
r"\bgithub\s+secrets?\b",
r"\bsecrets?\b",
r"\b(?:manage|configure|set|create|update|delete|add|modify|rotate)\s+secrets?\b",
r"\bsecrets?\s+(?:management|configuration|rotation)\b",
r"\brepository\s+settings\b",
r"\brepo\s+settings\b",
r"\bbranch\s+protection\b",
Expand Down
15 changes: 15 additions & 0 deletions tests/scripts/test_capability_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,21 @@ def test_fallback_flags_admin_requirement(self) -> None:
assert result.blocked_tasks[0]["task"] == "Update GitHub secrets"
assert "admin" in result.blocked_tasks[0]["reason"].lower()

def test_fallback_does_not_flag_negated_secrets_mention(self) -> None:
    """Regression guard: a negated "no secrets" constraint must not be admin-blocked.

    The LLM client lookup is patched to return ``None`` for the duration of
    the classification call, and the task text only mentions secrets as
    something that is forbidden.
    """
    constraint_text = "safety rules (no secrets, no workflow edits, no file writes)"
    no_llm = mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None)
    with no_llm:
        outcome = classify_capabilities([constraint_text], "")

    # Neither the overall recommendation nor any blocked entry may point at
    # the constraint text.
    assert outcome.recommendation != "BLOCKED"
    assert not any(entry["task"] == constraint_text for entry in outcome.blocked_tasks)

def test_fallback_flags_manage_secrets(self) -> None:
    """An explicit secrets-management verb ("manage secrets") is still blocked.

    The LLM client lookup is patched to return ``None`` while the task is
    classified; the result must be BLOCKED with an admin-related reason.
    """
    tasks = ["manage secrets for deployment"]
    no_llm = mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None)
    with no_llm:
        outcome = classify_capabilities(tasks, "")

    assert outcome.recommendation == "BLOCKED"
    first_reason = outcome.blocked_tasks[0]["reason"]
    assert "admin" in first_reason.lower()

def test_fallback_suggests_decomposition(self) -> None:
with mock.patch("scripts.langchain.capability_check._get_llm_client", return_value=None):
result = classify_capabilities(["Refactor auth + add tests + update docs"], "")
Expand Down
2 changes: 1 addition & 1 deletion tests/test_followup_issue_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ def test_split_low_confidence_requires_needs_human(self):
verification_data = VerificationData(
provider_verdicts={
"openai": {"verdict": "PASS", "confidence": 90},
"anthropic": {"verdict": "CONCERNS", "confidence": 70},
"anthropic": {"verdict": "CONCERNS", "confidence": 49},
},
concerns=["Missing test coverage"],
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_verdict_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def _parse_github_output(raw: str) -> dict[str, str]:
def test_verdict_extract_emits_structured_github_outputs(tmp_path):
summary = _build_summary(
"| openai | gpt-5.2 | PASS | 0.92 | Looks good. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.84 | Missing edge case. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |",
)
result = verdict_extract.build_verdict_result(summary, policy="worst")
output_path = tmp_path / "github_output.txt"
Expand Down
34 changes: 34 additions & 0 deletions tests/test_verdict_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,37 @@ def test_needs_human_threshold_boundary():
result = evaluate_verdict_policy(verdicts, policy="worst")

assert result.needs_human is False


def test_needs_human_true_below_threshold():
    """A split verdict with CONCERNS confidence below the threshold needs a human."""
    confident_pass = ProviderVerdict("openai", "gpt-5.2", "PASS", 0.92)
    shaky_concerns = ProviderVerdict("anthropic", "claude-sonnet-4-5", "CONCERNS", 0.40)

    outcome = evaluate_verdict_policy([confident_pass, shaky_concerns], policy="worst")

    # The low-confidence CONCERNS vote must escalate, and the reason text
    # must say why.
    assert outcome.needs_human is True
    assert outcome.split_verdict is True
    assert "low-confidence" in outcome.needs_human_reason


def test_moderate_confidence_concerns_do_not_block():
    """Regression: 72% concerns in a split verdict should not trigger needs_human.

    Previously CONCERNS_NEEDS_HUMAN_THRESHOLD was 0.85, which caused any
    split verdict with <85% concerns to be flagged. The lowered threshold
    (0.50) allows moderate-confidence concerns to proceed with automatic
    follow-up creation.

    Confidences are given on the same 0-1 scale as the threshold constant and
    the other verdict-policy tests, so 0.72 sits between the old (0.85) and
    new (0.50) thresholds and the regression is actually exercised.
    """
    verdicts = [
        # 0.72 is below the old 0.85 threshold but above the new 0.50 one.
        ProviderVerdict("openai", "gpt-5.2", "CONCERNS", 0.72),
        ProviderVerdict("anthropic", "claude-sonnet-4-5", "PASS", 0.85),
    ]

    result = evaluate_verdict_policy(verdicts, policy="worst")

    assert result.split_verdict is True
    assert result.needs_human is False
    assert result.verdict == "CONCERNS"
2 changes: 1 addition & 1 deletion tests/test_verdict_policy_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_split_verdict_confidence_boundary_needs_human_false():
def test_split_verdict_low_confidence_needs_human_true():
summary = _build_summary(
"| openai | gpt-5.2 | PASS | 0.92 | Looks good. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.84 | Missing edge case. |",
"| anthropic | claude-sonnet-4-5 | CONCERNS | 0.49 | Missing edge case. |",
)

workflow_result = _workflow_result(summary)
Expand Down
Loading