From b1bbc7af98f18cdc1505501458fcb35ba7d62345 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 24 Dec 2025 10:12:18 +0000 Subject: [PATCH 1/9] chore(codex): bootstrap PR for issue #93 --- agents/codex-93.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 agents/codex-93.md diff --git a/agents/codex-93.md b/agents/codex-93.md new file mode 100644 index 000000000..7b554e0ae --- /dev/null +++ b/agents/codex-93.md @@ -0,0 +1 @@ + From f632427daf3ad01088ceb2a029aa8926f2fc50c8 Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 05:48:45 -0600 Subject: [PATCH 2/9] Add metrics reporting to autofix loop (#97) --- .github/workflows/agents-autofix-loop.yml | 177 ++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/.github/workflows/agents-autofix-loop.yml b/.github/workflows/agents-autofix-loop.yml index 595818e6d..e5c4819fc 100644 --- a/.github/workflows/agents-autofix-loop.yml +++ b/.github/workflows/agents-autofix-loop.yml @@ -30,6 +30,11 @@ jobs: stop_reason: ${{ steps.evaluate.outputs.stop_reason }} attempts: ${{ steps.evaluate.outputs.attempts }} max_attempts: ${{ steps.evaluate.outputs.max_attempts }} + trigger_reason: ${{ steps.evaluate.outputs.trigger_reason }} + trigger_job: ${{ steps.evaluate.outputs.trigger_job }} + trigger_step: ${{ steps.evaluate.outputs.trigger_step }} + gate_conclusion: ${{ steps.evaluate.outputs.gate_conclusion }} + gate_run_id: ${{ steps.evaluate.outputs.gate_run_id }} security_blocked: ${{ steps.security_gate.outputs.blocked }} security_reason: ${{ steps.security_gate.outputs.reason }} steps: @@ -110,6 +115,11 @@ jobs: stop_reason: '', attempts: '0', max_attempts: '3', + trigger_reason: 'unknown', + trigger_job: '', + trigger_step: '', + gate_conclusion: String(run?.conclusion || run?.status || ''), + gate_run_id: String(run?.id || ''), }; const stop = (reason, stopReason = '') => { @@ -202,6 +212,8 @@ jobs: outputs.max_attempts = String(maxAttempts); const failingJobs = []; + let 
triggerJob = null; + let triggerStep = null; for (const job of jobs) { const conclusion = (job.conclusion || job.status || '').toLowerCase(); if (!conclusion || ['success', 'skipped'].includes(conclusion)) { @@ -223,8 +235,36 @@ jobs: detailLines.push(` - steps: ${failingSteps.join('; ')}`); } failingJobs.push(detailLines.join('\n')); + + if (!triggerJob) { + triggerJob = job; + const failingStep = Array.isArray(job.steps) + ? job.steps.find((step) => { + const stepConclusion = (step.conclusion || step.status || '').toLowerCase(); + return stepConclusion && !['success', 'skipped'].includes(stepConclusion); + }) + : null; + triggerStep = failingStep || null; + } } + const inferTriggerReason = (job, step) => { + const text = [job?.name, step?.name] + .filter(Boolean) + .map((value) => String(value).toLowerCase()) + .join(' '); + + if (!text) return 'unknown'; + if (text.includes('mypy')) return 'mypy'; + if (text.includes('lint') || text.includes('flake8') || text.includes('ruff')) return 'lint'; + if (text.includes('pytest') || text.includes('test')) return 'pytest'; + return 'unknown'; + }; + + outputs.trigger_reason = inferTriggerReason(triggerJob, triggerStep); + outputs.trigger_job = triggerJob?.name || triggerJob?.id || ''; + outputs.trigger_step = triggerStep?.name || ''; + const appendixLines = [ `Gate run: ${run.html_url || run.id}`, `Conclusion: ${run.conclusion || run.status || 'unknown'}`, @@ -319,3 +359,140 @@ jobs: issue_number: prNumber, body, }); + + metrics: + name: Record autofix metrics + needs: + - prepare + - autofix + if: always() + runs-on: ubuntu-latest + environment: agent-standard + steps: + - name: Collect metrics + id: collect + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const prNumber = Number('${{ needs.prepare.outputs.pr_number || 0 }}') || 0; + const attemptNumber = Number('${{ needs.prepare.outputs.attempts || 0 }}') || 0; + const attemptLimit = Number('${{ 
needs.prepare.outputs.max_attempts || 0 }}') || 0; + const headShaBefore = '${{ needs.prepare.outputs.head_sha }}'; + const gateConclusionBefore = '${{ needs.prepare.outputs.gate_conclusion }}' || (context.payload.workflow_run?.conclusion || ''); + const gateRunId = '${{ needs.prepare.outputs.gate_run_id }}' || String(context.payload.workflow_run?.id || ''); + const triggerReason = '${{ needs.prepare.outputs.trigger_reason || 'unknown' }}'; + const triggerJob = '${{ needs.prepare.outputs.trigger_job }}'; + const triggerStep = '${{ needs.prepare.outputs.trigger_step }}'; + const stopReason = '${{ needs.prepare.outputs.stop_reason }}'; + const autofixResult = '${{ needs.autofix.result }}'; + + const { owner, repo } = context.repo; + let fixApplied = false; + let headShaAfter = headShaBefore; + let gateResultAfter = gateConclusionBefore || 'unknown'; + + if (prNumber) { + try { + const { data: pr } = await github.rest.pulls.get({ + owner, + repo, + pull_number: prNumber, + }); + headShaAfter = pr.head?.sha || headShaAfter; + fixApplied = Boolean(headShaBefore && headShaAfter && headShaBefore !== headShaAfter); + + const gateWorkflow = 'pr-00-gate.yml'; + const runs = await github.paginate(github.rest.actions.listWorkflowRuns, { + owner, + repo, + workflow_id: gateWorkflow, + head_sha: headShaAfter, + per_page: 20, + }); + const latestGateRun = runs[0]; + if (latestGateRun) { + gateResultAfter = latestGateRun.conclusion || latestGateRun.status || 'unknown'; + } else { + gateResultAfter = 'not-found'; + } + } catch (error) { + core.warning(`Failed to resolve PR or gate status: ${error.message}`); + } + } + + const metrics = { + workflow_run_id: gateRunId, + pr_number: prNumber, + attempt_number: attemptNumber, + attempt_limit: attemptLimit, + trigger_reason: triggerReason || 'unknown', + trigger_job: triggerJob, + trigger_step: triggerStep, + fix_applied: fixApplied, + gate_result_after: gateResultAfter || 'unknown', + gate_conclusion_before: gateConclusionBefore || 
'unknown', + stop_reason: stopReason || '', + autofix_result: autofixResult || 'unknown', + head_sha_before: headShaBefore, + head_sha_after: headShaAfter, + recorded_at: new Date().toISOString(), + }; + + core.setOutput('metrics_json', JSON.stringify(metrics)); + + - name: Write summary and artifact + env: + METRICS_JSON: ${{ steps.collect.outputs.metrics_json }} + run: | + set -euo pipefail + if [ -z "${METRICS_JSON:-}" ]; then + echo "No metrics JSON captured; skipping summary." + exit 0 + fi + + python - <<'PY' + import json + import os + + metrics = json.loads(os.environ["METRICS_JSON"]) + order = [ + "pr_number", + "attempt_number", + "attempt_limit", + "trigger_reason", + "trigger_job", + "trigger_step", + "fix_applied", + "gate_conclusion_before", + "gate_result_after", + "autofix_result", + "stop_reason", + "workflow_run_id", + "head_sha_before", + "head_sha_after", + "recorded_at", + ] + + lines = ["## Autofix loop metrics", ""] + ["| Field | Value |", "| --- | --- |"] + for key in order: + value = metrics.get(key, "") + lines.append(f"| {key} | `{value}` |") + + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if summary_path: + with open(summary_path, "a", encoding="utf-8") as fp: + fp.write("\n".join(lines) + "\n") + + out_path = "autofix-metrics.ndjson" + with open(out_path, "a", encoding="utf-8") as fp: + fp.write(json.dumps(metrics) + "\n") + print(f"Wrote metrics to {out_path}") + PY + + - name: Upload metrics artifact + uses: actions/upload-artifact@v4 + with: + name: agents-autofix-metrics + path: autofix-metrics.ndjson + retention-days: 30 From e45db7b9b238073d199f358c7fd8f727409ea225 Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 05:49:55 -0600 Subject: [PATCH 3/9] Add keepalive metrics summary and artifact (#98) --- .github/workflows/agents-keepalive-loop.yml | 79 +++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/.github/workflows/agents-keepalive-loop.yml b/.github/workflows/agents-keepalive-loop.yml 
index c3427a741..87af58f06 100644 --- a/.github/workflows/agents-keepalive-loop.yml +++ b/.github/workflows/agents-keepalive-loop.yml @@ -36,12 +36,16 @@ jobs: autofix_enabled: ${{ steps.evaluate.outputs.autofix_enabled }} has_agent_label: ${{ steps.evaluate.outputs.has_agent_label }} trace: ${{ steps.evaluate.outputs.trace }} + start_ts: ${{ steps.timestamps.outputs.start_ts }} security_blocked: ${{ steps.security_gate.outputs.blocked }} security_reason: ${{ steps.security_gate.outputs.reason }} steps: - name: Checkout uses: actions/checkout@v4 + - name: Capture timestamps + id: timestamps + run: echo "start_ts=$(date -u +%s)" >> "$GITHUB_OUTPUT" - name: Security gate - prompt injection guard id: security_gate uses: actions/github-script@v7 @@ -146,6 +150,81 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Emit keepalive metrics + id: keepalive-metrics + env: + PR_NUMBER: ${{ needs.evaluate.outputs.pr_number }} + ACTION: ${{ needs.evaluate.outputs.action }} + REASON: ${{ needs.evaluate.outputs.reason }} + GATE_CONCLUSION: ${{ needs.evaluate.outputs.gate_conclusion }} + ITERATION: ${{ needs.evaluate.outputs.iteration }} + MAX_ITERATIONS: ${{ needs.evaluate.outputs.max_iterations }} + TASKS_TOTAL: ${{ needs.evaluate.outputs.tasks_total }} + TASKS_UNCHECKED: ${{ needs.evaluate.outputs.tasks_unchecked }} + START_TS: ${{ needs.evaluate.outputs.start_ts }} + run: | + set -euo pipefail + + now=$(date -u +%s) + if [[ "${START_TS:-}" =~ ^[0-9]+$ ]]; then + duration=$(( now - START_TS )) + if [ "$duration" -lt 0 ]; then duration=0; fi + else + duration=0 + fi + + tasks_total=${TASKS_TOTAL:-0} + tasks_unchecked=${TASKS_UNCHECKED:-0} + if ! [[ "$tasks_total" =~ ^-?[0-9]+$ ]]; then tasks_total=0; fi + if ! 
[[ "$tasks_unchecked" =~ ^-?[0-9]+$ ]]; then tasks_unchecked=0; fi + tasks_completed=$(( tasks_total - tasks_unchecked )) + if [ "$tasks_completed" -lt 0 ]; then tasks_completed=0; fi + + metrics_json=$(jq -n \ + --arg pr "${PR_NUMBER:-0}" \ + --arg iteration "${ITERATION:-0}" \ + --arg action "${ACTION:-}" \ + --arg stop_reason "${REASON:-}" \ + --arg gate_conclusion "${GATE_CONCLUSION:-}" \ + --arg tasks_total "$tasks_total" \ + --arg tasks_completed "$tasks_completed" \ + --arg duration "$duration" \ + '{ + pr_number: ($pr | tonumber? // 0), + iteration_count: ($iteration | tonumber? // 0), + action: $action, + stop_reason: $stop_reason, + gate_conclusion: $gate_conclusion, + tasks_total: ($tasks_total | tonumber? // 0), + tasks_completed: ($tasks_completed | tonumber? // 0), + duration_seconds: ($duration | tonumber? // 0) + }') + + { + echo '### Keepalive metrics' + echo '' + echo '| Field | Value |' + echo '| --- | --- |' + echo "| pr_number | $(echo "$metrics_json" | jq -r '.pr_number') |" + echo "| iteration_count | $(echo "$metrics_json" | jq -r '.iteration_count') |" + echo "| action | $(echo "$metrics_json" | jq -r '.action') |" + echo "| stop_reason | $(echo "$metrics_json" | jq -r '.stop_reason') |" + echo "| gate_conclusion | $(echo "$metrics_json" | jq -r '.gate_conclusion') |" + echo "| tasks_total | $(echo "$metrics_json" | jq -r '.tasks_total') |" + echo "| tasks_completed | $(echo "$metrics_json" | jq -r '.tasks_completed') |" + echo "| duration_seconds | $(echo "$metrics_json" | jq -r '.duration_seconds') |" + } >> "$GITHUB_STEP_SUMMARY" + + echo "$metrics_json" >> keepalive-metrics.ndjson + + - name: Upload keepalive metrics artifact + uses: actions/upload-artifact@v4 + with: + name: keepalive-metrics + path: keepalive-metrics.ndjson + retention-days: 30 + if-no-files-found: error + - name: Update summary comment uses: actions/github-script@v7 with: From 9a538a9289f44cf24641a38290936c701c010705 Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 
24 Dec 2025 05:54:17 -0600 Subject: [PATCH 4/9] Add verifier workflow metrics emission --- .github/workflows/agents-verifier.yml | 116 +++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/.github/workflows/agents-verifier.yml b/.github/workflows/agents-verifier.yml index 42db9e1b5..04d597e03 100644 --- a/.github/workflows/agents-verifier.yml +++ b/.github/workflows/agents-verifier.yml @@ -101,6 +101,7 @@ jobs: echo "verdict=$verdict" >> "$GITHUB_OUTPUT" - name: Open follow-up issue on verifier failure + id: failure_issue if: steps.context.outputs.should_run == 'true' && steps.verdict.outputs.verdict == 'fail' uses: actions/github-script@v7 env: @@ -129,9 +130,122 @@ jobs: lines.push(''); lines.push('- [ ] Re-run verifier after addressing the failures.'); const body = lines.join('\n'); - await github.rest.issues.create({ + const { data: issue } = await github.rest.issues.create({ ...context.repo, title, body, labels: ['agent:codex'], }); + core.setOutput('issue_number', issue?.number ? 
String(issue.number) : ''); + + - name: Collect verifier metrics + if: always() + id: collect_metrics + env: + SHOULD_RUN: ${{ steps.context.outputs.should_run }} + PR_NUMBER: ${{ steps.context.outputs.pr_number }} + VERDICT: ${{ steps.verdict.outputs.verdict }} + CONTEXT_PATH: ${{ steps.context.outputs.context_path }} + CODEX_OUTPUT: codex-output.md + ISSUE_NUMBER: ${{ steps.failure_issue.outputs.issue_number }} + SKIP_REASON: ${{ steps.context.outputs.skip_reason }} + run: | + set -euo pipefail + python - <<'PY' + import json + import os + import re + from pathlib import Path + from datetime import datetime, timezone + + def count_checkboxes(text: str) -> int: + return sum(1 for line in text.splitlines() if re.match(r"^- \[[ xX]\]", line.strip())) + + should_run = (os.environ.get("SHOULD_RUN") or "").lower() == "true" + pr_number = int(os.environ.get("PR_NUMBER") or 0) + verdict = os.environ.get("VERDICT") or "unknown" + context_path = Path(os.environ.get("CONTEXT_PATH") or "") + skip_reason = os.environ.get("SKIP_REASON") or "" + issue_number = os.environ.get("ISSUE_NUMBER") or "" + + if not should_run: + verdict = "skipped" + + issues_created = 1 if issue_number else 0 + issue_numbers = [issue_number] if issue_number else [] + + acceptance_criteria_count = 0 + if context_path.is_file(): + acceptance_criteria_count = count_checkboxes(context_path.read_text(encoding="utf-8")) + + checks_run = 0 + codex_output_path = Path(os.environ.get("CODEX_OUTPUT") or "") + if codex_output_path.is_file(): + content = codex_output_path.read_text(encoding="utf-8") + checks_run = sum(1 for line in content.splitlines()[1:] if line.lstrip().startswith("- ")) + + metrics = { + "pr_number": pr_number, + "verdict": verdict, + "issues_created": issues_created, + "issue_numbers": issue_numbers, + "acceptance_criteria_count": acceptance_criteria_count, + "checks_run": checks_run, + "skip_reason": skip_reason, + "recorded_at": datetime.now(timezone.utc).isoformat(), + } + + 
print(json.dumps(metrics, indent=2)) + with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fp: + fp.write(f"metrics_json={json.dumps(metrics)}\n") + PY + + - name: Write verifier summary + if: always() + env: + METRICS_JSON: ${{ steps.collect_metrics.outputs.metrics_json }} + run: | + set -euo pipefail + if [ -z "${METRICS_JSON:-}" ]; then + echo "No verifier metrics captured; skipping summary." + exit 0 + fi + python - <<'PY' + import json + import os + + metrics = json.loads(os.environ["METRICS_JSON"]) + order = [ + "pr_number", + "verdict", + "issues_created", + "issue_numbers", + "acceptance_criteria_count", + "checks_run", + "skip_reason", + "recorded_at", + ] + + lines = ["## Verifier metrics", ""] + ["| Field | Value |", "| --- | --- |"] + for key in order: + value = metrics.get(key, "") + lines.append(f"| {key} | `{value}` |") + + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if summary_path: + with open(summary_path, "a", encoding="utf-8") as fp: + fp.write("\n".join(lines) + "\n") + + out_path = "verifier-metrics.ndjson" + with open(out_path, "a", encoding="utf-8") as fp: + fp.write(json.dumps(metrics) + "\n") + print(f"Wrote metrics to {out_path}") + PY + + - name: Upload verifier metrics + if: always() + uses: actions/upload-artifact@v4 + with: + name: agents-verifier-metrics + path: verifier-metrics.ndjson + retention-days: 30 From 0040471a963ce6dcf8e2b37f50779251d70c30f2 Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 06:12:29 -0600 Subject: [PATCH 5/9] Add weekly agent metrics aggregation --- .github/workflows/agents-metrics-weekly.yml | 135 ++++++++++ scripts/aggregate_agent_metrics.py | 274 ++++++++++++++++++++ 2 files changed, 409 insertions(+) create mode 100644 .github/workflows/agents-metrics-weekly.yml create mode 100644 scripts/aggregate_agent_metrics.py diff --git a/.github/workflows/agents-metrics-weekly.yml b/.github/workflows/agents-metrics-weekly.yml new file mode 100644 index 000000000..b4739f624 --- 
/dev/null +++ b/.github/workflows/agents-metrics-weekly.yml @@ -0,0 +1,135 @@ +name: Agents metrics weekly + +on: + schedule: + - cron: "0 9 * * MON" + workflow_dispatch: + +permissions: + contents: read + actions: read + issues: write + +jobs: + aggregate: + name: Aggregate agent metrics + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Prepare artifact directory + run: mkdir -p metrics-artifacts + + - name: Download metrics artifacts + id: download + uses: actions/github-script@v7 + env: + OUTPUT_DIR: metrics-artifacts + CUTOFF_DAYS: 35 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const path = require('path'); + const { owner, repo } = context.repo; + const outputDir = process.env.OUTPUT_DIR || 'metrics-artifacts'; + const cutoffDays = Number(process.env.CUTOFF_DAYS || 35); + const cutoff = new Date(Date.now() - cutoffDays * 24 * 60 * 60 * 1000); + const patterns = [/keepalive-metrics/i, /agents-autofix-metrics/i, /autofix-metrics/i, /verifier-metrics/i]; + + fs.mkdirSync(outputDir, { recursive: true }); + let page = 1; + const perPage = 100; + const matches = []; + + while (true) { + const { data } = await github.rest.actions.listArtifactsForRepo({ + owner, + repo, + per_page: perPage, + page, + }); + + const artifacts = data.artifacts || []; + for (const artifact of artifacts) { + const created = new Date(artifact.created_at); + if (artifact.expired) continue; + if (created < cutoff) continue; + if (!patterns.some((re) => re.test(artifact.name))) continue; + matches.push(artifact); + } + + if (artifacts.length < perPage) break; + page += 1; + if (page > 10) break; + } + + for (const artifact of matches) { + const response = await github.rest.actions.downloadArtifact({ + owner, + repo, + artifact_id: artifact.id, + archive_format: 'zip', + }); + const dest = path.join(outputDir, 
`${artifact.name}-${artifact.id}.zip`); + fs.writeFileSync(dest, Buffer.from(response.data)); + core.info(`Downloaded ${artifact.name} to ${dest}`); + } + + core.setOutput('artifact_count', matches.length); + + - name: Extract artifacts + if: steps.download.outputs.artifact_count != '0' + run: | + mkdir -p metrics-artifacts/extracted + shopt -s nullglob + for archive in metrics-artifacts/*.zip; do + unzip -o "$archive" -d metrics-artifacts/extracted + done + + - name: Aggregate metrics + id: aggregate + run: | + set -euo pipefail + python scripts/aggregate_agent_metrics.py --input metrics-artifacts --output metrics-summary.md --recent-days 35 + + - name: Publish summary to issue + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const summaryPath = 'metrics-summary.md'; + const body = fs.existsSync(summaryPath) + ? fs.readFileSync(summaryPath, 'utf-8') + : 'No metrics available for this period.'; + const issueNumber = 93; + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body, + }); + + - name: Upload weekly summary artifact + uses: actions/upload-artifact@v4 + with: + name: agents-metrics-summary + path: metrics-summary.md + if-no-files-found: warn + retention-days: 45 + + - name: Append to job summary + if: always() + run: | + if [ -f metrics-summary.md ]; then + cat metrics-summary.md >> "$GITHUB_STEP_SUMMARY" + else + echo "No metrics summary generated." >> "$GITHUB_STEP_SUMMARY" + fi diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py new file mode 100644 index 000000000..b3897e66a --- /dev/null +++ b/scripts/aggregate_agent_metrics.py @@ -0,0 +1,274 @@ +""" +Aggregate agent workflow metrics from NDJSON artifacts and emit a Markdown summary. 
+ +Usage: + python scripts/aggregate_agent_metrics.py --input artifacts/ --output summary.md +""" + +from __future__ import annotations + +import argparse +import json +import statistics +from collections import Counter, defaultdict +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--input", + "-i", + action="append", + default=[], + help="File or directory containing NDJSON metrics (can be passed multiple times).", + ) + parser.add_argument( + "--recent-days", + type=int, + default=35, + help="Only include metrics recorded in the last N days (default: 35).", + ) + parser.add_argument( + "--output", + "-o", + type=str, + default="", + help="Optional file path to write the Markdown summary.", + ) + return parser.parse_args() + + +def iter_ndjson(paths: Iterable[Path]) -> Iterable[Tuple[Dict, Path]]: + for path in paths: + if path.is_dir(): + yield from iter_ndjson(path.rglob("*.ndjson")) + continue + if path.suffix.lower() != ".ndjson": + continue + try: + for line in path.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + yield json.loads(line), path + except json.JSONDecodeError: + continue + except FileNotFoundError: + continue + + +def _parse_datetime(value: str): + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")) + except Exception: + return None + + +def load_metrics(paths: Iterable[Path], recent_days: int): + cutoff = datetime.now(timezone.utc) - timedelta(days=recent_days) + keepalive, autofix, verifier = [], [], [] + for record, source in iter_ndjson(paths): + if not isinstance(record, dict): + continue + recorded_at = _parse_datetime(str(record.get("recorded_at", ""))) or cutoff + if recorded_at < cutoff: + continue + # Try to infer record type + if "iteration_count" in record: + keepalive.append(record) + 
elif "attempt_number" in record: + autofix.append(record) + elif "verdict" in record: + verifier.append(record) + else: + # Fallback to filename hint + name = source.name.lower() + if "keepalive" in name: + keepalive.append(record) + elif "autofix" in name: + autofix.append(record) + elif "verifier" in name: + verifier.append(record) + return keepalive, autofix, verifier + + +def summarise_keepalive(records: List[Dict]) -> Dict[str, object]: + if not records: + return { + "count": 0, + "pr_count": 0, + "avg_iterations": 0.0, + "completion_pct": 0.0, + "top_stop_reasons": [], + } + + stop_reasons = Counter() + iterations: List[int] = [] + pr_counts = Counter() + completed_prs = set() + + for rec in records: + pr = int(rec.get("pr_number") or 0) + pr_counts[pr] += 1 + iterations.append(int(rec.get("iteration_count") or 0)) + stop = str(rec.get("stop_reason", "")).lower() + if stop: + stop_reasons[stop] += 1 + tasks_total = int(rec.get("tasks_total") or 0) + tasks_completed = int(rec.get("tasks_completed") or 0) + + if "complete" in stop or (tasks_total > 0 and tasks_completed >= tasks_total): + completed_prs.add(pr) + + avg_iterations = statistics.mean(iterations) if iterations else 0.0 + pr_total = len(pr_counts) + completion_pct = (len(completed_prs) / pr_total * 100.0) if pr_total else 0.0 + top_stop_reasons = stop_reasons.most_common(5) + + return { + "count": len(records), + "pr_count": pr_total, + "avg_iterations": round(avg_iterations, 2), + "completion_pct": round(completion_pct, 2), + "top_stop_reasons": top_stop_reasons, + } + + +def summarise_autofix(records: List[Dict]) -> Dict[str, object]: + if not records: + return { + "count": 0, + "pr_count": 0, + "success_pct": 0.0, + "avg_attempts": 0.0, + } + pr_attempts: defaultdict[int, List[int]] = defaultdict(list) + success = 0 + for rec in records: + pr = int(rec.get("pr_number") or 0) + attempt = int(rec.get("attempt_number") or 0) + pr_attempts[pr].append(attempt) + if rec.get("fix_applied"): + success += 
1 + avg_attempts = ( + statistics.mean(max(v) for v in pr_attempts.values()) if pr_attempts else 0.0 + ) + success_pct = (success / len(records) * 100.0) if records else 0.0 + return { + "count": len(records), + "pr_count": len(pr_attempts), + "success_pct": round(success_pct, 2), + "avg_attempts": round(avg_attempts, 2), + } + + +def summarise_verifier(records: List[Dict]) -> Dict[str, object]: + if not records: + return { + "count": 0, + "pass_pct": 0.0, + "avg_checks": 0.0, + "issues_created": 0, + } + verdicts = Counter() + checks: List[int] = [] + issues_created = 0 + for rec in records: + verdict = str(rec.get("verdict", "unknown")).lower() + verdicts[verdict] += 1 + checks.append(int(rec.get("checks_run") or 0)) + issues_created += int(rec.get("issues_created") or 0) + + pass_total = verdicts.get("pass", 0) + total = sum(verdicts.values()) + pass_pct = (pass_total / total * 100.0) if total else 0.0 + avg_checks = statistics.mean(checks) if checks else 0.0 + return { + "count": total, + "pass_pct": round(pass_pct, 2), + "avg_checks": round(avg_checks, 2), + "issues_created": issues_created, + "verdicts": verdicts.most_common(), + } + + +def build_markdown( + keepalive_summary: Dict[str, object], + autofix_summary: Dict[str, object], + verifier_summary: Dict[str, object], + recent_days: int, +) -> str: + lines = [ + "# Weekly agent metrics summary", + "", + f"_Covers metrics from the last **{recent_days} days**._", + "", + "## Keepalive loop", + f"- Records analyzed: **{keepalive_summary['count']}** across **{keepalive_summary['pr_count']} PRs**", + f"- Average iterations per record: **{keepalive_summary['avg_iterations']}**", + f"- PRs completed via keepalive: **{keepalive_summary['completion_pct']}%**", + ] + + top_stops = keepalive_summary.get("top_stop_reasons") or [] + if top_stops: + lines.append("- Top stop reasons:") + for reason, count in top_stops: + lines.append(f" - `{reason or 'unknown'}` — {count}") + + lines += [ + "", + "## Autofix loop", + f"- 
Records analyzed: **{autofix_summary['count']}** across **{autofix_summary['pr_count']} PRs**", + f"- Fix applied success rate: **{autofix_summary['success_pct']}%**", + f"- Average attempts per PR (max): **{autofix_summary['avg_attempts']}**", + "", + "## Verifier", + f"- Records analyzed: **{verifier_summary['count']}**", + f"- Pass rate: **{verifier_summary['pass_pct']}%**", + f"- Average checks run: **{verifier_summary['avg_checks']}**", + f"- Issues created: **{verifier_summary['issues_created']}**", + ] + + verdicts = verifier_summary.get("verdicts") or [] + if verdicts: + lines.append("- Verdict distribution:") + for verdict, count in verdicts: + lines.append(f" - `{verdict}` — {count}") + + lines.append("") + lines.append("## Key question: keepalive completion rate") + lines.append( + f"- Percentage of PRs completing via keepalive without human intervention: **{keepalive_summary['completion_pct']}%**" + ) + return "\n".join(lines) + "\n" + + +def main(): + args = parse_args() + inputs = [Path(p) for p in (args.input or [])] + if not inputs: + inputs = [Path("metrics-artifacts"), Path("artifacts"), Path(".")] + + keepalive_records, autofix_records, verifier_records = load_metrics( + inputs, args.recent_days + ) + + keepalive_summary = summarise_keepalive(keepalive_records) + autofix_summary = summarise_autofix(autofix_records) + verifier_summary = summarise_verifier(verifier_records) + + markdown = build_markdown( + keepalive_summary, autofix_summary, verifier_summary, args.recent_days + ) + print(markdown) + + output_path = Path(args.output) if args.output else None + if output_path: + output_path.write_text(markdown, encoding="utf-8") + + +if __name__ == "__main__": + main() From d96ca89fd3df2ffb2ef820115e885e3bc044e79c Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 23:50:59 -0600 Subject: [PATCH 6/9] Update .github/workflows/agents-metrics-weekly.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
.github/workflows/agents-metrics-weekly.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/agents-metrics-weekly.yml b/.github/workflows/agents-metrics-weekly.yml index b4739f624..b906da3d4 100644 --- a/.github/workflows/agents-metrics-weekly.yml +++ b/.github/workflows/agents-metrics-weekly.yml @@ -95,6 +95,7 @@ jobs: - name: Aggregate metrics id: aggregate + if: steps.download.outputs.artifact_count != '0' run: | set -euo pipefail python scripts/aggregate_agent_metrics.py --input metrics-artifacts --output metrics-summary.md --recent-days 35 From 30099c7df26db66e1236725b31a581a9b26f18d2 Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 23:51:10 -0600 Subject: [PATCH 7/9] Update scripts/aggregate_agent_metrics.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- scripts/aggregate_agent_metrics.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py index b3897e66a..829ad0eef 100644 --- a/scripts/aggregate_agent_metrics.py +++ b/scripts/aggregate_agent_metrics.py @@ -148,9 +148,18 @@ def summarise_autofix(records: List[Dict]) -> Dict[str, object]: pr_attempts: defaultdict[int, List[int]] = defaultdict(list) success = 0 for rec in records: - pr = int(rec.get("pr_number") or 0) + pr_raw = rec.get("pr_number") + pr: int | None = None + if pr_raw is not None: + try: + pr_int = int(pr_raw) + if pr_int > 0: + pr = pr_int + except (TypeError, ValueError): + pr = None attempt = int(rec.get("attempt_number") or 0) - pr_attempts[pr].append(attempt) + if pr is not None: + pr_attempts[pr].append(attempt) if rec.get("fix_applied"): success += 1 avg_attempts = ( From 8077985efca5108cd61b61b7d6911dee7a4aa28c Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 23:51:20 -0600 Subject: [PATCH 8/9] Update scripts/aggregate_agent_metrics.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
scripts/aggregate_agent_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py index 829ad0eef..aef46dd8e 100644 --- a/scripts/aggregate_agent_metrics.py +++ b/scripts/aggregate_agent_metrics.py @@ -63,7 +63,7 @@ def iter_ndjson(paths: Iterable[Path]) -> Iterable[Tuple[Dict, Path]]: def _parse_datetime(value: str): try: return datetime.fromisoformat(value.replace("Z", "+00:00")) - except Exception: + except ValueError: return None From 3dfafeabfdd1c0650f489700808b33f9bdd3eee4 Mon Sep 17 00:00:00 2001 From: stranske Date: Wed, 24 Dec 2025 23:51:42 -0600 Subject: [PATCH 9/9] Update scripts/aggregate_agent_metrics.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- scripts/aggregate_agent_metrics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py index aef46dd8e..90df41c30 100644 --- a/scripts/aggregate_agent_metrics.py +++ b/scripts/aggregate_agent_metrics.py @@ -146,7 +146,7 @@ def summarise_autofix(records: List[Dict]) -> Dict[str, object]: "avg_attempts": 0.0, } pr_attempts: defaultdict[int, List[int]] = defaultdict(list) - success = 0 + successful_prs = set() for rec in records: pr_raw = rec.get("pr_number") pr: int | None = None @@ -160,12 +160,14 @@ def summarise_autofix(records: List[Dict]) -> Dict[str, object]: attempt = int(rec.get("attempt_number") or 0) if pr is not None: pr_attempts[pr].append(attempt) - if rec.get("fix_applied"): - success += 1 + if pr is not None and rec.get("fix_applied"): + successful_prs.add(pr) avg_attempts = ( statistics.mean(max(v) for v in pr_attempts.values()) if pr_attempts else 0.0 ) - success_pct = (success / len(records) * 100.0) if records else 0.0 + success_pct = ( + (len(successful_prs) / len(pr_attempts) * 100.0) if pr_attempts else 0.0 + ) return { "count": len(records), "pr_count": len(pr_attempts),