From b1bbc7af98f18cdc1505501458fcb35ba7d62345 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Wed, 24 Dec 2025 10:12:18 +0000
Subject: [PATCH 1/9] chore(codex): bootstrap PR for issue #93

---
 agents/codex-93.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 agents/codex-93.md

diff --git a/agents/codex-93.md b/agents/codex-93.md
new file mode 100644
index 000000000..7b554e0ae
--- /dev/null
+++ b/agents/codex-93.md
@@ -0,0 +1 @@
+<!-- bootstrap for codex on issue #93 -->

From f632427daf3ad01088ceb2a029aa8926f2fc50c8 Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 05:48:45 -0600
Subject: [PATCH 2/9] Add metrics reporting to autofix loop (#97)

---
 .github/workflows/agents-autofix-loop.yml | 177 ++++++++++++++++++++++
 1 file changed, 177 insertions(+)

diff --git a/.github/workflows/agents-autofix-loop.yml b/.github/workflows/agents-autofix-loop.yml
index 595818e6d..e5c4819fc 100644
--- a/.github/workflows/agents-autofix-loop.yml
+++ b/.github/workflows/agents-autofix-loop.yml
@@ -30,6 +30,11 @@ jobs:
       stop_reason: ${{ steps.evaluate.outputs.stop_reason }}
       attempts: ${{ steps.evaluate.outputs.attempts }}
       max_attempts: ${{ steps.evaluate.outputs.max_attempts }}
+      trigger_reason: ${{ steps.evaluate.outputs.trigger_reason }}
+      trigger_job: ${{ steps.evaluate.outputs.trigger_job }}
+      trigger_step: ${{ steps.evaluate.outputs.trigger_step }}
+      gate_conclusion: ${{ steps.evaluate.outputs.gate_conclusion }}
+      gate_run_id: ${{ steps.evaluate.outputs.gate_run_id }}
       security_blocked: ${{ steps.security_gate.outputs.blocked }}
       security_reason: ${{ steps.security_gate.outputs.reason }}
     steps:
@@ -110,6 +115,11 @@ jobs:
               stop_reason: '',
               attempts: '0',
               max_attempts: '3',
+              trigger_reason: 'unknown',
+              trigger_job: '',
+              trigger_step: '',
+              gate_conclusion: String(run?.conclusion || run?.status || ''),
+              gate_run_id: String(run?.id || ''),
             };
 
             const stop = (reason, stopReason = '') => {
@@ -202,6 +212,8 @@ jobs:
             outputs.max_attempts = String(maxAttempts);
 
             const failingJobs = [];
+            let triggerJob = null;
+            let triggerStep = null;
             for (const job of jobs) {
               const conclusion = (job.conclusion || job.status || '').toLowerCase();
               if (!conclusion || ['success', 'skipped'].includes(conclusion)) {
@@ -223,8 +235,36 @@ jobs:
                 detailLines.push(`  - steps: ${failingSteps.join('; ')}`);
               }
               failingJobs.push(detailLines.join('\n'));
+
+              if (!triggerJob) {
+                triggerJob = job;
+                const failingStep = Array.isArray(job.steps)
+                  ? job.steps.find((step) => {
+                      const stepConclusion = (step.conclusion || step.status || '').toLowerCase();
+                      return stepConclusion && !['success', 'skipped'].includes(stepConclusion);
+                    })
+                  : null;
+                triggerStep = failingStep || null;
+              }
             }
 
+            const inferTriggerReason = (job, step) => {
+              const text = [job?.name, step?.name]
+                .filter(Boolean)
+                .map((value) => String(value).toLowerCase())
+                .join(' ');
+
+              if (!text) return 'unknown';
+              if (text.includes('mypy')) return 'mypy';
+              if (text.includes('lint') || text.includes('flake8') || text.includes('ruff')) return 'lint';
+              if (text.includes('pytest') || text.includes('test')) return 'pytest';
+              return 'unknown';
+            };
+
+            outputs.trigger_reason = inferTriggerReason(triggerJob, triggerStep);
+            outputs.trigger_job = triggerJob?.name || triggerJob?.id || '';
+            outputs.trigger_step = triggerStep?.name || '';
+
             const appendixLines = [
               `Gate run: ${run.html_url || run.id}`,
               `Conclusion: ${run.conclusion || run.status || 'unknown'}`,
@@ -319,3 +359,140 @@ jobs:
               issue_number: prNumber,
               body,
             });
+
+  metrics:
+    name: Record autofix metrics
+    needs:
+      - prepare
+      - autofix
+    if: always()
+    runs-on: ubuntu-latest
+    environment: agent-standard
+    steps:
+      - name: Collect metrics
+        id: collect
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const prNumber = Number('${{ needs.prepare.outputs.pr_number || 0 }}') || 0;
+            const attemptNumber = Number('${{ needs.prepare.outputs.attempts || 0 }}') || 0;
+            const attemptLimit = Number('${{ needs.prepare.outputs.max_attempts || 0 }}') || 0;
+            const headShaBefore = '${{ needs.prepare.outputs.head_sha }}';
+            const gateConclusionBefore = '${{ needs.prepare.outputs.gate_conclusion }}' || (context.payload.workflow_run?.conclusion || '');
+            const gateRunId = '${{ needs.prepare.outputs.gate_run_id }}' || String(context.payload.workflow_run?.id || '');
+            const triggerReason = '${{ needs.prepare.outputs.trigger_reason || 'unknown' }}';
+            const triggerJob = '${{ needs.prepare.outputs.trigger_job }}';
+            const triggerStep = '${{ needs.prepare.outputs.trigger_step }}';
+            const stopReason = '${{ needs.prepare.outputs.stop_reason }}';
+            const autofixResult = '${{ needs.autofix.result }}';
+
+            const { owner, repo } = context.repo;
+            let fixApplied = false;
+            let headShaAfter = headShaBefore;
+            let gateResultAfter = gateConclusionBefore || 'unknown';
+
+            if (prNumber) {
+              try {
+                const { data: pr } = await github.rest.pulls.get({
+                  owner,
+                  repo,
+                  pull_number: prNumber,
+                });
+                headShaAfter = pr.head?.sha || headShaAfter;
+                fixApplied = Boolean(headShaBefore && headShaAfter && headShaBefore !== headShaAfter);
+
+                const gateWorkflow = 'pr-00-gate.yml';
+                const runs = await github.paginate(github.rest.actions.listWorkflowRuns, {
+                  owner,
+                  repo,
+                  workflow_id: gateWorkflow,
+                  head_sha: headShaAfter,
+                  per_page: 20,
+                });
+                const latestGateRun = runs[0];
+                if (latestGateRun) {
+                  gateResultAfter = latestGateRun.conclusion || latestGateRun.status || 'unknown';
+                } else {
+                  gateResultAfter = 'not-found';
+                }
+              } catch (error) {
+                core.warning(`Failed to resolve PR or gate status: ${error.message}`);
+              }
+            }
+
+            const metrics = {
+              workflow_run_id: gateRunId,
+              pr_number: prNumber,
+              attempt_number: attemptNumber,
+              attempt_limit: attemptLimit,
+              trigger_reason: triggerReason || 'unknown',
+              trigger_job: triggerJob,
+              trigger_step: triggerStep,
+              fix_applied: fixApplied,
+              gate_result_after: gateResultAfter || 'unknown',
+              gate_conclusion_before: gateConclusionBefore || 'unknown',
+              stop_reason: stopReason || '',
+              autofix_result: autofixResult || 'unknown',
+              head_sha_before: headShaBefore,
+              head_sha_after: headShaAfter,
+              recorded_at: new Date().toISOString(),
+            };
+
+            core.setOutput('metrics_json', JSON.stringify(metrics));
+
+      - name: Write summary and artifact
+        env:
+          METRICS_JSON: ${{ steps.collect.outputs.metrics_json }}
+        run: |
+          set -euo pipefail
+          if [ -z "${METRICS_JSON:-}" ]; then
+            echo "No metrics JSON captured; skipping summary."
+            exit 0
+          fi
+
+          python - <<'PY'
+          import json
+          import os
+
+          metrics = json.loads(os.environ["METRICS_JSON"])
+          order = [
+              "pr_number",
+              "attempt_number",
+              "attempt_limit",
+              "trigger_reason",
+              "trigger_job",
+              "trigger_step",
+              "fix_applied",
+              "gate_conclusion_before",
+              "gate_result_after",
+              "autofix_result",
+              "stop_reason",
+              "workflow_run_id",
+              "head_sha_before",
+              "head_sha_after",
+              "recorded_at",
+          ]
+
+          lines = ["## Autofix loop metrics", ""] + ["| Field | Value |", "| --- | --- |"]
+          for key in order:
+              value = metrics.get(key, "")
+              lines.append(f"| {key} | `{value}` |")
+
+          summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+          if summary_path:
+              with open(summary_path, "a", encoding="utf-8") as fp:
+                  fp.write("\n".join(lines) + "\n")
+
+          out_path = "autofix-metrics.ndjson"
+          with open(out_path, "a", encoding="utf-8") as fp:
+              fp.write(json.dumps(metrics) + "\n")
+          print(f"Wrote metrics to {out_path}")
+          PY
+
+      - name: Upload metrics artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: agents-autofix-metrics
+          path: autofix-metrics.ndjson
+          retention-days: 30

From e45db7b9b238073d199f358c7fd8f727409ea225 Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 05:49:55 -0600
Subject: [PATCH 3/9] Add keepalive metrics summary and artifact (#98)

---
 .github/workflows/agents-keepalive-loop.yml | 79 +++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/.github/workflows/agents-keepalive-loop.yml b/.github/workflows/agents-keepalive-loop.yml
index c3427a741..87af58f06 100644
--- a/.github/workflows/agents-keepalive-loop.yml
+++ b/.github/workflows/agents-keepalive-loop.yml
@@ -36,12 +36,16 @@ jobs:
       autofix_enabled: ${{ steps.evaluate.outputs.autofix_enabled }}
       has_agent_label: ${{ steps.evaluate.outputs.has_agent_label }}
       trace: ${{ steps.evaluate.outputs.trace }}
+      start_ts: ${{ steps.timestamps.outputs.start_ts }}
       security_blocked: ${{ steps.security_gate.outputs.blocked }}
       security_reason: ${{ steps.security_gate.outputs.reason }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Capture timestamps
+        id: timestamps
+        run: echo "start_ts=$(date -u +%s)" >> "$GITHUB_OUTPUT"
       - name: Security gate - prompt injection guard
         id: security_gate
         uses: actions/github-script@v7
@@ -146,6 +150,81 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Emit keepalive metrics
+        id: keepalive-metrics
+        env:
+          PR_NUMBER: ${{ needs.evaluate.outputs.pr_number }}
+          ACTION: ${{ needs.evaluate.outputs.action }}
+          REASON: ${{ needs.evaluate.outputs.reason }}
+          GATE_CONCLUSION: ${{ needs.evaluate.outputs.gate_conclusion }}
+          ITERATION: ${{ needs.evaluate.outputs.iteration }}
+          MAX_ITERATIONS: ${{ needs.evaluate.outputs.max_iterations }}
+          TASKS_TOTAL: ${{ needs.evaluate.outputs.tasks_total }}
+          TASKS_UNCHECKED: ${{ needs.evaluate.outputs.tasks_unchecked }}
+          START_TS: ${{ needs.evaluate.outputs.start_ts }}
+        run: |
+          set -euo pipefail
+
+          now=$(date -u +%s)
+          if [[ "${START_TS:-}" =~ ^[0-9]+$ ]]; then
+            duration=$(( now - START_TS ))
+            if [ "$duration" -lt 0 ]; then duration=0; fi
+          else
+            duration=0
+          fi
+
+          tasks_total=${TASKS_TOTAL:-0}
+          tasks_unchecked=${TASKS_UNCHECKED:-0}
+          if ! [[ "$tasks_total" =~ ^-?[0-9]+$ ]]; then tasks_total=0; fi
+          if ! [[ "$tasks_unchecked" =~ ^-?[0-9]+$ ]]; then tasks_unchecked=0; fi
+          tasks_completed=$(( tasks_total - tasks_unchecked ))
+          if [ "$tasks_completed" -lt 0 ]; then tasks_completed=0; fi
+
+          metrics_json=$(jq -n \
+            --arg pr "${PR_NUMBER:-0}" \
+            --arg iteration "${ITERATION:-0}" \
+            --arg action "${ACTION:-}" \
+            --arg stop_reason "${REASON:-}" \
+            --arg gate_conclusion "${GATE_CONCLUSION:-}" \
+            --arg tasks_total "$tasks_total" \
+            --arg tasks_completed "$tasks_completed" \
+            --arg duration "$duration" \
+            '{
+              pr_number: ($pr | tonumber? // 0),
+              iteration_count: ($iteration | tonumber? // 0),
+              action: $action,
+              stop_reason: $stop_reason,
+              gate_conclusion: $gate_conclusion,
+              tasks_total: ($tasks_total | tonumber? // 0),
+              tasks_completed: ($tasks_completed | tonumber? // 0),
+              duration_seconds: ($duration | tonumber? // 0)
+            }')
+
+          {
+            echo '### Keepalive metrics'
+            echo ''
+            echo '| Field | Value |'
+            echo '| --- | --- |'
+            echo "| pr_number | $(echo "$metrics_json" | jq -r '.pr_number') |"
+            echo "| iteration_count | $(echo "$metrics_json" | jq -r '.iteration_count') |"
+            echo "| action | $(echo "$metrics_json" | jq -r '.action') |"
+            echo "| stop_reason | $(echo "$metrics_json" | jq -r '.stop_reason') |"
+            echo "| gate_conclusion | $(echo "$metrics_json" | jq -r '.gate_conclusion') |"
+            echo "| tasks_total | $(echo "$metrics_json" | jq -r '.tasks_total') |"
+            echo "| tasks_completed | $(echo "$metrics_json" | jq -r '.tasks_completed') |"
+            echo "| duration_seconds | $(echo "$metrics_json" | jq -r '.duration_seconds') |"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          echo "$metrics_json" >> keepalive-metrics.ndjson
+
+      - name: Upload keepalive metrics artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: keepalive-metrics
+          path: keepalive-metrics.ndjson
+          retention-days: 30
+          if-no-files-found: error
+
       - name: Update summary comment
         uses: actions/github-script@v7
         with:

From 9a538a9289f44cf24641a38290936c701c010705 Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 05:54:17 -0600
Subject: [PATCH 4/9] Add verifier workflow metrics emission

---
 .github/workflows/agents-verifier.yml | 116 +++++++++++++++++++++++++-
 1 file changed, 115 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/agents-verifier.yml b/.github/workflows/agents-verifier.yml
index 42db9e1b5..04d597e03 100644
--- a/.github/workflows/agents-verifier.yml
+++ b/.github/workflows/agents-verifier.yml
@@ -101,6 +101,7 @@ jobs:
           echo "verdict=$verdict" >> "$GITHUB_OUTPUT"
 
       - name: Open follow-up issue on verifier failure
+        id: failure_issue
         if: steps.context.outputs.should_run == 'true' && steps.verdict.outputs.verdict == 'fail'
         uses: actions/github-script@v7
         env:
@@ -129,9 +130,122 @@ jobs:
             lines.push('');
             lines.push('- [ ] Re-run verifier after addressing the failures.');
             const body = lines.join('\n');
-            await github.rest.issues.create({
+            const { data: issue } = await github.rest.issues.create({
               ...context.repo,
               title,
               body,
               labels: ['agent:codex'],
             });
+            core.setOutput('issue_number', issue?.number ? String(issue.number) : '');
+
+      - name: Collect verifier metrics
+        if: always()
+        id: collect_metrics
+        env:
+          SHOULD_RUN: ${{ steps.context.outputs.should_run }}
+          PR_NUMBER: ${{ steps.context.outputs.pr_number }}
+          VERDICT: ${{ steps.verdict.outputs.verdict }}
+          CONTEXT_PATH: ${{ steps.context.outputs.context_path }}
+          CODEX_OUTPUT: codex-output.md
+          ISSUE_NUMBER: ${{ steps.failure_issue.outputs.issue_number }}
+          SKIP_REASON: ${{ steps.context.outputs.skip_reason }}
+        run: |
+          set -euo pipefail
+          python - <<'PY'
+          import json
+          import os
+          import re
+          from pathlib import Path
+          from datetime import datetime, timezone
+
+          def count_checkboxes(text: str) -> int:
+              return sum(1 for line in text.splitlines() if re.match(r"^- \\[[ xX]\\]", line.strip()))
+
+          should_run = (os.environ.get("SHOULD_RUN") or "").lower() == "true"
+          pr_number = int(os.environ.get("PR_NUMBER") or 0)
+          verdict = os.environ.get("VERDICT") or "unknown"
+          context_path = Path(os.environ.get("CONTEXT_PATH") or "")
+          skip_reason = os.environ.get("SKIP_REASON") or ""
+          issue_number = os.environ.get("ISSUE_NUMBER") or ""
+
+          if not should_run:
+              verdict = "skipped"
+
+          issues_created = 1 if issue_number else 0
+          issue_numbers = [issue_number] if issue_number else []
+
+          acceptance_criteria_count = 0
+          if context_path.is_file():
+              acceptance_criteria_count = count_checkboxes(context_path.read_text(encoding="utf-8"))
+
+          checks_run = 0
+          codex_output_path = Path(os.environ.get("CODEX_OUTPUT") or "")
+          if codex_output_path.is_file():
+              content = codex_output_path.read_text(encoding="utf-8")
+              checks_run = sum(1 for line in content.splitlines()[1:] if line.lstrip().startswith("- "))
+
+          metrics = {
+              "pr_number": pr_number,
+              "verdict": verdict,
+              "issues_created": issues_created,
+              "issue_numbers": issue_numbers,
+              "acceptance_criteria_count": acceptance_criteria_count,
+              "checks_run": checks_run,
+              "skip_reason": skip_reason,
+              "recorded_at": datetime.now(timezone.utc).isoformat(),
+          }
+
+          print(json.dumps(metrics, indent=2))
+          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fp:
+              fp.write(f"metrics_json={json.dumps(metrics)}\n")
+          PY
+
+      - name: Write verifier summary
+        if: always()
+        env:
+          METRICS_JSON: ${{ steps.collect_metrics.outputs.metrics_json }}
+        run: |
+          set -euo pipefail
+          if [ -z "${METRICS_JSON:-}" ]; then
+            echo "No verifier metrics captured; skipping summary."
+            exit 0
+          fi
+          python - <<'PY'
+          import json
+          import os
+
+          metrics = json.loads(os.environ["METRICS_JSON"])
+          order = [
+              "pr_number",
+              "verdict",
+              "issues_created",
+              "issue_numbers",
+              "acceptance_criteria_count",
+              "checks_run",
+              "skip_reason",
+              "recorded_at",
+          ]
+
+          lines = ["## Verifier metrics", ""] + ["| Field | Value |", "| --- | --- |"]
+          for key in order:
+              value = metrics.get(key, "")
+              lines.append(f"| {key} | `{value}` |")
+
+          summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+          if summary_path:
+              with open(summary_path, "a", encoding="utf-8") as fp:
+                  fp.write("\n".join(lines) + "\n")
+
+          out_path = "verifier-metrics.ndjson"
+          with open(out_path, "a", encoding="utf-8") as fp:
+              fp.write(json.dumps(metrics) + "\n")
+          print(f"Wrote metrics to {out_path}")
+          PY
+
+      - name: Upload verifier metrics
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: agents-verifier-metrics
+          path: verifier-metrics.ndjson
+          retention-days: 30

From 0040471a963ce6dcf8e2b37f50779251d70c30f2 Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 06:12:29 -0600
Subject: [PATCH 5/9] Add weekly agent metrics aggregation

---
 .github/workflows/agents-metrics-weekly.yml | 135 ++++++++++
 scripts/aggregate_agent_metrics.py          | 274 ++++++++++++++++++++
 2 files changed, 409 insertions(+)
 create mode 100644 .github/workflows/agents-metrics-weekly.yml
 create mode 100644 scripts/aggregate_agent_metrics.py

diff --git a/.github/workflows/agents-metrics-weekly.yml b/.github/workflows/agents-metrics-weekly.yml
new file mode 100644
index 000000000..b4739f624
--- /dev/null
+++ b/.github/workflows/agents-metrics-weekly.yml
@@ -0,0 +1,135 @@
+name: Agents metrics weekly
+
+on:
+  schedule:
+    - cron: "0 9 * * MON"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  actions: read
+  issues: write
+
+jobs:
+  aggregate:
+    name: Aggregate agent metrics
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Prepare artifact directory
+        run: mkdir -p metrics-artifacts
+
+      - name: Download metrics artifacts
+        id: download
+        uses: actions/github-script@v7
+        env:
+          OUTPUT_DIR: metrics-artifacts
+          CUTOFF_DAYS: 35
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+            const { owner, repo } = context.repo;
+            const outputDir = process.env.OUTPUT_DIR || 'metrics-artifacts';
+            const cutoffDays = Number(process.env.CUTOFF_DAYS || 35);
+            const cutoff = new Date(Date.now() - cutoffDays * 24 * 60 * 60 * 1000);
+            const patterns = [/keepalive-metrics/i, /agents-autofix-metrics/i, /autofix-metrics/i, /verifier-metrics/i];
+
+            fs.mkdirSync(outputDir, { recursive: true });
+            let page = 1;
+            const perPage = 100;
+            const matches = [];
+
+            while (true) {
+              const { data } = await github.rest.actions.listArtifactsForRepo({
+                owner,
+                repo,
+                per_page: perPage,
+                page,
+              });
+
+              const artifacts = data.artifacts || [];
+              for (const artifact of artifacts) {
+                const created = new Date(artifact.created_at);
+                if (artifact.expired) continue;
+                if (created < cutoff) continue;
+                if (!patterns.some((re) => re.test(artifact.name))) continue;
+                matches.push(artifact);
+              }
+
+              if (artifacts.length < perPage) break;
+              page += 1;
+              if (page > 10) break;
+            }
+
+            for (const artifact of matches) {
+              const response = await github.rest.actions.downloadArtifact({
+                owner,
+                repo,
+                artifact_id: artifact.id,
+                archive_format: 'zip',
+              });
+              const dest = path.join(outputDir, `${artifact.name}-${artifact.id}.zip`);
+              fs.writeFileSync(dest, Buffer.from(response.data));
+              core.info(`Downloaded ${artifact.name} to ${dest}`);
+            }
+
+            core.setOutput('artifact_count', matches.length);
+
+      - name: Extract artifacts
+        if: steps.download.outputs.artifact_count != '0'
+        run: |
+          mkdir -p metrics-artifacts/extracted
+          shopt -s nullglob
+          for archive in metrics-artifacts/*.zip; do  # one subdir per archive: every run ships the same .ndjson file name
+            unzip -o "$archive" -d "metrics-artifacts/extracted/$(basename "$archive" .zip)"
+          done
+
+      - name: Aggregate metrics
+        id: aggregate
+        run: |
+          set -euo pipefail
+          python scripts/aggregate_agent_metrics.py --input metrics-artifacts --output metrics-summary.md --recent-days 35
+
+      - name: Publish summary to issue
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+            const summaryPath = 'metrics-summary.md';
+            const body = fs.existsSync(summaryPath)
+              ? fs.readFileSync(summaryPath, 'utf-8')
+              : 'No metrics available for this period.';
+            const issueNumber = 93;
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: issueNumber,
+              body,
+            });
+
+      - name: Upload weekly summary artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: agents-metrics-summary
+          path: metrics-summary.md
+          if-no-files-found: warn
+          retention-days: 45
+
+      - name: Append to job summary
+        if: always()
+        run: |
+          if [ -f metrics-summary.md ]; then
+            cat metrics-summary.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "No metrics summary generated." >> "$GITHUB_STEP_SUMMARY"
+          fi
diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py
new file mode 100644
index 000000000..b3897e66a
--- /dev/null
+++ b/scripts/aggregate_agent_metrics.py
@@ -0,0 +1,274 @@
+"""
+Aggregate agent workflow metrics from NDJSON artifacts and emit a Markdown summary.
+
+Usage:
+    python scripts/aggregate_agent_metrics.py --input artifacts/ --output summary.md
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from collections import Counter, defaultdict
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--input",
+        "-i",
+        action="append",
+        default=[],
+        help="File or directory containing NDJSON metrics (can be passed multiple times).",
+    )
+    parser.add_argument(
+        "--recent-days",
+        type=int,
+        default=35,
+        help="Only include metrics recorded in the last N days (default: 35).",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="",
+        help="Optional file path to write the Markdown summary.",
+    )
+    return parser.parse_args()
+
+
+def iter_ndjson(paths: Iterable[Path]) -> Iterable[Tuple[Dict, Path]]:
+    for path in paths:
+        if path.is_dir():
+            yield from iter_ndjson(path.rglob("*.ndjson"))
+            continue
+        if path.suffix.lower() != ".ndjson":
+            continue
+        try:
+            for line in path.read_text(encoding="utf-8").splitlines():
+                if not line.strip():
+                    continue
+                try:
+                    yield json.loads(line), path
+                except json.JSONDecodeError:
+                    continue
+        except FileNotFoundError:
+            continue
+
+
+def _parse_datetime(value: str):
+    try:
+        return datetime.fromisoformat(value.replace("Z", "+00:00"))
+    except Exception:
+        return None
+
+
+def load_metrics(paths: Iterable[Path], recent_days: int):
+    cutoff = datetime.now(timezone.utc) - timedelta(days=recent_days)
+    keepalive, autofix, verifier = [], [], []
+    for record, source in iter_ndjson(paths):
+        if not isinstance(record, dict):
+            continue
+        recorded_at = _parse_datetime(str(record.get("recorded_at", ""))) or cutoff
+        if recorded_at < cutoff:
+            continue
+        # Try to infer record type
+        if "iteration_count" in record:
+            keepalive.append(record)
+        elif "attempt_number" in record:
+            autofix.append(record)
+        elif "verdict" in record:
+            verifier.append(record)
+        else:
+            # Fallback to filename hint
+            name = source.name.lower()
+            if "keepalive" in name:
+                keepalive.append(record)
+            elif "autofix" in name:
+                autofix.append(record)
+            elif "verifier" in name:
+                verifier.append(record)
+    return keepalive, autofix, verifier
+
+
+def summarise_keepalive(records: List[Dict]) -> Dict[str, object]:
+    if not records:
+        return {
+            "count": 0,
+            "pr_count": 0,
+            "avg_iterations": 0.0,
+            "completion_pct": 0.0,
+            "top_stop_reasons": [],
+        }
+
+    stop_reasons = Counter()
+    iterations: List[int] = []
+    pr_counts = Counter()
+    completed_prs = set()
+
+    for rec in records:
+        pr = int(rec.get("pr_number") or 0)
+        pr_counts[pr] += 1
+        iterations.append(int(rec.get("iteration_count") or 0))
+        stop = str(rec.get("stop_reason", "")).lower()
+        if stop:
+            stop_reasons[stop] += 1
+        tasks_total = int(rec.get("tasks_total") or 0)
+        tasks_completed = int(rec.get("tasks_completed") or 0)
+
+        if "complete" in stop or (tasks_total > 0 and tasks_completed >= tasks_total):
+            completed_prs.add(pr)
+
+    avg_iterations = statistics.mean(iterations) if iterations else 0.0
+    pr_total = len(pr_counts)
+    completion_pct = (len(completed_prs) / pr_total * 100.0) if pr_total else 0.0
+    top_stop_reasons = stop_reasons.most_common(5)
+
+    return {
+        "count": len(records),
+        "pr_count": pr_total,
+        "avg_iterations": round(avg_iterations, 2),
+        "completion_pct": round(completion_pct, 2),
+        "top_stop_reasons": top_stop_reasons,
+    }
+
+
+def summarise_autofix(records: List[Dict]) -> Dict[str, object]:
+    if not records:
+        return {
+            "count": 0,
+            "pr_count": 0,
+            "success_pct": 0.0,
+            "avg_attempts": 0.0,
+        }
+    pr_attempts: defaultdict[int, List[int]] = defaultdict(list)
+    success = 0
+    for rec in records:
+        pr = int(rec.get("pr_number") or 0)
+        attempt = int(rec.get("attempt_number") or 0)
+        pr_attempts[pr].append(attempt)
+        if rec.get("fix_applied"):
+            success += 1
+    avg_attempts = (
+        statistics.mean(max(v) for v in pr_attempts.values()) if pr_attempts else 0.0
+    )
+    success_pct = (success / len(records) * 100.0) if records else 0.0
+    return {
+        "count": len(records),
+        "pr_count": len(pr_attempts),
+        "success_pct": round(success_pct, 2),
+        "avg_attempts": round(avg_attempts, 2),
+    }
+
+
+def summarise_verifier(records: List[Dict]) -> Dict[str, object]:
+    if not records:
+        return {
+            "count": 0,
+            "pass_pct": 0.0,
+            "avg_checks": 0.0,
+            "issues_created": 0,
+        }
+    verdicts = Counter()
+    checks: List[int] = []
+    issues_created = 0
+    for rec in records:
+        verdict = str(rec.get("verdict", "unknown")).lower()
+        verdicts[verdict] += 1
+        checks.append(int(rec.get("checks_run") or 0))
+        issues_created += int(rec.get("issues_created") or 0)
+
+    pass_total = verdicts.get("pass", 0)
+    total = sum(verdicts.values())
+    pass_pct = (pass_total / total * 100.0) if total else 0.0
+    avg_checks = statistics.mean(checks) if checks else 0.0
+    return {
+        "count": total,
+        "pass_pct": round(pass_pct, 2),
+        "avg_checks": round(avg_checks, 2),
+        "issues_created": issues_created,
+        "verdicts": verdicts.most_common(),
+    }
+
+
+def build_markdown(
+    keepalive_summary: Dict[str, object],
+    autofix_summary: Dict[str, object],
+    verifier_summary: Dict[str, object],
+    recent_days: int,
+) -> str:
+    lines = [
+        "# Weekly agent metrics summary",
+        "",
+        f"_Covers metrics from the last **{recent_days} days**._",
+        "",
+        "## Keepalive loop",
+        f"- Records analyzed: **{keepalive_summary['count']}** across **{keepalive_summary['pr_count']} PRs**",
+        f"- Average iterations per record: **{keepalive_summary['avg_iterations']}**",
+        f"- PRs completed via keepalive: **{keepalive_summary['completion_pct']}%**",
+    ]
+
+    top_stops = keepalive_summary.get("top_stop_reasons") or []
+    if top_stops:
+        lines.append("- Top stop reasons:")
+        for reason, count in top_stops:
+            lines.append(f"  - `{reason or 'unknown'}` — {count}")
+
+    lines += [
+        "",
+        "## Autofix loop",
+        f"- Records analyzed: **{autofix_summary['count']}** across **{autofix_summary['pr_count']} PRs**",
+        f"- Fix applied success rate: **{autofix_summary['success_pct']}%**",
+        f"- Average attempts per PR (max): **{autofix_summary['avg_attempts']}**",
+        "",
+        "## Verifier",
+        f"- Records analyzed: **{verifier_summary['count']}**",
+        f"- Pass rate: **{verifier_summary['pass_pct']}%**",
+        f"- Average checks run: **{verifier_summary['avg_checks']}**",
+        f"- Issues created: **{verifier_summary['issues_created']}**",
+    ]
+
+    verdicts = verifier_summary.get("verdicts") or []
+    if verdicts:
+        lines.append("- Verdict distribution:")
+        for verdict, count in verdicts:
+            lines.append(f"  - `{verdict}` — {count}")
+
+    lines.append("")
+    lines.append("## Key question: keepalive completion rate")
+    lines.append(
+        f"- Percentage of PRs completing via keepalive without human intervention: **{keepalive_summary['completion_pct']}%**"
+    )
+    return "\n".join(lines) + "\n"
+
+
+def main():
+    args = parse_args()
+    inputs = [Path(p) for p in (args.input or [])]
+    if not inputs:
+        inputs = [Path("metrics-artifacts"), Path("artifacts"), Path(".")]
+
+    keepalive_records, autofix_records, verifier_records = load_metrics(
+        inputs, args.recent_days
+    )
+
+    keepalive_summary = summarise_keepalive(keepalive_records)
+    autofix_summary = summarise_autofix(autofix_records)
+    verifier_summary = summarise_verifier(verifier_records)
+
+    markdown = build_markdown(
+        keepalive_summary, autofix_summary, verifier_summary, args.recent_days
+    )
+    print(markdown)
+
+    output_path = Path(args.output) if args.output else None
+    if output_path:
+        output_path.write_text(markdown, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()

From d96ca89fd3df2ffb2ef820115e885e3bc044e79c Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 23:50:59 -0600
Subject: [PATCH 6/9] Update .github/workflows/agents-metrics-weekly.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/workflows/agents-metrics-weekly.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/agents-metrics-weekly.yml b/.github/workflows/agents-metrics-weekly.yml
index b4739f624..b906da3d4 100644
--- a/.github/workflows/agents-metrics-weekly.yml
+++ b/.github/workflows/agents-metrics-weekly.yml
@@ -95,6 +95,7 @@ jobs:
 
       - name: Aggregate metrics
         id: aggregate
+        if: steps.download.outputs.artifact_count != '0'
         run: |
           set -euo pipefail
           python scripts/aggregate_agent_metrics.py --input metrics-artifacts --output metrics-summary.md --recent-days 35

From 30099c7df26db66e1236725b31a581a9b26f18d2 Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 23:51:10 -0600
Subject: [PATCH 7/9] Update scripts/aggregate_agent_metrics.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 scripts/aggregate_agent_metrics.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py
index b3897e66a..829ad0eef 100644
--- a/scripts/aggregate_agent_metrics.py
+++ b/scripts/aggregate_agent_metrics.py
@@ -148,9 +148,18 @@ def summarise_autofix(records: List[Dict]) -> Dict[str, object]:
     pr_attempts: defaultdict[int, List[int]] = defaultdict(list)
     success = 0
     for rec in records:
-        pr = int(rec.get("pr_number") or 0)
+        pr_raw = rec.get("pr_number")
+        pr: int | None = None
+        if pr_raw is not None:
+            try:
+                pr_int = int(pr_raw)
+                if pr_int > 0:
+                    pr = pr_int
+            except (TypeError, ValueError):
+                pr = None
         attempt = int(rec.get("attempt_number") or 0)
-        pr_attempts[pr].append(attempt)
+        if pr is not None:
+            pr_attempts[pr].append(attempt)
         if rec.get("fix_applied"):
             success += 1
     avg_attempts = (

From 8077985efca5108cd61b61b7d6911dee7a4aa28c Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 23:51:20 -0600
Subject: [PATCH 8/9] Update scripts/aggregate_agent_metrics.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 scripts/aggregate_agent_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py
index 829ad0eef..aef46dd8e 100644
--- a/scripts/aggregate_agent_metrics.py
+++ b/scripts/aggregate_agent_metrics.py
@@ -63,7 +63,7 @@ def iter_ndjson(paths: Iterable[Path]) -> Iterable[Tuple[Dict, Path]]:
 def _parse_datetime(value: str):
     try:
         return datetime.fromisoformat(value.replace("Z", "+00:00"))
-    except Exception:
+    except (AttributeError, TypeError, ValueError):  # non-str or malformed value
         return None
 
 

From 3dfafeabfdd1c0650f489700808b33f9bdd3eee4 Mon Sep 17 00:00:00 2001
From: stranske <stranske@gmail.com>
Date: Wed, 24 Dec 2025 23:51:42 -0600
Subject: [PATCH 9/9] Update scripts/aggregate_agent_metrics.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 scripts/aggregate_agent_metrics.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/scripts/aggregate_agent_metrics.py b/scripts/aggregate_agent_metrics.py
index aef46dd8e..90df41c30 100644
--- a/scripts/aggregate_agent_metrics.py
+++ b/scripts/aggregate_agent_metrics.py
@@ -148,24 +148,31 @@ def summarise_autofix(records: List[Dict]) -> Dict[str, object]:
     pr_attempts: defaultdict[int, List[int]] = defaultdict(list)
     success = 0
+    # PRs with at least one applied fix; a set so repeat attempts count once.
+    successful_prs: set[int] = set()
     for rec in records:
         pr_raw = rec.get("pr_number")
         pr: int | None = None
         if pr_raw is not None:
             try:
                 pr_int = int(pr_raw)
                 if pr_int > 0:
                     pr = pr_int
             except (TypeError, ValueError):
                 pr = None
         attempt = int(rec.get("attempt_number") or 0)
-        if pr is not None:
-            pr_attempts[pr].append(attempt)
-        if rec.get("fix_applied"):
-            success += 1
+        if pr is None:
+            # No usable PR number: the record cannot be attributed to a PR.
+            continue
+        pr_attempts[pr].append(attempt)
+        if rec.get("fix_applied"):
+            successful_prs.add(pr)
     avg_attempts = (
         statistics.mean(max(v) for v in pr_attempts.values()) if pr_attempts else 0.0
     )
-    success_pct = (success / len(records) * 100.0) if records else 0.0
+    # Success rate is per PR (not per record), matching the pr_count denominator.
+    success_pct = (
+        (len(successful_prs) / len(pr_attempts) * 100.0) if pr_attempts else 0.0
+    )
     return {
         "count": len(records),
         "pr_count": len(pr_attempts),