From 22155f792872c843235d5e3fe2a782773cd353ab Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Fri, 20 Feb 2026 19:33:35 +0000 Subject: [PATCH] t1276: subtask-aware queue analysis, fix auto-pickup head-50 limit, orphan issue intake - Fix auto-pickup Strategy 4 head -50 limit that silently skipped subtask inheritance for parents beyond the 50th (only 2 of 242 parents have open subtasks, but they were beyond position 50). Now scans only open parents with no arbitrary limit. - Enhance runners-check to report subtask-aware queue depth: total open items (parents + subtasks), dispatchable count (tagged + inherited from parent), blocked count, and claimed/in-progress count. - Add TODO entries for 3 orphan user-reported GitHub issues: GH#1791 (setup.sh unbound variable), GH#1975 (skill discovery CLI), GH#1991 (wp-helper.sh CONFIG_FILE bug). - Re-add completed task entries for t1260, t1261, t1273 with pr: refs so issue-sync can close their stale open GitHub issues. --- .agents/scripts/commands/runners-check.md | 55 +++++++++++++++++++++-- .agents/scripts/supervisor/cron.sh | 8 ++-- TODO.md | 7 +++ 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/.agents/scripts/commands/runners-check.md b/.agents/scripts/commands/runners-check.md index 69e8dcf679..a377833fcd 100644 --- a/.agents/scripts/commands/runners-check.md +++ b/.agents/scripts/commands/runners-check.md @@ -16,14 +16,53 @@ Run these commands in parallel and present a unified report: # 1. Active batch status ~/.aidevops/agents/scripts/supervisor-helper.sh status 2>&1 -# 2. Open PRs from workers (need merge/review) +# 2. 
# TODO.md queue analysis (subtask-aware)
# Count ALL open items including subtasks — subtasks are the actual dispatchable units.

#######################################
# Print a subtask-aware queue summary for a TODO.md-style task list.
#
# Counting rules:
#   - open item    : line matching "- [ ]" at any indentation
#   - parent       : open item at column 0; subtasks are the remainder
#   - tagged       : open "tNNN" item carrying #auto-dispatch that is neither
#                    claimed (assignee:/started:) nor blocked (blocked-by:)
#   - inherited    : indented open "tNNN.M" item without its own #auto-dispatch,
#                    not claimed/blocked, whose OPEN column-0 parent "tNNN" has
#                    the tag (open parents only, matching the auto-pickup rule
#                    for subtask inheritance)
#
# Arguments: $1 - path to the task list file
# Outputs:   five summary lines on stdout; nothing if the file does not exist
# Returns:   0 always
#######################################
todo_queue_report() {
  local todo_file=$1
  [[ -f "$todo_file" ]] || return 0

  # Read the open items once; every count below is derived from this text.
  local open_lines
  open_lines=$(grep -E '^[[:space:]]*- \[ \]' "$todo_file" 2>/dev/null || true)

  # NOTE: `grep -c` prints "0" AND exits 1 when nothing matches, so the common
  # `$(grep -c ... || echo 0)` idiom produces the two-line value "0\n0" and
  # breaks the arithmetic below. `|| true` keeps grep's own single "0";
  # `${var:-0}` covers the case where grep printed nothing at all.
  local total_open parent_open subtask_open
  total_open=$(grep -cE '^[[:space:]]*- \[ \]' <<<"$open_lines" || true)
  parent_open=$(grep -cE '^- \[ \]' <<<"$open_lines" || true)
  total_open=${total_open:-0}
  parent_open=${parent_open:-0}
  subtask_open=$((total_open - parent_open))

  # Dispatchable: open, has #auto-dispatch, not blocked, not claimed.
  # ERE alternation (-E) is used instead of the GNU-only BRE `\|`.
  local dispatchable
  dispatchable=$(
    grep -E '^[[:space:]]*- \[ \] t[0-9]+' <<<"$open_lines" |
      grep -vE 'assignee:|started:|blocked-by:' |
      grep -c '#auto-dispatch' || true
  )
  dispatchable=${dispatchable:-0}

  # Subtasks whose open parent has #auto-dispatch (inherited dispatchability).
  local inherited=0 line parent_id
  local subtask_re='t[0-9]+\.[0-9]+'
  while IFS= read -r line; do
    [[ "$line" =~ $subtask_re ]] || continue
    # "t1264.3" -> "t1264": strip the trailing ".N" from the first ID on the line.
    parent_id=${BASH_REMATCH[0]%.*}
    # Skip claimed or blocked subtasks.
    if [[ "$line" == *assignee:* || "$line" == *started:* || "$line" == *blocked-by:* ]]; then
      continue
    fi
    if grep -qE "^- \[ \] ${parent_id} .*#auto-dispatch" "$todo_file" 2>/dev/null; then
      inherited=$((inherited + 1))
    fi
  done < <(grep -E '^[[:space:]]+- \[ \] t[0-9]+\.[0-9]+' <<<"$open_lines" | grep -v '#auto-dispatch')

  local total_dispatchable=$((dispatchable + inherited))

  local blocked claimed
  blocked=$(grep -c 'blocked-by:' <<<"$open_lines" || true)
  claimed=$(grep -cE 'assignee:|started:' <<<"$open_lines" || true)
  blocked=${blocked:-0}
  claimed=${claimed:-0}

  echo "=== TODO.md Queue ==="
  echo "Total open: $total_open ($parent_open parents, $subtask_open subtasks)"
  echo "Dispatchable: $total_dispatchable (tagged: $dispatchable, inherited: $inherited)"
  echo "Blocked: $blocked"
  echo "Claimed/in-progress: $claimed"
  return 0
}

# Resolve TODO.md at the repository root; outside a git repo the path will not
# exist and the report is silently skipped.
TODO_FILE="$(git rev-parse --show-toplevel 2>/dev/null)/TODO.md"
todo_queue_report "$TODO_FILE"

# 3.
Open PRs from workers (need merge/review) gh pr list --state open --json number,title,headRefName,createdAt,statusCheckRollup \ --jq '.[] | "\(.number) [\(.headRefName)] \(.title) checks:\(.statusCheckRollup | map(.conclusion // .state) | join(","))"' 2>/dev/null -# 3. Active worktrees (worker sessions) +# 4. Active worktrees (worker sessions) git worktree list 2>/dev/null -# 4. System resources +# 5. System resources ~/.aidevops/agents/scripts/supervisor-helper.sh db \ "SELECT id, state, retries FROM tasks WHERE state NOT IN ('deployed','cancelled','failed') ORDER BY state;" 2>/dev/null ``` @@ -32,6 +71,14 @@ git worktree list 2>/dev/null Present results as a concise dashboard: +### Queue Depth (subtask-aware) + +- **Total open**: X (Y parents, Z subtasks) — subtasks are the actual work units +- **Dispatchable now**: N (M tagged #auto-dispatch, K inherited from parent) +- **Blocked**: B (waiting on dependencies) +- **Claimed/in-progress**: C (assigned to workers or interactive sessions) +- Flag if dispatchable count is 0 but open count is high (queue stall) + ### Batch Status - Batch name, total/completed/queued/running/failed counts @@ -40,11 +87,13 @@ Present results as a concise dashboard: ### Action Items Flag these for the user (most important first): + 1. **PRs ready to merge** — all CI green, no review comments 2. **PRs with CI failures** — need investigation 3. **Tasks stuck** — in retrying/evaluating too long 4. **Tasks at max retries** — need manual intervention or re-queue 5. **Stale worktrees** — for tasks already deployed/merged +6. **Subtasks missing #auto-dispatch** — parent has tag but subtasks don't (dispatch gap) ### System Health diff --git a/.agents/scripts/supervisor/cron.sh b/.agents/scripts/supervisor/cron.sh index 7ddeb7ecfe..f9b2295bfa 100755 --- a/.agents/scripts/supervisor/cron.sh +++ b/.agents/scripts/supervisor/cron.sh @@ -743,10 +743,12 @@ cmd_auto_pickup() { # from parent when the subtask has no explicit model: field. 
# This unblocks subtask trees like t1081.1-t1081.4 and t1082.1-t1082.4. - # Step 1: Collect parent task IDs that have #auto-dispatch + # Step 1: Collect OPEN parent task IDs that have #auto-dispatch (t1276) + # Only open parents ([ ]) — completed parents' subtasks are irrelevant. + # Previous head -50 limit caused parents beyond the 50th to be silently skipped. local parent_ids - parent_ids=$(grep -E '^[[:space:]]*- \[[ xX-]\] (t[0-9]+) .*#auto-dispatch' "$todo_file" 2>/dev/null | - grep -oE 't[0-9]+' | head -50 | sort -u || true) + parent_ids=$(grep -E '^[[:space:]]*- \[ \] (t[0-9]+) .*#auto-dispatch' "$todo_file" 2>/dev/null | + grep -oE 't[0-9]+' | sort -u || true) if [[ -n "$parent_ids" ]]; then while IFS= read -r parent_id; do diff --git a/TODO.md b/TODO.md index ef3fa25e43..fa359cf363 100644 --- a/TODO.md +++ b/TODO.md @@ -1801,6 +1801,8 @@ t019.3.4,Update AGENTS.md with Beads integration docs,,beads,1h,45m,2025-12-21T1 - [ ] t1257 Add sequential dependency enforcement for t1120 subtask chain #bugfix #auto-dispatch #self-improvement ~15m model:haiku category:process — t1120.1, t1120.2, and t1120.4 are all eligible for auto-dispatch but have an implicit sequential dependency (extract functions → add adapter → test). Without explicit blocked-by fields, they could be dispatched simultaneously and t1120.2/t1120.4 would fail because t1120.1 hasn't landed yet. Add blocked-by:t1120.1 to t1120.2 and blocked-by:t1120.2 to t1120.4 in TODO.md to prevent wasted dispatch cycles. ref:GH#1964 - [x] t1258 Investigate high volume of stale evaluating recovery events #bugfix #auto-dispatch #self-improvement ~2h model:sonnet category:reliability — 15 of 15 recently completed tasks in the last 24h show 'Stale evaluating recovery (Phase 0.7)' notes. This suggests tasks are routinely getting stuck in 'evaluating' state and requiring Phase 0.7 recovery rather than completing normally. Root cause investigation needed: is the evaluation step timing out? 
Is there a race condition between worker completion and evaluation? The recovery mechanism works but shouldn't be the primary completion path. Check supervisor-helper.sh evaluate flow, worker sentinel timing, and whether evaluation is blocking on external calls (GitHub API rate limits, PR merge checks). assignee:marcusquinn started:2026-02-19T13:35:18Z ref:GH#1965 pr:#1966 completed:2026-02-19 - [x] t1259 Investigate and mitigate stale evaluating recovery pattern across all recent completions #bugfix #auto-dispatch #self-improvement ~2h model:sonnet category:reliability — All 15 recently completed tasks in the last 24h show 'Stale evaluating recovery (Phase 0.7)' in their notes. While tasks are completing successfully, this indicates workers are consistently entering a stale state that requires recovery intervention. Root cause analysis needed: are workers failing to report completion, is the evaluation phase timing out, or is there a race condition in the pulse cycle? Fixing this would reduce unnecessary recovery cycles and improve dispatch latency. ref:GH#1967 assignee:marcusquinn pr:#1968 completed:2026-02-19 +- [x] t1260 Fix setup.sh to install launchd schedulers on macOS for all users #bugfix ~1h model:sonnet ref:GH#1970 pr:#1971 completed:2026-02-19 +- [x] t1261 Fix dispatch stall: orphaned DB tasks, grep syntax error, create_task commit verification #bugfix ~1h model:sonnet ref:GH#1973 pr:#1974 completed:2026-02-19 - [x] t1262 Fix symlink SCRIPT_DIR resolution for launchd agents #bugfix ~30m model:sonnet category:reliability — supervisor-helper.sh and auto-update-helper.sh use BASH_SOURCE[0] to determine SCRIPT_DIR, but when invoked via symlinks in ~/.aidevops/bin/ (created by t1260 launchd setup), BASH_SOURCE resolves to the symlink path not the target. This causes shared-constants.sh sourcing to fail, breaking both launchd agents (exit code 1). Fix: resolve symlinks before dirname. 
ref:GH#1976 pr:#1977 completed:2026-02-19 - [ ] t1264 Daily repo sync: auto-pull latest for git repos in configured parent directories #feature #auto-dispatch ~4h model:sonnet category:automation ref:GH#1984 — New `repo-sync-helper.sh` script (follows auto-update-helper.sh pattern) that runs daily via launchd/cron. Scans configured parent directories (default: `~/Git/`) for git repos cloned from a remote, and runs `git pull --ff-only` on repos where the working tree is clean and on the default branch (main/master). Config: add `git_parent_dirs` array to `~/.config/aidevops/repos.json`. CLI: `aidevops repo-sync [enable|disable|status|check|logs]`. Setup.sh offers to enable during install (like auto-update and supervisor pulse). Onboarding asks user to specify parent directories. Safety: only ff-only pulls, skip dirty working trees, skip non-default branches, log failures without stopping. Worktrees are irrelevant — only the main checkout matters. - [x] t1264.1 Add `git_parent_dirs` config to repos.json — extend `init_repos_file()` in aidevops.sh to include `git_parent_dirs: ["~/Git"]` default. Add `aidevops repo-sync dirs [add|remove|list]` subcommand to manage the list. #auto-dispatch ~30m model:sonnet ref:GH#1985 pr:#1997 completed:2026-02-20 @@ -1816,5 +1818,10 @@ t019.3.4,Update AGENTS.md with Beads integration docs,,beads,1h,45m,2025-12-21T1 - [x] t1269 Fix stuck evaluating tasks: crash-resilient evaluation with immediate rollback on pulse death — Phase 1 evaluation runs inline in the pulse loop. If the pulse process dies mid-evaluation (SIGTERM, OOM, cron overlap), the task is stranded in 'evaluating' with no live process. Phase 0.7 catches these but only after 120-600s grace periods, wasting concurrency slots. 205 stale recovery events logged: eval_process_died (36), worker_failed_before_eval (34), pulse_killed_after_pr_persist (33), worker_oom_killed (24). 
Fix: (1) add SIGTERM/SIGINT trap in the Phase 1 evaluation loop that rolls back any in-progress evaluation to 'running' state on pulse death, (2) reduce evaluating grace period for tasks with no heartbeat activity, (3) add self-improvement memory so supervisor learns to prevent this pattern. #bugfix #supervisor #self-improvement ~2h model:sonnet ref:GH#2001 assignee:marcusquinn started:2026-02-20T04:15:00Z logged:2026-02-20 pr:#2002 completed:2026-02-20 - [x] t1268 issue-sync: auto-detect plan references from PLANS.md `Task:` field and compose rich issue bodies #enhancement #issue-sync #auto-dispatch ~3h model:sonnet ref:GH#1999 assignee:marcusquinn started:2026-02-20T03:30:46Z logged:2026-02-20 — `compose_issue_body()` in `issue-sync-helper.sh` already supports pulling plan context (Purpose, Progress, Decisions, Discoveries) into GitHub issue descriptions, but only when TODO.md tasks contain the explicit `→ [todo/PLANS.md#anchor]` link format. No user actually adds these manually. Fix: (1) In `compose_issue_body()`, when `plan_link` is empty, scan PLANS.md for plans whose `**Task:**` field matches the current task ID (e.g., `**Task:** t004`). This is how awardsapp's PLANS.md already cross-references — `p001` has `**Task:** t004`, `p002` has `**Task:** t007`. (2) When a matching plan is found, extract the full plan section and include Purpose, Research Links, Architecture Decision, Tool Matrix, Linkage table, Progress, and Decisions in the issue body — not just the 4 current subsections. (3) For subtasks (e.g., t004.2), walk up to the parent task ID (t004) and find its plan. (4) Include the plan ID (p001, p002) in the issue metadata header. (5) For the `enrich` command, re-compose bodies for all issues that currently lack plan context. This ensures every user who writes plans in PLANS.md with `**Task:**` cross-references gets rich issue descriptions automatically — no manual `→ [...]` linking required. 
pr:#2000 completed:2026-02-20 +- [x] t1273 Supervisor sanity-check for self-healing queue stalls #enhancement ~2h model:sonnet ref:GH#2014 pr:#2015 completed:2026-02-20 - [x] t1274 Resolve t1200 merge conflict and retry dispatch #bugfix #auto-dispatch ~30m model:sonnet — Task t1200 (IP reputation check agent) failed with failure_mode:LOGIC and outcome 'blocked:merge_conflict' at sonnet tier. The worker completed work but couldn't merge. Steps: (1) Check if a PR exists for t1200 with merge conflicts. (2) Resolve the conflicts. (3) Re-dispatch or manually complete the merge. This is a completed feature blocked only by git state, not by code quality. assignee:marcusquinn started:2026-02-20T18:23:51Z ref:GH#2019 pr:#2020 completed:2026-02-20 - [x] t1275 KIRA-informed completion discipline: universal behavioural corrections across all autonomy levels #enhancement ~2h ref:GH#2021 — Based on KIRA/terminal-bench research (Kangwook Lee), add completion discipline to build.txt (universal), AGENTS.md (dev lifecycle), dispatch.sh worker prompt (headless), ralph-loop.md (loop agents), and full-loop.md (full-loop sessions). Addresses: false completion, bad self-evaluation, poor replanning, narrow solutions. assignee:marcusquinn started:2026-02-20T19:00:00Z pr:#2022 completed:2026-02-20 +- [ ] t1276 Subtask-aware queue analysis and orphan issue intake #bugfix #enhancement #auto-dispatch ~1h model:sonnet ref:GH#2024 — Fix auto-pickup Strategy 4 head -50 limit that silently skipped subtask inheritance for parents beyond the 50th. Fix runners-check to report subtask-aware queue depth (total open, dispatchable, blocked, claimed). Add TODO entries for 3 orphan user-reported GitHub issues. Close 3 stale GH issues with merged PRs. 
assignee:marcusquinn started:2026-02-20T19:20:00Z +- [ ] t1277 Fix setup.sh unbound variable extra_args[@] during Bun install on macOS #bugfix #auto-dispatch ~30m model:sonnet ref:GH#1791 — User-reported: setup.sh fails at line 151 with `unbound variable extra_args[@]` when installing Bun. macOS 26.2, installed via npm. Fix: initialize extra_args as empty array before use, or use `${extra_args[@]+"${extra_args[@]}"}` pattern for safe expansion under set -u. +- [ ] t1278 Feature: Interactive Skill Discovery & Management CLI (aidevops skills) #feature #plan ~4h model:sonnet ref:GH#1975 — User-requested: add `aidevops skills` CLI command and companion `@skills` chat agent for conversational discovery, description, and management of installed skills. See GH#1975 for full feature request. +- [ ] t1279 Fix wp-helper.sh run_wp_command() CONFIG_FILE not propagated from get_site_config subshell #bugfix #auto-dispatch ~1h model:sonnet ref:GH#1991 — Two related bugs: (1) CONFIG_FILE set inside get_site_config() command substitution is lost when subshell exits. (2) server_ref resolution fails for the most common code path (single site commands via run_wp_command). See GH#1991 for full analysis.