diff --git a/.agents/scripts/supervisor/ai-context.sh b/.agents/scripts/supervisor/ai-context.sh index 36b7e6f0f..a456f56d2 100755 --- a/.agents/scripts/supervisor/ai-context.sh +++ b/.agents/scripts/supervisor/ai-context.sh @@ -960,6 +960,33 @@ build_health_context() { output+="| Currently blocked | $blocked_count |\n" output+="| Avg retries (7d) | ${avg_retries:-0} |\n" + # Phase 3 throughput metrics (t1336: supervisor self-diagnosis) + # Shows whether the AI lifecycle phase is actually processing tasks. + # Phase 3 was silently broken for days — these metrics make that visible. + local log_file="${SUPERVISOR_LOG:-$HOME/.aidevops/logs/supervisor.log}" + if [[ -f "$log_file" ]]; then + local last_lifecycle + last_lifecycle=$(grep 'ai-lifecycle.*evaluated.*actioned' "$log_file" 2>/dev/null | tail -1 || echo "") + if [[ -n "$last_lifecycle" ]]; then + local p3_eval p3_action + p3_eval=$(echo "$last_lifecycle" | grep -oE 'evaluated [0-9]+' | grep -oE '[0-9]+' || echo "0") + p3_action=$(echo "$last_lifecycle" | grep -oE 'actioned [0-9]+' | grep -oE '[0-9]+' || echo "0") + output+="| Phase 3 last eval | $p3_eval |\n" + output+="| Phase 3 last actioned | $p3_action |\n" + else + output+="| Phase 3 last eval | no data |\n" + output+="| Phase 3 last actioned | no data |\n" + fi + + # Count zero-eval streaks in recent log (last 50 lifecycle entries) + local recent_zeros total_recent + total_recent=$(grep 'ai-lifecycle.*evaluated' "$log_file" 2>/dev/null | tail -50 | wc -l | tr -d ' ') + recent_zeros=$(grep 'ai-lifecycle.*evaluated' "$log_file" 2>/dev/null | tail -50 | grep -c 'evaluated 0' || echo "0") + if [[ "$total_recent" -gt 0 ]]; then + output+="| Phase 3 zero-eval rate (last 50) | ${recent_zeros}/${total_recent} |\n" + fi + fi + printf '%b' "$output" return 0 } diff --git a/.agents/scripts/supervisor/sanity-check.sh b/.agents/scripts/supervisor/sanity-check.sh index d43615728..f8ebf6335 100755 --- a/.agents/scripts/supervisor/sanity-check.sh +++ b/.agents/scripts/supervisor/sanity-check.sh @@ -330,7 +330,150 @@ _build_sanity_state_snapshot() { fi snapshot+="\n" - # 6. Identity context + # 6. Pipeline phase health (t1336) + # Check if critical pipeline phases are actually working by examining + # recent log output. Phase 3 (ai-lifecycle) was silently broken for days + # because gather_task_state referenced a non-existent column — every task + # failed with "could not gather state" but nothing flagged it. + snapshot+="### Pipeline Phase Health\n" + local log_file="${SUPERVISOR_LOG:-$HOME/.aidevops/logs/supervisor.log}" + if [[ -f "$log_file" ]]; then + # Phase 3: check last ai-lifecycle summary line + local last_lifecycle + last_lifecycle=$(grep 'ai-lifecycle.*evaluated.*actioned' "$log_file" 2>/dev/null | tail -1 || echo "") + if [[ -n "$last_lifecycle" ]]; then + local eval_count action_count + eval_count=$(echo "$last_lifecycle" | grep -oE 'evaluated [0-9]+' | grep -oE '[0-9]+' || echo "0") + action_count=$(echo "$last_lifecycle" | grep -oE 'actioned [0-9]+' | grep -oE '[0-9]+' || echo "0") + snapshot+="Phase 3 (ai-lifecycle): last run evaluated=$eval_count actioned=$action_count\n" + if [[ "$eval_count" == "0" ]]; then + # Count how many consecutive 0-evaluated runs + local zero_streak + zero_streak=$(grep -c 'ai-lifecycle.*evaluated 0' "$log_file" 2>/dev/null || echo "0") + snapshot+="WARNING: Phase 3 evaluated 0 tasks ($zero_streak consecutive zero-runs in log)\n" + fi + else + snapshot+="Phase 3 (ai-lifecycle): no recent runs found in log\n" + fi + + # Check for repeated "could not gather state" errors + local gather_failures + gather_failures=$(grep -c 'could not gather state' "$log_file" 2>/dev/null || echo "0") + if [[ "$gather_failures" -gt 10 ]]; then + snapshot+="WARNING: $gather_failures 'could not gather state' errors in log — possible schema drift or query bug\n" + fi + + # Phase 2b: check for stall/underutilisation entries + local stall_count underutil_count + stall_count=$(grep -c 'Dispatch stall detected' "$log_file" 2>/dev/null || echo "0") + underutil_count=$(grep -c 'Concurrency underutilised' "$log_file" 2>/dev/null || echo "0") + if [[ "$stall_count" -gt 5 ]]; then + snapshot+="WARNING: $stall_count dispatch stalls in log\n" + fi + if [[ "$underutil_count" -gt 5 ]]; then + snapshot+="WARNING: $underutil_count concurrency underutilisation events in log\n" + fi + else + snapshot+="(log file not found)\n" + fi + snapshot+="\n" + + # 7. Schema validation (t1336) + # Verify that key queries used by supervisor modules reference columns + # that actually exist in the tasks table. This catches the exact bug where + # gather_task_state referenced worker_pid after it was removed. + snapshot+="### Schema Validation\n" + local actual_columns + actual_columns=$(sqlite3 "$SUPERVISOR_DB" "PRAGMA table_info(tasks);" 2>/dev/null | cut -d'|' -f2 | tr '\n' ',' || echo "") + if [[ -n "$actual_columns" ]]; then + # Check for known query columns that must exist + local required_cols="id status pr_url repo branch worktree error rebase_attempts retries max_retries model session_id" + local missing_cols="" + for col in $required_cols; do + if ! echo ",$actual_columns" | grep -q ",$col,"; then + missing_cols="${missing_cols}${col} " + fi + done + if [[ -n "$missing_cols" ]]; then + snapshot+="CRITICAL: Missing columns in tasks table: $missing_cols\n" + snapshot+="Queries referencing these columns will silently fail!\n" + else + snapshot+="All required columns present in tasks table\n" + fi + else + snapshot+="Could not read tasks schema\n" + fi + snapshot+="\n" + + # 8. Cross-repo issue tag truthfulness (t1336) + # Compare GitHub issue labels (status:*) against actual DB state. + # Catches drift where GH says "status:in-review" but DB says "queued". + snapshot+="### Cross-Repo Issue Tag Truthfulness\n" + local all_repos + all_repos=$(db "$SUPERVISOR_DB" "SELECT DISTINCT repo FROM tasks WHERE repo IS NOT NULL AND repo != '';" 2>/dev/null || echo "") + if [[ -n "$all_repos" ]]; then + while IFS= read -r check_repo; do + [[ -z "$check_repo" || ! -d "$check_repo" ]] && continue + local check_slug + check_slug=$(detect_repo_slug "$check_repo" 2>/dev/null || echo "") + [[ -z "$check_slug" ]] && continue + local repo_name + repo_name=$(basename "$check_repo") + + # Get tasks with both DB status and GH issue + local db_tasks_with_issues + db_tasks_with_issues=$(db -separator '|' "$SUPERVISOR_DB" " + SELECT id, status, issue_url FROM tasks + WHERE repo = '$(sql_escape "$check_repo")' + AND issue_url IS NOT NULL AND issue_url != '' + AND status NOT IN ('cancelled') + ORDER BY id; + " 2>/dev/null || echo "") + + if [[ -z "$db_tasks_with_issues" ]]; then + continue + fi + + local drift_count=0 + while IFS='|' read -r dtid dtstatus dtissue; do + [[ -z "$dtid" || -z "$dtissue" ]] && continue + # Extract issue number + local issue_num + issue_num=$(echo "$dtissue" | grep -oE '[0-9]+$' || echo "") + [[ -z "$issue_num" ]] && continue + + # Get GH labels for this issue (cached per-pulse is fine) + local gh_labels + gh_labels=$(gh issue view "$issue_num" --repo "$check_slug" --json labels --jq '[.labels[].name] | join(",")' 2>/dev/null || echo "") + + # Map DB status to expected GH label + local expected_label="" + case "$dtstatus" in + queued) expected_label="status:queued" ;; + running | dispatched) expected_label="status:claimed" ;; + pr_review | review_triage) expected_label="status:in-review" ;; + blocked) expected_label="status:blocked" ;; + complete | verified | deployed) expected_label="status:in-review" ;; # PR submitted + esac + + if [[ -n "$expected_label" ]] && ! echo "$gh_labels" | grep -q "$expected_label"; then + snapshot+="DRIFT ($repo_name): $dtid DB=$dtstatus but GH#$issue_num labels=[$gh_labels] (expected $expected_label)\n" + drift_count=$((drift_count + 1)) + fi + done <<<"$db_tasks_with_issues" + + if [[ "$drift_count" -eq 0 ]]; then + snapshot+="$repo_name: all issue tags match DB state\n" + else + snapshot+="$repo_name: $drift_count tag drift(s) found\n" + fi + done <<<"$all_repos" + else + snapshot+="(no repos in DB)\n" + fi + snapshot+="\n" + + # 9. Identity context local identity identity=$(get_aidevops_identity 2>/dev/null || whoami) snapshot+="### Context\n" @@ -370,7 +513,13 @@ $state_snapshot 5. **Stale claims**: Tasks with assignee:/started: but no corresponding running worker in the DB or PID files. The claim may be from a dead session. -6. **Any other contradiction** you can identify between the three state sources. +6. **Pipeline phase failures**: Check the "Pipeline Phase Health" section. If Phase 3 evaluated 0 tasks for multiple consecutive runs, or there are many "could not gather state" errors, the pipeline is broken — likely a schema drift or query bug. Recommend "log_only" with a clear diagnosis so a human can investigate. + +7. **Schema drift**: Check the "Schema Validation" section. If any required columns are missing, this is CRITICAL — queries referencing those columns silently return empty results. Recommend "log_only" with urgency. + +8. **Issue tag drift**: Check the "Cross-Repo Issue Tag Truthfulness" section. If GitHub issue labels don't match DB state, recommend "log_only" with the specific drifts so the label sync can be triggered. + +9. **Any other contradiction** you can identify between the state sources. ## Output Format @@ -388,7 +537,8 @@ Example: {"action": "unclaim", "task_id": "t123", "reasoning": "DB says failed but TODO.md has active claim"}, {"action": "cancel_orphan", "task_id": "t456", "reasoning": "In DB as queued but missing from TODO.md"}, {"action": "remove_blocker", "task_id": "t789", "blocker_id": "t456", "reasoning": "Blocker t456 permanently failed (3/3 retries)"}, - {"action": "add_auto_dispatch", "task_id": "t101", "reasoning": "Has model:sonnet, ~2h estimate, no blockers — eligible for dispatch"} + {"action": "add_auto_dispatch", "task_id": "t101", "reasoning": "Has model:sonnet, ~2h estimate, no blockers — eligible for dispatch"}, + {"action": "log_only", "task_id": "pipeline", "message": "Phase 3 evaluated 0 tasks for 5 consecutive runs — possible schema drift in gather_task_state query"} ] If nothing needs fixing, return: []