diff --git a/.agents/scripts/supervisor/ai-actions.sh b/.agents/scripts/supervisor/ai-actions.sh
index c3e25ded4..1516d8201 100644
--- a/.agents/scripts/supervisor/ai-actions.sh
+++ b/.agents/scripts/supervisor/ai-actions.sh
@@ -333,10 +333,8 @@ validate_action_fields() {
         echo "missing required field: task_id"
         return 0
       fi
-      if [[ -z "$new_priority" ]]; then
-        echo "missing required field: new_priority"
-        return 0
-      fi
+      # new_priority is no longer strictly required — the executor infers it
+      # from reasoning text if missing (see _exec_adjust_priority)
       ;;
     close_verified)
       local issue_number pr_number
@@ -715,9 +713,25 @@ _exec_adjust_priority() {
 
   local task_id new_priority reasoning
   task_id=$(printf '%s' "$action" | jq -r '.task_id')
-  new_priority=$(printf '%s' "$action" | jq -r '.new_priority')
+  new_priority=$(printf '%s' "$action" | jq -r '.new_priority // empty')
   reasoning=$(printf '%s' "$action" | jq -r '.reasoning // "No reasoning provided"')
 
+  # Infer priority from reasoning if the AI omitted the field (common pattern —
+  # the AI has omitted new_priority in 13+ actions across 5+ cycles)
+  if [[ -z "$new_priority" || "$new_priority" == "null" ]]; then
+    if printf '%s' "$reasoning" | grep -qi 'critical\|urgent\|blocker\|blocking'; then
+      new_priority="critical"
+    elif printf '%s' "$reasoning" | grep -qi 'high\|important\|prioriti'; then
+      new_priority="high"
+    elif printf '%s' "$reasoning" | grep -qi 'low\|minor\|defer'; then
+      new_priority="low"
+    else
+      # Default to high — the AI is recommending a change, usually an escalation
+      new_priority="high"
+    fi
+    log_warn "AI Actions: adjust_priority inferred new_priority='$new_priority' from reasoning (field was missing)"
+  fi
+
   # Find the task's GitHub issue number
   local issue_number=""
   if declare -f find_task_issue_number &>/dev/null; then
diff --git a/.agents/scripts/supervisor/ai-reason.sh b/.agents/scripts/supervisor/ai-reason.sh
index c830b364f..f9e6122ef 100755
--- a/.agents/scripts/supervisor/ai-reason.sh
+++ b/.agents/scripts/supervisor/ai-reason.sh
@@ -282,10 +282,22 @@ ${user_prompt}"
 
   if [[ -z "$action_plan" || "$action_plan" == "null" ]]; then
     log_warn "AI Reasoning: no parseable action plan in response"
+    # Debug diagnostics for intermittent parse failures
+    local response_len json_block_count first_bytes last_bytes
+    response_len=$(printf '%s' "$ai_result" | wc -c | tr -d ' ')
+    # NOTE: grep -c prints "0" itself when nothing matches (and exits 1),
+    # so guard with `|| true` — `|| echo 0` would capture "0\n0".
+    json_block_count=$(printf '%s' "$ai_result" | grep -c '^```json' || true)
+    first_bytes=$(printf '%s' "$ai_result" | head -c 100 | tr '\n' ' ')
+    last_bytes=$(printf '%s' "$ai_result" | tail -c 100 | tr '\n' ' ')
     {
       echo "## Parsing Result"
       echo ""
       echo "Status: FAILED - no parseable JSON action plan"
+      echo ""
+      echo "### Debug Diagnostics"
+      echo "- Response length: $response_len bytes"
+      echo "- \`\`\`json blocks found: $json_block_count"
+      echo "- First 100 bytes: \`$first_bytes\`"
+      echo "- Last 100 bytes: \`$last_bytes\`"
     } >>"$reason_log"
     echo '{"error":"no_action_plan","actions":[]}'
     _release_ai_lock
@@ -517,6 +529,30 @@ extract_action_plan() {
     fi
   fi
 
+  # Try 5: Write response to temp file and parse from file
+  # This handles edge cases where the shell variable may have lost data
+  # (e.g., null bytes, very long lines, or subshell truncation)
+  local tmpfile
+  tmpfile=$(mktemp "${TMPDIR:-/tmp}/ai-parse-XXXXXX")
+  printf '%s' "$response" >"$tmpfile"
+
+  # Try file-based extraction of last ```json block
+  json_block=$(awk '
+    /^```json/ { capture=1; block=""; next }
+    /^```$/ && capture { capture=0; last_block=block; next }
+    capture { block = block (block ? "\n" : "") $0 }
+    END { if (capture && block) print block; else if (last_block) print last_block }
+  ' "$tmpfile")
+  rm -f "$tmpfile"
+
+  if [[ -n "$json_block" ]]; then
+    # jq prints nothing on parse failure (stderr suppressed), so a non-empty
+    # result implies success — no fragile $? check after the assignment.
+    parsed=$(printf '%s' "$json_block" | jq '.' 2>/dev/null)
+    if [[ -n "$parsed" ]]; then
+      printf '%s' "$parsed"
+      return 0
+    fi
+  fi
+
   # Failed to parse
   echo ""
   return 0
diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh
index 13d846194..17449a1ac 100755
--- a/.agents/scripts/supervisor/pulse.sh
+++ b/.agents/scripts/supervisor/pulse.sh
@@ -497,6 +497,65 @@ cmd_pulse() {
     done <<<"$retrying_tasks"
   fi
 
+  # Phase 1c: Auto-reap stuck evaluating tasks (self-healing)
+  # Tasks can get stuck in 'evaluating' when the worker dies but evaluation
+  # fails or times out. Phase 1 handles tasks with dead workers that it finds
+  # in the running_tasks query, but tasks can also get stuck if:
+  # - The evaluation itself crashed (jq error, timeout, etc.)
+  # - The task was left in evaluating from a previous pulse that was killed
+  # This phase catches any evaluating task older than 10 minutes with no
+  # live worker process, and moves it to retrying (or failed once retries are exhausted).
+  local stuck_evaluating
+  stuck_evaluating=$(db -separator '|' "$SUPERVISOR_DB" "
+    SELECT id, updated_at FROM tasks
+    WHERE status = 'evaluating'
+      AND updated_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-10 minutes')
+    ORDER BY updated_at ASC;
+  " 2>/dev/null || echo "")
+
+  if [[ -n "$stuck_evaluating" ]]; then
+    while IFS='|' read -r stuck_id stuck_updated; do
+      [[ -z "$stuck_id" ]] && continue
+
+      # Double-check: is the worker actually dead?
+      local stuck_pid_file="$SUPERVISOR_DIR/pids/${stuck_id}.pid"
+      local stuck_alive=false
+      if [[ -f "$stuck_pid_file" ]]; then
+        local stuck_pid
+        stuck_pid=$(cat "$stuck_pid_file" 2>/dev/null || echo "")
+        if [[ -n "$stuck_pid" ]] && kill -0 "$stuck_pid" 2>/dev/null; then
+          stuck_alive=true
+        fi
+      fi
+
+      if [[ "$stuck_alive" == "true" ]]; then
+        log_info " Phase 1c: $stuck_id evaluating since $stuck_updated but worker still alive — skipping"
+        continue
+      fi
+
+      log_warn " Phase 1c: $stuck_id stuck in evaluating since $stuck_updated (worker dead) — auto-reaping"
+
+      # Check retry count
+      local stuck_retries stuck_max_retries
+      stuck_retries=$(db "$SUPERVISOR_DB" "SELECT retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 0)
+      stuck_max_retries=$(db "$SUPERVISOR_DB" "SELECT max_retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 3)
+
+      if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
+        # Transition to retrying so it gets re-dispatched
+        cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
+        db "$SUPERVISOR_DB" "UPDATE tasks SET retries = retries + 1, updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || true
+        log_info " Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
+      else
+        # Max retries exhausted — mark as failed
+        cmd_transition "$stuck_id" "failed" --error "Auto-reaped: stuck in evaluating >10min, max retries exhausted (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
+        log_warn " Phase 1c: $stuck_id → failed (max retries exhausted)"
+      fi
+
+      # Clean up PID file
+      cleanup_worker_processes "$stuck_id" 2>>"$SUPERVISOR_LOG" || true
+    done <<<"$stuck_evaluating"
+  fi
+
   # Phase 2: Dispatch queued tasks up to concurrency limit
 
   if [[ -n "$batch_id" ]]; then
@@ -555,6 +614,59 @@ cmd_pulse() {
     fi
   fi
 
+  # Phase 2b: Dispatch stall detection and auto-recovery
+  # If there are queued tasks but nothing was dispatched and nothing is running,
+  # the pipeline is stalled. Common causes:
+  # - No active batch (auto-pickup creates batches, but may have failed)
+  # - All tasks stuck in non-dispatchable states (evaluating, blocked)
+  # - Provider unavailable for extended period
+  # - Concurrency limit misconfigured to 0
+  if [[ "$dispatched_count" -eq 0 ]]; then
+    local queued_count running_count
+    queued_count=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM tasks WHERE status = 'queued';" 2>/dev/null || echo 0)
+    running_count=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM tasks WHERE status IN ('running', 'dispatched');" 2>/dev/null || echo 0)
+
+    if [[ "$queued_count" -gt 0 && "$running_count" -eq 0 ]]; then
+      log_warn "Phase 2b: Dispatch stall detected — $queued_count queued, 0 running, 0 dispatched this pulse"
+
+      # Diagnose: is there an active batch?
+      local active_batch_count
+      active_batch_count=$(db "$SUPERVISOR_DB" "
+        SELECT COUNT(*) FROM batches
+        WHERE status IN ('active', 'running');" 2>/dev/null || echo 0)
+
+      if [[ "$active_batch_count" -eq 0 ]]; then
+        log_warn "Phase 2b: No active batch found — queued tasks have no batch to dispatch from"
+        # Auto-recovery: trigger auto-pickup to create a batch
+        # This handles the case where tasks were added to the DB but no batch was created
+        local stall_repos
+        stall_repos=$(db "$SUPERVISOR_DB" "SELECT DISTINCT repo FROM tasks WHERE status = 'queued';" 2>/dev/null || echo "")
+        if [[ -n "$stall_repos" ]]; then
+          while IFS= read -r stall_repo; do
+            [[ -z "$stall_repo" ]] && continue
+            log_info "Phase 2b: Re-running auto-pickup for $stall_repo to create batch"
+            cmd_auto_pickup --repo "$stall_repo" 2>>"$SUPERVISOR_LOG" || true
+          done <<<"$stall_repos"
+        fi
+      else
+        # Batch exists but dispatch failed — log diagnostic info
+        local batch_info
+        batch_info=$(db -separator '|' "$SUPERVISOR_DB" "
+          SELECT id, concurrency, status FROM batches
+          WHERE status IN ('active', 'running')
+          LIMIT 1;" 2>/dev/null || echo "")
+        log_warn "Phase 2b: Active batch exists ($batch_info) but dispatch produced 0 — check concurrency limits and provider health"
+      fi
+
+      # Track stall count in state_log for the AI self-reflection to pick up
+      db "$SUPERVISOR_DB" "
+        INSERT INTO state_log (task_id, from_state, to_state, reason)
+        VALUES ('supervisor', 'dispatch', 'stalled',
+                '$(sql_escape "Dispatch stall: $queued_count queued, 0 running, 0 dispatched. Active batches: $active_batch_count")');
+      " 2>/dev/null || true
+    fi
+  fi
+
   # Phase 2.5: Contest mode — check running contests for completion (t1011)
   # If any contest has all entries complete, evaluate cross-rankings and apply winner
   local contest_helper="${SCRIPT_DIR}/contest-helper.sh"