diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh
index 31691073a..0c8e202f3 100755
--- a/.agents/scripts/supervisor-helper.sh
+++ b/.agents/scripts/supervisor-helper.sh
@@ -7222,6 +7222,49 @@ cmd_pr_lifecycle() {
         stage_timings="${stage_timings}deploying:$((stage_end - stage_start))s,"
     fi
 
+    # Step 4b: Auto-recover stuck deploying state (t222)
+    # If a task is already in 'deploying' (from a prior pulse where the deploy
+    # succeeded but the transition to 'deployed' failed), re-attempt the
+    # transition and housekeeping steps. The deploy itself already completed
+    # successfully — only the state transition was lost.
+    if [[ "$tstatus" == "deploying" ]]; then
+        local stage_start
+        stage_start=$(date +%s)
+
+        log_warn "Task $task_id stuck in deploying state — attempting auto-recovery (t222)"
+
+        if [[ "$dry_run" == "false" ]]; then
+            # Re-run housekeeping that may have been skipped when the prior
+            # transition failed (all non-blocking, best-effort)
+            cleanup_after_merge "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "Worktree cleanup issue for $task_id during recovery (non-blocking)"
+            update_todo_on_complete "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "TODO.md update issue for $task_id during recovery (non-blocking)"
+            populate_verify_queue "$task_id" "$tpr" "$trepo" 2>>"$SUPERVISOR_LOG" || log_warn "Verify queue population issue for $task_id during recovery (non-blocking)"
+
+            # Attempt the transition that previously failed
+            if cmd_transition "$task_id" "deployed" 2>>"$SUPERVISOR_LOG"; then
+                log_success "Auto-recovered $task_id: deploying -> deployed (t222)"
+                send_task_notification "$task_id" "deployed" "Auto-recovered from stuck deploying state" 2>>"$SUPERVISOR_LOG" || true
+                store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true
+                write_proof_log --task "$task_id" --event "auto_recover" --stage "deploying" \
+                    --decision "deploying->deployed" --evidence "stuck_state_recovery" \
+                    --maker "pr_lifecycle:t222" 2>/dev/null || true
+            else
+                log_error "Auto-recovery failed for $task_id — transition to deployed rejected"
+                # If the transition itself is invalid, something is deeply wrong.
+                # Transition to failed so the task doesn't stay stuck forever.
+                cmd_transition "$task_id" "failed" --error "Auto-recovery failed: deploying->deployed transition rejected (t222)" 2>>"$SUPERVISOR_LOG" || true
+                send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed" 2>>"$SUPERVISOR_LOG" || true
+            fi
+        else
+            log_info "[dry-run] Would auto-recover $task_id from deploying to deployed"
+        fi
+
+        # t222: Record recovery timing
+        local stage_end
+        stage_end=$(date +%s)
+        stage_timings="${stage_timings}deploying_recovery:$((stage_end - stage_start))s,"
+    fi
+
     # t219: Record total lifecycle timing and log to proof-log
     local lifecycle_end_time
     lifecycle_end_time=$(date +%s)
@@ -7774,6 +7817,39 @@ cmd_pulse() {
         done <<< "$stale_diags"
     fi
 
+    # Phase 4d: Auto-recover stuck deploying tasks (t222)
+    # Tasks can get stuck in 'deploying' if the deploy succeeds but the
+    # transition to 'deployed' fails (e.g., DB write error, process killed
+    # mid-transition). Detect tasks in 'deploying' state for longer than
+    # the deploy timeout and auto-recover them by calling cmd_pr_lifecycle
+    # directly (it handles the deploying state in its Step 4b).
+    local deploying_timeout_seconds="${SUPERVISOR_DEPLOY_TIMEOUT:-600}" # 10 min default
+    # The timeout is interpolated into the SQL below, so accept only plain digits.
+    if [[ ! "$deploying_timeout_seconds" =~ ^[0-9]+$ ]]; then
+        log_warn " Invalid SUPERVISOR_DEPLOY_TIMEOUT '$deploying_timeout_seconds' — using default 600s (t222)"
+        deploying_timeout_seconds=600
+    fi
+    local stuck_deploying stuck_id stuck_updated
+    stuck_deploying=$(db "$SUPERVISOR_DB" "
+        SELECT id, updated_at FROM tasks
+        WHERE status = 'deploying'
+        AND updated_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${deploying_timeout_seconds} seconds');
+    " 2>/dev/null || echo "")
+
+    if [[ -n "$stuck_deploying" ]]; then
+        while IFS='|' read -r stuck_id stuck_updated; do
+            [[ -n "$stuck_id" ]] || continue
+            log_warn " Stuck deploying: $stuck_id (last updated: ${stuck_updated:-unknown}, timeout: ${deploying_timeout_seconds}s) — triggering recovery (t222)"
+            # Recover via cmd_pr_lifecycle (Step 4b handles the deploying state).
+            # If that fails, the deploy outcome is unverified, so mark the task
+            # failed rather than reporting a deployment that may not exist.
+            cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
+                log_error " Recovery failed for stuck deploying task $stuck_id — marking failed"
+                cmd_transition "$stuck_id" "failed" --error "Stuck in deploying, auto-recovery failed (t222)" 2>>"$SUPERVISOR_LOG" || true
+            }
+        done <<< "$stuck_deploying"
+    fi
+
     # Phase 5: Summary
     local total_running
     total_running=$(cmd_running_count "${batch_id:-}")
diff --git a/tests/test-supervisor-state-machine.sh b/tests/test-supervisor-state-machine.sh
index b694b67c4..c2a0cf3f8 100644
--- a/tests/test-supervisor-state-machine.sh
+++ b/tests/test-supervisor-state-machine.sh
@@ -1575,6 +1575,118 @@ fi
 
 rm -rf "$CLAIM_TEST_DIR"
 
+# ============================================================
+# SECTION: Stuck Deploying Auto-Recovery (t222)
+# ============================================================
+section "Stuck Deploying Auto-Recovery (t222)"
+
+# Test: deploying -> deployed transition is valid (prerequisite)
+sup add test-t222a --repo /tmp/test --description "Deploying recovery test" >/dev/null
+sup transition test-t222a dispatched >/dev/null
+sup transition test-t222a running >/dev/null
+sup transition test-t222a evaluating >/dev/null
+sup transition test-t222a complete >/dev/null
+sup transition test-t222a pr_review >/dev/null
+sup transition test-t222a merging >/dev/null
+sup transition test-t222a merged >/dev/null
+sup transition test-t222a deploying >/dev/null
+
+# Verify task is in deploying state
+if [[ "$(get_status test-t222a)" == "deploying" ]]; then
+    pass "Task reaches deploying state correctly"
+else
+    fail "Task should be in deploying state: $(get_status test-t222a)"
+fi
+
+# Simulate recovery: deploying -> deployed
+sup transition test-t222a deployed >/dev/null
+if [[ "$(get_status test-t222a)" == "deployed" ]]; then
+    pass "deploying -> deployed recovery transition succeeds (t222)"
+else
+    fail "deploying -> deployed recovery failed: $(get_status test-t222a)"
+fi
+
+# Test: deploying -> failed is also valid (deploy failure path)
+sup add test-t222b --repo /tmp/test --description "Deploying failure test" >/dev/null
+sup transition test-t222b dispatched >/dev/null
+sup transition test-t222b running >/dev/null
+sup transition test-t222b evaluating >/dev/null
+sup transition test-t222b complete >/dev/null
+sup transition test-t222b pr_review >/dev/null
+sup transition test-t222b merging >/dev/null
+sup transition test-t222b merged >/dev/null
+sup transition test-t222b deploying >/dev/null
+sup transition test-t222b failed --error "Deploy failed during recovery" >/dev/null
+if [[ "$(get_status test-t222b)" == "failed" ]]; then
+    pass "deploying -> failed transition succeeds (deploy failure path)"
+else
+    fail "deploying -> failed transition failed: $(get_status test-t222b)"
+fi
+
+# Test: state_log records deploying recovery transitions
+log_entries=$(test_db "SELECT from_state || '->' || to_state FROM state_log WHERE task_id = 'test-t222a' AND from_state = 'deploying';")
+if echo "$log_entries" | grep -q "deploying->deployed"; then
+    pass "State log records deploying -> deployed recovery (t222)"
+else
+    fail "State log missing deploying -> deployed entry" "Got: $log_entries"
+fi
+
+# Test: cmd_pr_lifecycle handles deploying state via sourced function
+# Create a task stuck in deploying and verify pr_lifecycle recovers it
+sup add test-t222c --repo /tmp/test --description "PR lifecycle deploying recovery" >/dev/null
+sup transition test-t222c dispatched >/dev/null
+sup transition test-t222c running >/dev/null
+sup transition test-t222c evaluating >/dev/null
+sup transition test-t222c complete >/dev/null
+sup transition test-t222c pr_review >/dev/null
+sup transition test-t222c merging >/dev/null
+sup transition test-t222c merged >/dev/null
+sup transition test-t222c deploying >/dev/null
+
+# Run cmd_pr_lifecycle on the stuck task — it should auto-recover
+lifecycle_output=$(bash -c "
+    export AIDEVOPS_SUPERVISOR_DIR='$TEST_DIR'
+    set -- init
+    source '$SUPERVISOR_SCRIPT' >/dev/null
+    cmd_pr_lifecycle 'test-t222c'
+" 2>&1 || true)
+
+recovered_status=$(get_status test-t222c)
+if [[ "$recovered_status" == "deployed" ]]; then
+    pass "cmd_pr_lifecycle auto-recovers stuck deploying -> deployed (t222)"
+else
+    # Also acceptable: failed (if recovery transition was rejected for some reason)
+    if [[ "$recovered_status" == "failed" ]]; then
+        pass "cmd_pr_lifecycle handles stuck deploying (transitioned to failed)"
+    else
+        fail "cmd_pr_lifecycle did not recover stuck deploying task" "Status: $recovered_status, Output: $(echo "$lifecycle_output" | tail -3)"
+    fi
+fi
+
+# Test: invalid transition from deploying (e.g., deploying -> queued)
+sup add test-t222d --repo /tmp/test --description "Invalid deploying transition" >/dev/null
+sup transition test-t222d dispatched >/dev/null
+sup transition test-t222d running >/dev/null
+sup transition test-t222d evaluating >/dev/null
+sup transition test-t222d complete >/dev/null
+sup transition test-t222d pr_review >/dev/null
+sup transition test-t222d merging >/dev/null
+sup transition test-t222d merged >/dev/null
+sup transition test-t222d deploying >/dev/null
+invalid_deploying=$(sup transition test-t222d queued 2>&1 || true)
+if echo "$invalid_deploying" | grep -qi "invalid transition"; then
+    pass "deploying -> queued rejected (invalid transition)"
+else
+    fail "deploying -> queued should be rejected" "$invalid_deploying"
+fi
+
+# Verify state unchanged after invalid transition
+if [[ "$(get_status test-t222d)" == "deploying" ]]; then
+    pass "State unchanged after invalid deploying transition (t222)"
+else
+    fail "State changed despite invalid transition: $(get_status test-t222d)"
+fi
+
 # ============================================================
 # SUMMARY
 # ============================================================