diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh index 6915c8a75..b29200efd 100755 --- a/.agents/scripts/supervisor-helper.sh +++ b/.agents/scripts/supervisor-helper.sh @@ -8385,12 +8385,36 @@ cmd_pr_lifecycle() { # Populate VERIFY.md queue for post-merge verification (t180.2) populate_verify_queue "$task_id" "$tpr" "$trepo" 2>>"$SUPERVISOR_LOG" || log_warn "Verify queue population issue for $task_id (non-blocking)" - # Final transition - cmd_transition "$task_id" "deployed" || log_warn "Failed to transition $task_id to deployed" - - # Notify (best-effort, suppress errors) - send_task_notification "$task_id" "deployed" "PR merged, deployed, worktree cleaned" 2>>"$SUPERVISOR_LOG" || true - store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true + # t248: Final transition with retry logic (3 attempts: 0s, 1s, 3s) + local deploy_retry_count=0 + local deploy_max_retries=3 + local deploy_succeeded=false + local deploy_error="" + + while [[ "$deploy_retry_count" -lt "$deploy_max_retries" ]]; do + if [[ "$deploy_retry_count" -gt 0 ]]; then + local deploy_backoff=$((2 ** (deploy_retry_count - 1))) + log_info " Transition retry $deploy_retry_count/$deploy_max_retries after ${deploy_backoff}s backoff..." + sleep "$deploy_backoff" + fi + + if deploy_error=$(cmd_transition "$task_id" "deployed" 2>&1); then + deploy_succeeded=true + break + fi + + deploy_retry_count=$((deploy_retry_count + 1)) + log_warn " Transition attempt $deploy_retry_count failed: $deploy_error" + done + + if [[ "$deploy_succeeded" == "true" ]]; then + # Notify (best-effort, suppress errors) + send_task_notification "$task_id" "deployed" "PR merged, deployed, worktree cleaned" 2>>"$SUPERVISOR_LOG" || true + store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true + else + log_error "Failed to transition $task_id to deployed after $deploy_max_retries attempts: $deploy_error" + # Task will remain in 'deploying' and Phase 4b will retry on next pulse + fi else log_info "[dry-run] Would deploy and clean up for $task_id" fi @@ -8401,7 +8425,7 @@ cmd_pr_lifecycle() { stage_timings="${stage_timings}deploying:$((stage_end - stage_start))s," fi - # Step 4b: Auto-recover stuck deploying state (t222) + # Step 4b: Auto-recover stuck deploying state (t222, t248) # If a task is already in 'deploying' (from a prior pulse where the deploy # succeeded but the transition to 'deployed' failed), re-attempt the # transition and housekeeping steps. The deploy itself already completed @@ -8410,7 +8434,7 @@ cmd_pr_lifecycle() { local stage_start stage_start=$(date +%s) - log_warn "Task $task_id stuck in deploying state — attempting auto-recovery (t222)" + log_warn "Task $task_id stuck in deploying state — attempting auto-recovery (t222, t248)" if [[ "$dry_run" == "false" ]]; then # Re-run housekeeping that may have been skipped when the prior @@ -8419,20 +8443,42 @@ cmd_pr_lifecycle() { update_todo_on_complete "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "TODO.md update issue for $task_id during recovery (non-blocking)" populate_verify_queue "$task_id" "$tpr" "$trepo" 2>>"$SUPERVISOR_LOG" || log_warn "Verify queue population issue for $task_id during recovery (non-blocking)" - # Attempt the transition that previously failed - if cmd_transition "$task_id" "deployed" 2>>"$SUPERVISOR_LOG"; then - log_success "Auto-recovered $task_id: deploying -> deployed (t222)" - send_task_notification "$task_id" "deployed" "Auto-recovered from stuck deploying state" 2>>"$SUPERVISOR_LOG" || true + # t248: Retry transition with exponential backoff (3 attempts: 0s, 1s, 3s) + local retry_count=0 + local max_retries=3 + local retry_succeeded=false + local transition_error="" + + while [[ "$retry_count" -lt "$max_retries" ]]; do + if [[ "$retry_count" -gt 0 ]]; then + local backoff_delay=$((2 ** (retry_count - 1))) + log_info " Retry $retry_count/$max_retries after ${backoff_delay}s backoff..." + sleep "$backoff_delay" + fi + + # Capture transition error output for debugging + if transition_error=$(cmd_transition "$task_id" "deployed" 2>&1); then + retry_succeeded=true + break + fi + + retry_count=$((retry_count + 1)) + log_warn " Transition attempt $retry_count failed: $transition_error" + done + + if [[ "$retry_succeeded" == "true" ]]; then + log_success "Auto-recovered $task_id: deploying -> deployed (t222, t248, attempts: $retry_count)" + send_task_notification "$task_id" "deployed" "Auto-recovered from stuck deploying state (attempts: $retry_count)" 2>>"$SUPERVISOR_LOG" || true store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true write_proof_log --task "$task_id" --event "auto_recover" --stage "deploying" \ - --decision "deploying->deployed" --evidence "stuck_state_recovery" \ - --maker "pr_lifecycle:t222" 2>/dev/null || true + --decision "deploying->deployed" --evidence "stuck_state_recovery,retries:$retry_count" \ + --maker "pr_lifecycle:t222:t248" 2>/dev/null || true else - log_error "Auto-recovery failed for $task_id — transition to deployed rejected" - # If the transition itself is invalid, something is deeply wrong. + log_error "Auto-recovery failed for $task_id after $max_retries attempts — last error: $transition_error" + # If the transition itself is invalid after retries, something is deeply wrong. # Transition to failed so the task doesn't stay stuck forever. - cmd_transition "$task_id" "failed" --error "Auto-recovery failed: deploying->deployed transition rejected (t222)" 2>>"$SUPERVISOR_LOG" || true - send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed" 2>>"$SUPERVISOR_LOG" || true + cmd_transition "$task_id" "failed" --error "Auto-recovery failed after $max_retries attempts: $transition_error (t222, t248)" 2>>"$SUPERVISOR_LOG" || true + send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed after $max_retries attempts" 2>>"$SUPERVISOR_LOG" || true fi else log_info "[dry-run] Would auto-recover $task_id from deploying to deployed" @@ -9048,13 +9094,14 @@ cmd_pulse() { done <<< "$stale_diags" fi - # Phase 4d: Auto-recover stuck deploying tasks (t222) + # Phase 4d: Auto-recover stuck deploying tasks (t222, t248) # Tasks can get stuck in 'deploying' if the deploy succeeds but the # transition to 'deployed' fails (e.g., DB write error, process killed # mid-transition). Detect tasks in 'deploying' state for longer than # the deploy timeout and auto-recover them via process_post_pr_lifecycle # (which now handles the deploying state in Step 4b of cmd_pr_lifecycle). - local deploying_timeout_seconds="${SUPERVISOR_DEPLOY_TIMEOUT:-600}" # 10 min default + # t248: Reduced from 600s (10min) to 120s (2min) for faster recovery + local deploying_timeout_seconds="${SUPERVISOR_DEPLOY_TIMEOUT:-120}" # 2 min default local stuck_deploying stuck_deploying=$(db "$SUPERVISOR_DB" " SELECT id, updated_at FROM tasks