marcusquinn · marcusquinn · Feb 10, 2026 · Feb 10, 2026 · gemini-code-assist · Feb 10, 2026
diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh
@@ -7222,6 +7222,49 @@ cmd_pr_lifecycle() {
         stage_timings="${stage_timings}deploying:$((stage_end - stage_start))s,"
     fi
 
+    # Step 4b: Auto-recover stuck deploying state (t222)
+    # If a task is already in 'deploying' (from a prior pulse where the deploy
+    # succeeded but the transition to 'deployed' failed), re-attempt the
+    # transition and housekeeping steps. The deploy itself already completed
+    # successfully — only the state transition was lost.
+    if [[ "$tstatus" == "deploying" ]]; then
+        local stage_start
+        stage_start=$(date +%s)
+
+        log_warn "Task $task_id stuck in deploying state — attempting auto-recovery (t222)"
+
+        if [[ "$dry_run" == "false" ]]; then
+            # Re-run housekeeping that may have been skipped when the prior
+            # transition failed (all non-blocking, best-effort)
+            cleanup_after_merge "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "Worktree cleanup issue for $task_id during recovery (non-blocking)"
+            update_todo_on_complete "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "TODO.md update issue for $task_id during recovery (non-blocking)"
+            populate_verify_queue "$task_id" "$tpr" "$trepo" 2>>"$SUPERVISOR_LOG" || log_warn "Verify queue population issue for $task_id during recovery (non-blocking)"
+
+            # Attempt the transition that previously failed
+            if cmd_transition "$task_id" "deployed" 2>>"$SUPERVISOR_LOG"; then
+                log_success "Auto-recovered $task_id: deploying -> deployed (t222)"
+                send_task_notification "$task_id" "deployed" "Auto-recovered from stuck deploying state" 2>>"$SUPERVISOR_LOG" || true
+                store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true
+                write_proof_log --task "$task_id" --event "auto_recover" --stage "deploying" \
+                    --decision "deploying->deployed" --evidence "stuck_state_recovery" \
+                    --maker "pr_lifecycle:t222" 2>>"$SUPERVISOR_LOG" || true
+            else
+                log_error "Auto-recovery failed for $task_id — transition to deployed rejected"
+                # If the transition itself is invalid, something is deeply wrong.
+                # Transition to failed so the task doesn't stay stuck forever.
+                cmd_transition "$task_id" "failed" --error "Auto-recovery failed: deploying->deployed transition rejected (t222)" 2>>"$SUPERVISOR_LOG" || true
+                send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed" 2>>"$SUPERVISOR_LOG" || true
+            fi
+        else
+            log_info "[dry-run] Would auto-recover $task_id from deploying to deployed"
+        fi
+
+        # t222: Record recovery timing
+        local stage_end
+        stage_end=$(date +%s)
+        stage_timings="${stage_timings}deploying_recovery:$((stage_end - stage_start))s,"
+    fi
+
     # t219: Record total lifecycle timing and log to proof-log
     local lifecycle_end_time
     lifecycle_end_time=$(date +%s)
@@ -7774,6 +7817,33 @@ cmd_pulse() {
         done <<< "$stale_diags"
     fi
 
+    # Phase 4d: Auto-recover stuck deploying tasks (t222)
+    # Tasks can get stuck in 'deploying' if the deploy succeeds but the
+    # transition to 'deployed' fails (e.g., DB write error, process killed
+    # mid-transition). Detect tasks in 'deploying' state for longer than
+    # the deploy timeout and auto-recover them by calling cmd_pr_lifecycle
+    # directly (its Step 4b now handles the deploying state).
+    local deploying_timeout_seconds="${SUPERVISOR_DEPLOY_TIMEOUT:-600}"  # 10 min default
+    local stuck_deploying
+    stuck_deploying=$(db "$SUPERVISOR_DB" "
+        SELECT id, updated_at FROM tasks
+        WHERE status = 'deploying'
+        AND updated_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${deploying_timeout_seconds} seconds');
+    " 2>>"$SUPERVISOR_LOG" || echo "")
+
+    if [[ -n "$stuck_deploying" ]]; then
+        while IFS='|' read -r stuck_id stuck_updated; do
+            [[ -n "$stuck_id" ]] || continue
+            log_warn "  Stuck deploying: $stuck_id (last updated: ${stuck_updated:-unknown}, timeout: ${deploying_timeout_seconds}s) — triggering recovery (t222)"
+            # Run cmd_pr_lifecycle directly for this task; its Step 4b now
+            # handles the deploying state.
+            cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
+                log_error "  Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
+                cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
-                log_error "  Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
-                cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
+                log_error "  Recovery failed for stuck deploying task $stuck_id — forcing to failed for manual review"
+                cmd_transition "$stuck_id" "failed" --error "Catastrophic recovery failure for stuck deploying task (t222)" 2>>"$SUPERVISOR_LOG" || true
+            }
-            cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
-                log_error "  Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
-                cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
-            }
+            cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
+                log_error "  Recovery failed for stuck deploying task $stuck_id — evaluating fallback"
+                local current_state
+                current_state=$(db "$SUPERVISOR_DB" "SELECT status FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo "")
+                if [[ "$current_state" == "deploying" ]]; then
+                    cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
+                else
+                    log_warn "  Skipping force-recovery for $stuck_id (state now $current_state)"
+                fi
+            }
+        done <<< "$stuck_deploying"
+    fi
+
     # Phase 5: Summary
     local total_running
     total_running=$(cmd_running_count "${batch_id:-}")

diff --git a/tests/test-supervisor-state-machine.sh b/tests/test-supervisor-state-machine.sh
@@ -1575,6 +1575,118 @@ fi
 
 rm -rf "$CLAIM_TEST_DIR"
 
+# ============================================================
+# SECTION: Stuck Deploying Auto-Recovery (t222)
+# ============================================================
+section "Stuck Deploying Auto-Recovery (t222)"
+
+# Test: deploying -> deployed transition is valid (prerequisite)
+sup add test-t222a --repo /tmp/test --description "Deploying recovery test" >/dev/null
+sup transition test-t222a dispatched >/dev/null
+sup transition test-t222a running >/dev/null
+sup transition test-t222a evaluating >/dev/null
+sup transition test-t222a complete >/dev/null
+sup transition test-t222a pr_review >/dev/null
+sup transition test-t222a merging >/dev/null
+sup transition test-t222a merged >/dev/null
+sup transition test-t222a deploying >/dev/null
+
+# Verify task is in deploying state
+if [[ "$(get_status test-t222a)" == "deploying" ]]; then
+    pass "Task reaches deploying state correctly"
+else
+    fail "Task should be in deploying state: $(get_status test-t222a)"
+fi
+
+# Simulate recovery: deploying -> deployed
+sup transition test-t222a deployed >/dev/null
+if [[ "$(get_status test-t222a)" == "deployed" ]]; then
+    pass "deploying -> deployed recovery transition succeeds (t222)"
+else
+    fail "deploying -> deployed recovery failed: $(get_status test-t222a)"
+fi
+
+# Test: deploying -> failed is also valid (deploy failure path)
+sup add test-t222b --repo /tmp/test --description "Deploying failure test" >/dev/null
+sup transition test-t222b dispatched >/dev/null
+sup transition test-t222b running >/dev/null
+sup transition test-t222b evaluating >/dev/null
+sup transition test-t222b complete >/dev/null
+sup transition test-t222b pr_review >/dev/null
+sup transition test-t222b merging >/dev/null
+sup transition test-t222b merged >/dev/null
+sup transition test-t222b deploying >/dev/null
+sup transition test-t222b failed --error "Deploy failed during recovery" >/dev/null
+if [[ "$(get_status test-t222b)" == "failed" ]]; then
+    pass "deploying -> failed transition succeeds (deploy failure path)"
+else
+    fail "deploying -> failed transition failed: $(get_status test-t222b)"
+fi
+
+# Test: state_log records deploying recovery transitions
+log_entries=$(test_db "SELECT from_state || '->' || to_state FROM state_log WHERE task_id = 'test-t222a' AND from_state = 'deploying';")
+if echo "$log_entries" | grep -q "deploying->deployed"; then
+    pass "State log records deploying -> deployed recovery (t222)"
+else
+    fail "State log missing deploying -> deployed entry" "Got: $log_entries"
+fi
+
+# Test: cmd_pr_lifecycle handles deploying state via sourced function
+# Create a task stuck in deploying and verify pr_lifecycle recovers it
+sup add test-t222c --repo /tmp/test --description "PR lifecycle deploying recovery" >/dev/null
+sup transition test-t222c dispatched >/dev/null
+sup transition test-t222c running >/dev/null
+sup transition test-t222c evaluating >/dev/null
+sup transition test-t222c complete >/dev/null
+sup transition test-t222c pr_review >/dev/null
+sup transition test-t222c merging >/dev/null
+sup transition test-t222c merged >/dev/null
+sup transition test-t222c deploying >/dev/null
+
+# Run cmd_pr_lifecycle on the stuck task — it should auto-recover
+lifecycle_output=$(bash -c "
+    export AIDEVOPS_SUPERVISOR_DIR='$TEST_DIR'
+    set -- init
+    source '$SUPERVISOR_SCRIPT' >/dev/null
+    cmd_pr_lifecycle 'test-t222c'
+" 2>&1 || true)
+
+recovered_status=$(get_status test-t222c)
+if [[ "$recovered_status" == "deployed" ]]; then
+    pass "cmd_pr_lifecycle auto-recovers stuck deploying -> deployed (t222)"
+else
+    # Also acceptable: failed (if recovery transition was rejected for some reason)
+    if [[ "$recovered_status" == "failed" ]]; then
+        pass "cmd_pr_lifecycle handles stuck deploying (transitioned to failed)"
+    else
+        fail "cmd_pr_lifecycle did not recover stuck deploying task" "Status: $recovered_status, Output: $(echo "$lifecycle_output" | tail -3)"
+    fi
+fi
+
+# Test: invalid transition from deploying (e.g., deploying -> queued)
+sup add test-t222d --repo /tmp/test --description "Invalid deploying transition" >/dev/null
+sup transition test-t222d dispatched >/dev/null
+sup transition test-t222d running >/dev/null
+sup transition test-t222d evaluating >/dev/null
+sup transition test-t222d complete >/dev/null
+sup transition test-t222d pr_review >/dev/null
+sup transition test-t222d merging >/dev/null
+sup transition test-t222d merged >/dev/null
+sup transition test-t222d deploying >/dev/null
+invalid_deploying=$(sup transition test-t222d queued 2>&1 || true)
+if echo "$invalid_deploying" | grep -qi "invalid transition"; then
+    pass "deploying -> queued rejected (invalid transition)"
+else
+    fail "deploying -> queued should be rejected" "$invalid_deploying"
+fi
+
+# Verify state unchanged after invalid transition
+if [[ "$(get_status test-t222d)" == "deploying" ]]; then
+    pass "State unchanged after invalid deploying transition (t222)"
+else
+    fail "State changed despite invalid transition: $(get_status test-t222d)"
+fi
+
 # ============================================================
 # SUMMARY
 # ============================================================