From ba6d2801e26ac372f110fc0a862ea3dd31cb52e4 Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Thu, 19 Feb 2026 02:34:34 +0000
Subject: [PATCH 1/2] feat: add two-phase worker hang detection with graceful
 termination (t1222)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At 50% of the hung timeout, send SIGTERM so the wrapper's trap can shut the
worker down gracefully. If the worker survives the grace period (25% of the
timeout, clamped to 120-240s), escalate to a hard SIGKILL. With the default
30m timeout this saves ~15 minutes per hung worker vs waiting for the full
timeout before killing.

Chose a marker file over a DB field — the warning timestamp survives across
cron-driven pulse cycles without in-process state, no schema migration is
needed, and it matches the existing PID file pattern in supervisor/pids/.

Env: SUPERVISOR_HANG_GRACEFUL=true|false (default: true) to toggle behavior.
---
 .agents/scripts/supervisor-helper.sh |  1 +
 .agents/scripts/supervisor/pulse.sh  | 44 ++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh
index cee47366..4c8b4cbb 100755
--- a/.agents/scripts/supervisor-helper.sh
+++ b/.agents/scripts/supervisor-helper.sh
@@ -467,6 +467,7 @@ Environment:
   SUPERVISOR_TIMEOUT_BUGFIX   Hang timeout for #bugfix/#fix tasks (default: 3600 — 1h)
   SUPERVISOR_TIMEOUT_DOCS     Hang timeout for #docs tasks (default: 1800 — 30m)
   SUPERVISOR_HEARTBEAT_INTERVAL Seconds between worker heartbeat writes to log (default: 300 — 5m)
+  SUPERVISOR_HANG_GRACEFUL    Enable two-phase hang detection: SIGTERM at 50%% timeout, SIGKILL at 100%% (default: true, t1222)
   SUPERVISOR_SELF_MEM_LIMIT   MB before supervisor respawns after batch (default: 8192)
   SUPERVISOR_SKILL_UPDATE_PR  Enable skill update PR pipeline in pulse (default: false)
   SUPERVISOR_SKILL_UPDATE_INTERVAL Seconds between skill update PR runs (default: 86400)
diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh
index 95bc1d06..e9237665 100755
--- a/.agents/scripts/supervisor/pulse.sh
+++ b/.agents/scripts/supervisor/pulse.sh
@@ -2025,6 +2025,8 @@ RULES:
 			if ! kill -0 "$health_pid" 2>/dev/null; then
 				# Dead worker: PID no longer exists
 				rm -f "$pid_file"
+				# t1222: Clean up hang warning marker for dead workers
+				rm -f "$SUPERVISOR_DIR/pids/${health_task}.hang-warned"
 				if [[ "$health_status" == "running" || "$health_status" == "dispatched" ]]; then
 					log_warn "  Dead worker for $health_task (PID $health_pid gone, was $health_status) — evaluating"
 					cmd_evaluate "$health_task" --no-ai 2>>"$SUPERVISOR_LOG" || {
@@ -2063,6 +2065,9 @@ RULES:
 
 					# Check 2: Hung state (no log output for timeout period)
 					# t1199: Use per-task hung timeout based on ~estimate (2x estimate, 4h cap, 30m default)
+					# t1222: Two-phase hang detection — graceful SIGTERM at 50% of the timeout, then hard SIGKILL
+					#   once the grace period expires (or at 100% of the timeout). Saves ~15 minutes per hung worker
+					#   by terminating early and retrying immediately instead of waiting the full timeout. The wrapper's EXIT trap handles child cleanup.
 					if [[ "$should_kill" == "false" ]]; then
 						local log_file
 						log_file=$(db "$SUPERVISOR_DB" "SELECT log_file FROM tasks WHERE id = '$(sql_escape "$health_task")';" 2>/dev/null || echo "")
@@ -2076,9 +2081,46 @@ RULES:
 							# Compute per-task hung timeout from ~estimate field (t1199)
 							local task_hung_timeout
 							task_hung_timeout=$(get_task_hung_timeout "$health_task" 2>/dev/null || echo "$worker_timeout_seconds")
+
+							# t1222: Two-phase hang detection (disable with SUPERVISOR_HANG_GRACEFUL=false)
+							local hang_graceful="${SUPERVISOR_HANG_GRACEFUL:-true}"
+							local hang_warn_threshold=$((task_hung_timeout / 2))
+							local hang_warn_marker="$SUPERVISOR_DIR/pids/${health_task}.hang-warned"
+
 							if [[ "$log_age_seconds" -gt "$task_hung_timeout" ]]; then
+								# Phase 2 (or single-phase if graceful disabled): Full timeout exceeded — hard kill
 								should_kill=true
 								kill_reason="Worker hung (no output for ${log_age_seconds}s, timeout ${task_hung_timeout}s)"
+								rm -f "$hang_warn_marker"
+							elif [[ "$hang_graceful" == "true" && "$log_age_seconds" -gt "$hang_warn_threshold" ]]; then
+								# Phase 1: 50% timeout exceeded — attempt graceful termination
+								if [[ ! -f "$hang_warn_marker" ]]; then
+									# First detection at 50%: send SIGTERM for graceful shutdown
+									log_warn "  t1222: Worker $health_task possibly hung (no output for ${log_age_seconds}s, 50% of ${task_hung_timeout}s timeout)"
+									log_warn "  t1222: Sending SIGTERM for graceful shutdown (PID $health_pid)"
+									echo "$now_epoch" >"$hang_warn_marker"
+									# SIGTERM triggers the wrapper's cleanup_children trap
+									kill -TERM "$health_pid" 2>/dev/null || true
+								else
+									# Already warned — check if SIGTERM worked (grace period: 2 pulse cycles ~4min)
+									local warn_epoch
+									warn_epoch=$(cat "$hang_warn_marker" 2>/dev/null || echo "0")
+									local grace_elapsed=$((now_epoch - warn_epoch))
+									# Grace period: min(240s, 25% of hung timeout) — enough for 2 pulse cycles
+									local grace_period=$((task_hung_timeout / 4))
+									if [[ "$grace_period" -gt 240 ]]; then
+										grace_period=240
+									fi
+									if [[ "$grace_period" -lt 120 ]]; then
+										grace_period=120
+									fi
+									if [[ "$grace_elapsed" -gt "$grace_period" ]]; then
+										# Grace period expired, worker didn't terminate — escalate to hard kill
+										should_kill=true
+										kill_reason="Worker hung (graceful SIGTERM failed after ${grace_elapsed}s grace, no output for ${log_age_seconds}s)"
+										rm -f "$hang_warn_marker"
+									fi
+								fi
 							fi
 						fi
 					fi
@@ -2094,6 +2136,8 @@ RULES:
 							kill -9 "$health_pid" 2>/dev/null || true
 						fi
 						rm -f "$pid_file"
+						# t1222: Clean up hang warning marker on kill
+						rm -f "$SUPERVISOR_DIR/pids/${health_task}.hang-warned"
 
 						# t1074: Auto-retry timed-out workers up to max_retries before marking failed.
 						# Check if the task has a PR already (worker may have created one before timeout).

From bcf493215254825ef7dec0bcaa2d01d1dfb8d95f Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Thu, 19 Feb 2026 03:06:05 +0000
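Subject: [PATCH 2/2] fix: address review feedback for t1222

- Help text: use a literal % instead of %% in the SUPERVISOR_HANG_GRACEFUL entry.
- cleanup_worker_processes: remove the .hang-warned marker so a stale marker
  cannot kill a re-dispatched worker.
- pulse.sh: tolerate marker write failures, default warn_epoch to 0 when the
  marker is empty, and correct the grace-period comment.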
---
 .agents/scripts/supervisor-helper.sh  | 2 +-
 .agents/scripts/supervisor/cleanup.sh | 2 ++
 .agents/scripts/supervisor/pulse.sh   | 8 +++++---
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh
index 4c8b4cbb..f79f89e0 100755
--- a/.agents/scripts/supervisor-helper.sh
+++ b/.agents/scripts/supervisor-helper.sh
@@ -467,7 +467,7 @@ Environment:
   SUPERVISOR_TIMEOUT_BUGFIX   Hang timeout for #bugfix/#fix tasks (default: 3600 — 1h)
   SUPERVISOR_TIMEOUT_DOCS     Hang timeout for #docs tasks (default: 1800 — 30m)
   SUPERVISOR_HEARTBEAT_INTERVAL Seconds between worker heartbeat writes to log (default: 300 — 5m)
-  SUPERVISOR_HANG_GRACEFUL    Enable two-phase hang detection: SIGTERM at 50%% timeout, SIGKILL at 100%% (default: true, t1222)
+  SUPERVISOR_HANG_GRACEFUL    Enable two-phase hang detection: SIGTERM at 50% timeout, SIGKILL at 100% (default: true, t1222)
   SUPERVISOR_SELF_MEM_LIMIT   MB before supervisor respawns after batch (default: 8192)
   SUPERVISOR_SKILL_UPDATE_PR  Enable skill update PR pipeline in pulse (default: false)
   SUPERVISOR_SKILL_UPDATE_INTERVAL Seconds between skill update PR runs (default: 86400)
diff --git a/.agents/scripts/supervisor/cleanup.sh b/.agents/scripts/supervisor/cleanup.sh
index ad163755..51d6280a 100755
--- a/.agents/scripts/supervisor/cleanup.sh
+++ b/.agents/scripts/supervisor/cleanup.sh
@@ -282,6 +282,8 @@ cleanup_worker_processes() {
 	fi
 
 	rm -f "$pid_file"
+	# t1222: Clean up hang warning marker to prevent stale markers from killing re-dispatched workers
+	rm -f "$SUPERVISOR_DIR/pids/${task_id}.hang-warned" 2>/dev/null || true
 
 	if [[ "$killed" -gt 0 ]]; then
 		log_info "Cleaned up worker process for $task_id (PID: $pid)"
diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh
index e9237665..1480b1a5 100755
--- a/.agents/scripts/supervisor/pulse.sh
+++ b/.agents/scripts/supervisor/pulse.sh
@@ -2098,15 +2098,17 @@ RULES:
 									# First detection at 50%: send SIGTERM for graceful shutdown
 									log_warn "  t1222: Worker $health_task possibly hung (no output for ${log_age_seconds}s, 50% of ${task_hung_timeout}s timeout)"
 									log_warn "  t1222: Sending SIGTERM for graceful shutdown (PID $health_pid)"
-									echo "$now_epoch" >"$hang_warn_marker"
+									echo "$now_epoch" >"$hang_warn_marker" 2>/dev/null || true
 									# SIGTERM triggers the wrapper's cleanup_children trap
 									kill -TERM "$health_pid" 2>/dev/null || true
 								else
 									# Already warned — check if SIGTERM worked (grace period: 2 pulse cycles ~4min)
-									local warn_epoch
+									local warn_epoch=0
 									warn_epoch=$(cat "$hang_warn_marker" 2>/dev/null || echo "0")
+									warn_epoch="${warn_epoch:-0}"
 									local grace_elapsed=$((now_epoch - warn_epoch))
-									# Grace period: min(240s, 25% of hung timeout) — enough for 2 pulse cycles
+									# Grace period: min(240s, max(120s, 25% of hung timeout))
+									# At 2-min cron this spans 1-2 cycles; at 5-min cron the hard kill fires on the next cycle
 									local grace_period=$((task_hung_timeout / 4))
 									if [[ "$grace_period" -gt 240 ]]; then
 										grace_period=240