From ba6d2801e26ac372f110fc0a862ea3dd31cb52e4 Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 19 Feb 2026 02:34:34 +0000 Subject: [PATCH 1/2] feat: add two-phase worker hang detection with graceful termination (t1222) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At 50% of hung timeout, send SIGTERM for graceful shutdown via wrapper trap. If worker survives grace period (min 120s, max 240s), escalate to hard SIGKILL. Saves ~15 minutes per hung worker vs waiting full timeout before killing. Chose marker-file approach over DB field — stateless across pulse cycles, no schema migration needed, matches existing PID file pattern in supervisor/pids/. Env: SUPERVISOR_HANG_GRACEFUL=true|false (default: true) to toggle behavior. --- .agents/scripts/supervisor-helper.sh | 1 + .agents/scripts/supervisor/pulse.sh | 44 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh index cee47366..4c8b4cbb 100755 --- a/.agents/scripts/supervisor-helper.sh +++ b/.agents/scripts/supervisor-helper.sh @@ -467,6 +467,7 @@ Environment: SUPERVISOR_TIMEOUT_BUGFIX Hang timeout for #bugfix/#fix tasks (default: 3600 — 1h) SUPERVISOR_TIMEOUT_DOCS Hang timeout for #docs tasks (default: 1800 — 30m) SUPERVISOR_HEARTBEAT_INTERVAL Seconds between worker heartbeat writes to log (default: 300 — 5m) + SUPERVISOR_HANG_GRACEFUL Enable two-phase hang detection: SIGTERM at 50%% timeout, SIGKILL at 100%% (default: true, t1222) SUPERVISOR_SELF_MEM_LIMIT MB before supervisor respawns after batch (default: 8192) SUPERVISOR_SKILL_UPDATE_PR Enable skill update PR pipeline in pulse (default: false) SUPERVISOR_SKILL_UPDATE_INTERVAL Seconds between skill update PR runs (default: 86400) diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index 95bc1d06..e9237665 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -2025,6 +2025,8 @@ RULES: if ! kill -0 "$health_pid" 2>/dev/null; then # Dead worker: PID no longer exists rm -f "$pid_file" + # t1222: Clean up hang warning marker for dead workers + rm -f "$SUPERVISOR_DIR/pids/${health_task}.hang-warned" if [[ "$health_status" == "running" || "$health_status" == "dispatched" ]]; then log_warn " Dead worker for $health_task (PID $health_pid gone, was $health_status) — evaluating" cmd_evaluate "$health_task" --no-ai 2>>"$SUPERVISOR_LOG" || { @@ -2063,6 +2065,9 @@ RULES: # Check 2: Hung state (no log output for timeout period) # t1199: Use per-task hung timeout based on ~estimate (2x estimate, 4h cap, 30m default) + # t1222: Two-phase hang detection — graceful SIGTERM at 50% timeout, hard SIGKILL at 100%. + # Saves ~15 minutes per hung worker by terminating early and retrying immediately + # instead of waiting the full timeout. The wrapper's EXIT trap handles child cleanup. if [[ "$should_kill" == "false" ]]; then local log_file log_file=$(db "$SUPERVISOR_DB" "SELECT log_file FROM tasks WHERE id = '$(sql_escape "$health_task")';" 2>/dev/null || echo "") @@ -2076,9 +2081,46 @@ RULES: # Compute per-task hung timeout from ~estimate field (t1199) local task_hung_timeout task_hung_timeout=$(get_task_hung_timeout "$health_task" 2>/dev/null || echo "$worker_timeout_seconds") + + # t1222: Two-phase hang detection (disable with SUPERVISOR_HANG_GRACEFUL=false) + local hang_graceful="${SUPERVISOR_HANG_GRACEFUL:-true}" + local hang_warn_threshold=$((task_hung_timeout / 2)) + local hang_warn_marker="$SUPERVISOR_DIR/pids/${health_task}.hang-warned" + if [[ "$log_age_seconds" -gt "$task_hung_timeout" ]]; then + # Phase 2 (or single-phase if graceful disabled): Full timeout exceeded — hard kill should_kill=true kill_reason="Worker hung (no output for ${log_age_seconds}s, timeout ${task_hung_timeout}s)" + rm -f "$hang_warn_marker" + elif [[ "$hang_graceful" == "true" && "$log_age_seconds" -gt "$hang_warn_threshold" ]]; then + # Phase 1: 50% timeout exceeded — attempt graceful termination + if [[ ! -f "$hang_warn_marker" ]]; then + # First detection at 50%: send SIGTERM for graceful shutdown + log_warn " t1222: Worker $health_task possibly hung (no output for ${log_age_seconds}s, 50% of ${task_hung_timeout}s timeout)" + log_warn " t1222: Sending SIGTERM for graceful shutdown (PID $health_pid)" + echo "$now_epoch" >"$hang_warn_marker" + # SIGTERM triggers the wrapper's cleanup_children trap + kill -TERM "$health_pid" 2>/dev/null || true + else + # Already warned — check if SIGTERM worked (grace period: 2 pulse cycles ~4min) + local warn_epoch + warn_epoch=$(cat "$hang_warn_marker" 2>/dev/null || echo "0") + local grace_elapsed=$((now_epoch - warn_epoch)) + # Grace period: min(240s, 25% of hung timeout) — enough for 2 pulse cycles + local grace_period=$((task_hung_timeout / 4)) + if [[ "$grace_period" -gt 240 ]]; then + grace_period=240 + fi + if [[ "$grace_period" -lt 120 ]]; then + grace_period=120 + fi + if [[ "$grace_elapsed" -gt "$grace_period" ]]; then + # Grace period expired, worker didn't terminate — escalate to hard kill + should_kill=true + kill_reason="Worker hung (graceful SIGTERM failed after ${grace_elapsed}s grace, no output for ${log_age_seconds}s)" + rm -f "$hang_warn_marker" + fi + fi fi fi fi @@ -2094,6 +2136,8 @@ RULES: kill -9 "$health_pid" 2>/dev/null || true fi rm -f "$pid_file" + # t1222: Clean up hang warning marker on kill + rm -f "$SUPERVISOR_DIR/pids/${health_task}.hang-warned" # t1074: Auto-retry timed-out workers up to max_retries before marking failed. # Check if the task has a PR already (worker may have created one before timeout). From bcf493215254825ef7dec0bcaa2d01d1dfb8d95f Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 19 Feb 2026 03:06:05 +0000 Subject: [PATCH 2/2] fix: address review feedback for t1222 --- .agents/scripts/supervisor-helper.sh | 2 +- .agents/scripts/supervisor/cleanup.sh | 2 ++ .agents/scripts/supervisor/pulse.sh | 8 +++++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh index 4c8b4cbb..f79f89e0 100755 --- a/.agents/scripts/supervisor-helper.sh +++ b/.agents/scripts/supervisor-helper.sh @@ -467,7 +467,7 @@ Environment: SUPERVISOR_TIMEOUT_BUGFIX Hang timeout for #bugfix/#fix tasks (default: 3600 — 1h) SUPERVISOR_TIMEOUT_DOCS Hang timeout for #docs tasks (default: 1800 — 30m) SUPERVISOR_HEARTBEAT_INTERVAL Seconds between worker heartbeat writes to log (default: 300 — 5m) - SUPERVISOR_HANG_GRACEFUL Enable two-phase hang detection: SIGTERM at 50%% timeout, SIGKILL at 100%% (default: true, t1222) + SUPERVISOR_HANG_GRACEFUL Enable two-phase hang detection: SIGTERM at 50% timeout, SIGKILL at 100% (default: true, t1222) SUPERVISOR_SELF_MEM_LIMIT MB before supervisor respawns after batch (default: 8192) SUPERVISOR_SKILL_UPDATE_PR Enable skill update PR pipeline in pulse (default: false) SUPERVISOR_SKILL_UPDATE_INTERVAL Seconds between skill update PR runs (default: 86400) diff --git a/.agents/scripts/supervisor/cleanup.sh b/.agents/scripts/supervisor/cleanup.sh index ad163755..51d6280a 100755 --- a/.agents/scripts/supervisor/cleanup.sh +++ b/.agents/scripts/supervisor/cleanup.sh @@ -282,6 +282,8 @@ cleanup_worker_processes() { fi rm -f "$pid_file" + # t1222: Clean up hang warning marker to prevent stale markers from killing re-dispatched workers + rm -f "$SUPERVISOR_DIR/pids/${task_id}.hang-warned" 2>/dev/null || true if [[ "$killed" -gt 0 ]]; then log_info "Cleaned up worker process for $task_id (PID: $pid)" diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index e9237665..1480b1a5 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -2098,15 +2098,17 @@ RULES: # First detection at 50%: send SIGTERM for graceful shutdown log_warn " t1222: Worker $health_task possibly hung (no output for ${log_age_seconds}s, 50% of ${task_hung_timeout}s timeout)" log_warn " t1222: Sending SIGTERM for graceful shutdown (PID $health_pid)" - echo "$now_epoch" >"$hang_warn_marker" + echo "$now_epoch" >"$hang_warn_marker" 2>/dev/null || true # SIGTERM triggers the wrapper's cleanup_children trap kill -TERM "$health_pid" 2>/dev/null || true else # Already warned — check if SIGTERM worked (grace period: 2 pulse cycles ~4min) - local warn_epoch + local warn_epoch=0 warn_epoch=$(cat "$hang_warn_marker" 2>/dev/null || echo "0") + warn_epoch="${warn_epoch:-0}" local grace_elapsed=$((now_epoch - warn_epoch)) - # Grace period: min(240s, 25% of hung timeout) — enough for 2 pulse cycles + # Grace period: min(240s, max(120s, 25% of hung timeout)) + # At 2-min cron this spans 1-2 cycles; at 5-min cron the hard kill fires on the next cycle local grace_period=$((task_hung_timeout / 4)) if [[ "$grace_period" -gt 240 ]]; then grace_period=240