From 3b20c5bb3635e9bb40f981f5109e097dd24a7a00 Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 5 Mar 2026 01:19:05 +0000 Subject: [PATCH] fix: add watchdog timeout to pulse-wrapper run_pulse() (t1397) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bare `wait $opencode_pid` blocked the wrapper indefinitely when opencode hung. Since launchd StartInterval only fires when the previous invocation exits, the stale-process check in check_dedup() never ran — the wrapper was still alive, just blocked on wait. Replace with a watchdog loop that polls every 60s and kills the process tree if it exceeds PULSE_STALE_THRESHOLD (30min). Portable across bash 3.2+ (macOS default). Also simplify quality sweep timestamp validation — strip non-numeric chars and default to 0, replacing the verbose regex check. --- .agents/scripts/pulse-wrapper.sh | 66 ++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/.agents/scripts/pulse-wrapper.sh b/.agents/scripts/pulse-wrapper.sh index dd090ef7b..5e0517737 100755 --- a/.agents/scripts/pulse-wrapper.sh +++ b/.agents/scripts/pulse-wrapper.sh @@ -8,13 +8,14 @@ # 1. Uses a PID file with staleness check (not pgrep) for dedup # 2. Cleans up orphaned opencode processes before each pulse # 3. Calculates dynamic worker concurrency from available RAM -# 4. Lets the pulse run to completion — no hard timeout +# 4. Internal watchdog kills stuck pulses after PULSE_STALE_THRESHOLD (t1397) # # Lifecycle: launchd fires every 120s. If a pulse is still running, the -# dedup check skips. If a pulse has been running longer than PULSE_STALE_THRESHOLD -# (default 30 min), it's assumed stuck (opencode idle bug) and killed so the -# next invocation can start fresh. This is the ONLY kill mechanism — no -# arbitrary timeouts that would interrupt active work. +# dedup check skips. run_pulse() has an internal watchdog that polls every +# 60s and kills the opencode process if it exceeds PULSE_STALE_THRESHOLD +# (default 30 min). This ensures the wrapper always exits, allowing launchd +# to fire the next invocation. check_dedup() serves as a secondary safety +# net for edge cases where the wrapper itself gets stuck. # # Called by launchd every 120s via the supervisor-pulse plist. @@ -842,15 +843,18 @@ prefetch_active_workers() { } ####################################### -# Run the pulse — no hard timeout +# Run the pulse — with internal watchdog timeout (t1397) # -# The pulse runs until opencode exits naturally. If opencode enters its -# idle-state bug (file watcher keeps process alive after session completes), -# the NEXT launchd invocation's check_dedup() will detect the stale process -# (age > PULSE_STALE_THRESHOLD) and kill it. This is correct because: -# - Active pulses doing real work are never interrupted -# - Stuck pulses are detected by the next invocation (120s later) -# - The stale threshold (30 min) is generous enough for any real workload +# The pulse runs until opencode exits naturally. A watchdog loop checks +# every 60s whether the process has exceeded PULSE_STALE_THRESHOLD. If so, +# it kills the process tree and returns, allowing the wrapper to continue +# to the quality sweep and health issue phases. +# +# Previous design relied on the NEXT launchd invocation's check_dedup() +# to kill stale processes. This failed because launchd StartInterval only +# fires when the previous invocation has exited — and the wrapper blocks +# on `wait`, so the next invocation never starts. The watchdog is now +# internal to the same process that spawned opencode. ####################################### run_pulse() { local start_epoch @@ -869,7 +873,7 @@ ${state_content} --- END PRE-FETCHED STATE ---" fi - # Run opencode — blocks until it exits (or is killed by next invocation's stale check) + # Run opencode in background "$OPENCODE_BIN" run "$prompt" \ --dir "$PULSE_DIR" \ -m "$PULSE_MODEL" \ @@ -881,7 +885,29 @@ ${state_content} echo "[pulse-wrapper] opencode PID: $opencode_pid" >>"$LOGFILE" - # Wait for natural exit + # Watchdog loop: check every 60s if the process is still alive and within + # the stale threshold. This replaces the bare `wait` that blocked the + # wrapper indefinitely when opencode hung. + while kill -0 "$opencode_pid" 2>/dev/null; do + local now + now=$(date +%s) + local elapsed=$((now - start_epoch)) + if [[ "$elapsed" -gt "$PULSE_STALE_THRESHOLD" ]]; then + echo "[pulse-wrapper] Pulse exceeded stale threshold (${elapsed}s > ${PULSE_STALE_THRESHOLD}s) — killing" >>"$LOGFILE" + _kill_tree "$opencode_pid" + sleep 2 + # Force kill if still alive + if kill -0 "$opencode_pid" 2>/dev/null; then + _force_kill_tree "$opencode_pid" + fi + break + fi + # Sleep 60s then re-check. Portable across bash 3.2+ (macOS default). + # The process may exit during sleep — kill -0 at top of loop catches that. + sleep 60 + done + + # Reap the process (may already be dead) wait "$opencode_pid" 2>/dev/null || true # Clean up PID file @@ -1467,12 +1493,10 @@ run_daily_quality_sweep() { # Timestamp guard — run at most once per QUALITY_SWEEP_INTERVAL if [[ -f "$QUALITY_SWEEP_LAST_RUN" ]]; then local last_run - last_run=$(cat "$QUALITY_SWEEP_LAST_RUN" || echo "0") - # Validate integer before arithmetic expansion (prevents command injection) - if ! [[ "$last_run" =~ ^[0-9]+$ ]]; then - echo "[pulse-wrapper] Corrupt sweep timestamp '${last_run}' — resetting" >>"$LOGFILE" - last_run=0 - fi + last_run=$(cat "$QUALITY_SWEEP_LAST_RUN" 2>/dev/null || echo "0") + # Strip whitespace/newlines and validate integer (t1397) + last_run="${last_run//[^0-9]/}" + last_run="${last_run:-0}" local now now=$(date +%s) local elapsed=$((now - last_run))