From 899d73c487280615843431f61f361c2902408512 Mon Sep 17 00:00:00 2001 From: Alexey <1556417+alex-solovyev@users.noreply.github.com> Date: Fri, 13 Mar 2026 13:09:52 +0100 Subject: [PATCH] fix(pulse-wrapper): add flock-based lock guard to prevent concurrent instances (GH#4409) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 14 concurrent pulse instances ran simultaneously, each spawning workers before the previous pulse could assess running workers, resulting in 52 active workers against a 24-slot pool. Root cause: check_dedup() uses a PID file with a TOCTOU race window — between reading the PID file and writing the new PID, another instance can pass the same check. Fix: Add flock(1) -n (non-blocking) as the primary instance lock in main(), acquired before any other check. flock is atomic at the kernel level — no race window. The lock auto-releases on process exit (including crashes, OOM kills, SIGKILL). Falls back gracefully to PID-based dedup on systems without flock (e.g., macOS). --- .agents/scripts/pulse-wrapper.sh | 69 +++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/.agents/scripts/pulse-wrapper.sh b/.agents/scripts/pulse-wrapper.sh index bafb9c0d7..3b285e26e 100755 --- a/.agents/scripts/pulse-wrapper.sh +++ b/.agents/scripts/pulse-wrapper.sh @@ -5,14 +5,15 @@ # but never exits, blocking all future pulses via the pgrep dedup guard. # # This wrapper: -# 1. Uses a PID file with staleness check (not pgrep) for dedup -# 2. Cleans up orphaned opencode processes before each pulse -# 3. Kills runaway processes exceeding RSS or runtime limits (t1398.1) -# 4. Calculates dynamic worker concurrency from available RAM -# 5. Internal watchdog kills stuck pulses after PULSE_STALE_THRESHOLD (t1397) -# 6. Self-watchdog: idle detection kills pulse when CPU drops to zero (t1398.3) -# 7. Progress-based watchdog: kills if log output stalls for PULSE_PROGRESS_TIMEOUT (GH#2958) -# 8. Provider-aware pulse sessions via headless-runtime-helper.sh +# 1. flock-based instance lock prevents concurrent pulses (GH#4409) +# 2. Uses a PID file with staleness check (not pgrep) for dedup +# 3. Cleans up orphaned opencode processes before each pulse +# 4. Kills runaway processes exceeding RSS or runtime limits (t1398.1) +# 5. Calculates dynamic worker concurrency from available RAM +# 6. Internal watchdog kills stuck pulses after PULSE_STALE_THRESHOLD (t1397) +# 7. Self-watchdog: idle detection kills pulse when CPU drops to zero (t1398.3) +# 8. Progress-based watchdog: kills if log output stalls for PULSE_PROGRESS_TIMEOUT (GH#2958) +# 9. Provider-aware pulse sessions via headless-runtime-helper.sh # # Lifecycle: launchd fires every 120s. If a pulse is still running, the # dedup check skips. run_pulse() has an internal watchdog that polls every @@ -28,7 +29,7 @@ # PULSE_PROGRESS_TIMEOUT seconds. A process that's running but producing # no output is stuck — not productive. This catches cases where CPU is # nonzero (network I/O, spinning) but no actual work is being done. -# check_dedup() serves as a tertiary safety net for edge cases where the +# check_dedup() serves as a secondary safety net for edge cases where the # wrapper itself gets stuck. # # PID file sentinel protocol (GH#4324): @@ -149,6 +150,7 @@ SESSION_COUNT_WARN=$(_validate_int SESSION_COUNT_WARN "$SESSION_COUNT_WARN" 5 1) # _sanitize_markdown and _sanitize_log_field provided by worker-lifecycle-common.sh PIDFILE="${HOME}/.aidevops/logs/pulse.pid" +LOCKFILE="${HOME}/.aidevops/logs/pulse-wrapper.lock" LOGFILE="${HOME}/.aidevops/logs/pulse.log" WRAPPER_LOGFILE="${HOME}/.aidevops/logs/pulse-wrapper.log" SESSION_FLAG="${HOME}/.aidevops/logs/pulse-session.flag" @@ -173,6 +175,43 @@ fi ####################################### mkdir -p "$(dirname "$PIDFILE")" +####################################### +# Acquire an exclusive instance lock using flock (GH#4409) +# +# Primary defense against concurrent pulse instances. Uses flock(1) +# with -n (non-blocking) so a second instance exits immediately if +# the lock is held. The lock is automatically released when the +# process exits (including crashes, OOM kills, SIGKILL). +# +# The lock file descriptor (FD 9) is held for the entire lifetime +# of the process. The caller must exec this via: +# exec 9>"$LOCKFILE" +# acquire_instance_lock +# +# On systems without flock (e.g., macOS without util-linux), falls +# back to check_dedup() which uses PID-file-based dedup. The flock +# guard is strictly superior (atomic, no TOCTOU race) but the PID +# fallback is better than nothing. +# +# Returns: 0 if lock acquired, 1 if another instance holds the lock +####################################### +acquire_instance_lock() { + if ! command -v flock &>/dev/null; then + # flock not available — fall back to PID-based dedup only. + # Log once so the user knows the stronger guard is missing. + echo "[pulse-wrapper] flock not available — relying on PID-based dedup only (install util-linux for atomic locking)" >>"$WRAPPER_LOGFILE" + return 0 + fi + + if ! flock -n 9; then + echo "[pulse-wrapper] Another pulse instance holds the lock — exiting immediately (GH#4409)" >>"$WRAPPER_LOGFILE" + return 1 + fi + + echo "[pulse-wrapper] Instance lock acquired (PID $$)" >>"$WRAPPER_LOGFILE" + return 0 +} + ####################################### # Check for stale PID file and clean up # Returns: 0 if safe to proceed, 1 if another pulse is genuinely running @@ -2404,7 +2443,8 @@ run_underfill_worker_recycler() { ####################################### # Main # -# Execution order (t1429): +# Execution order (t1429, GH#4409): +# 0. Instance lock (flock — atomic, prevents concurrent pulses) # 1. Gate checks (consent, dedup) # 2. Cleanup (orphans, worktrees, stashes) # 3. Prefetch state (parallel gh API calls) @@ -2418,6 +2458,15 @@ run_underfill_worker_recycler() { # even the API calls themselves add latency that delays dispatch. ####################################### main() { + # GH#4409: Acquire exclusive instance lock FIRST — before any other + # check. This is the primary defense against concurrent pulses. The + # flock is atomic (no TOCTOU race) and auto-releases on process exit. + # Open the lock file on FD 9 so flock can hold it for the process lifetime. + exec 9>"$LOCKFILE" + if ! acquire_instance_lock; then + return 0 + fi + if ! check_session_gate; then return 0 fi