Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions .agents/scripts/pulse-session-helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,27 @@ count_workers() {

#######################################
# Check if a pulse process is currently running
# Handles SETUP:/IDLE: sentinels from pulse-wrapper.sh (GH#4575)
# Returns: 0 if running, 1 if not
#######################################
is_pulse_running() {
if [[ -f "$PIDFILE" ]]; then
local pid
pid=$(cat "$PIDFILE" || echo "")
if [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1; then
local pid_content
pid_content=$(cat "$PIDFILE" || echo "")

# IDLE sentinel or empty — not running
if [[ -z "$pid_content" ]] || [[ "$pid_content" == IDLE:* ]]; then
return 1
fi

# SETUP sentinel — extract numeric PID
local pid="$pid_content"
if [[ "$pid_content" == SETUP:* ]]; then
pid="${pid_content#SETUP:}"
fi

# Validate numeric and check process
if [[ "$pid" =~ ^[0-9]+$ ]] && ps -p "$pid" >/dev/null 2>&1; then
return 0
fi
fi
Expand Down Expand Up @@ -415,11 +429,18 @@ cmd_status() {
fi
echo ""

# Pulse process
# Pulse process — handle SETUP:/IDLE: sentinels (GH#4575)
if is_pulse_running; then
local pulse_pid
pulse_pid=$(cat "$PIDFILE" || echo "?")
echo -e " Process: ${GREEN}running${NC} (PID ${pulse_pid})"
local pulse_pid_content pulse_display_pid
pulse_pid_content=$(cat "$PIDFILE" || echo "?")
# Extract numeric PID for display
if [[ "$pulse_pid_content" == SETUP:* ]]; then
pulse_display_pid="${pulse_pid_content#SETUP:}"
echo -e " Process: ${YELLOW}setup${NC} (PID ${pulse_display_pid}, pre-flight stages)"
else
pulse_display_pid="$pulse_pid_content"
echo -e " Process: ${GREEN}running${NC} (PID ${pulse_display_pid})"
fi
else
echo -e " Process: ${BLUE}idle${NC} (waiting for next launchd cycle)"
fi
Expand Down
196 changes: 181 additions & 15 deletions .agents/scripts/pulse-wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -297,20 +297,78 @@ check_dedup() {
return 0
fi

local old_pid
old_pid=$(cat "$PIDFILE" 2>/dev/null || echo "")
local pid_content
pid_content=$(cat "$PIDFILE" 2>/dev/null || echo "")

# Empty file or IDLE sentinel — safe to proceed (GH#4324)
if [[ -z "$old_pid" ]] || [[ "$old_pid" == IDLE:* ]]; then
if [[ -z "$pid_content" ]] || [[ "$pid_content" == IDLE:* ]]; then
return 0
fi

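# SETUP sentinel (t1482): a wrapper recorded that it was running pre-flight
# stages (cleanup, prefetch). The instance lock already prevents true
# concurrency, so if we got past acquire_instance_lock, that wrapper is
# either this very process, already dead, a stale leftover from a previous
# cycle, or its PID now belongs to an unrelated process. Each case is
# handled below, and in every case this run proceeds.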
if [[ "$pid_content" == SETUP:* ]]; then
local setup_pid="${pid_content#SETUP:}"

# Numeric validation — corrupt sentinel gets reset (GH#4575)
if ! [[ "$setup_pid" =~ ^[0-9]+$ ]]; then
echo "[pulse-wrapper] check_dedup: invalid SETUP sentinel '${pid_content}' — resetting to IDLE" >>"$LOGFILE"
echo "IDLE:$(date -u +%Y-%m-%dT%H:%M:%SZ)" >"$PIDFILE"
return 0
fi

if [[ "$setup_pid" == "$$" ]]; then
# We wrote this ourselves — proceed
return 0
fi

# Check if the process is still alive via its cmdline (GH#4575)
local setup_cmd=""
setup_cmd=$(ps -p "$setup_pid" -o command= 2>/dev/null || echo "")

if [[ -z "$setup_cmd" ]]; then
echo "[pulse-wrapper] check_dedup: SETUP wrapper $setup_pid is dead — proceeding" >>"$LOGFILE"
echo "IDLE:$(date -u +%Y-%m-%dT%H:%M:%SZ)" >"$PIDFILE"
return 0
fi

# PID reuse guard: verify the process is actually a pulse-wrapper
# before killing. PID reuse can assign the old PID to an unrelated
# process between cycles. (GH#4575)
if [[ "$setup_cmd" != *"pulse-wrapper.sh"* ]]; then
echo "[pulse-wrapper] check_dedup: SETUP PID $setup_pid belongs to non-wrapper process ('${setup_cmd%%' '*}'); refusing kill, resetting sentinel" >>"$LOGFILE"
echo "IDLE:$(date -u +%Y-%m-%dT%H:%M:%SZ)" >"$PIDFILE"
return 0
fi

# SETUP wrapper is alive but we hold the instance lock — it's a zombie
# from a previous cycle. Kill it and proceed.
echo "[pulse-wrapper] check_dedup: killing zombie SETUP wrapper $setup_pid" >>"$LOGFILE"
_kill_tree "$setup_pid" || true
sleep 1
if kill -0 "$setup_pid" 2>/dev/null; then
_force_kill_tree "$setup_pid" || true
fi
echo "IDLE:$(date -u +%Y-%m-%dT%H:%M:%SZ)" >"$PIDFILE"
return 0
fi

# Non-numeric content (corrupt/unknown) — safe to proceed
local old_pid="$pid_content"
if ! [[ "$old_pid" =~ ^[0-9]+$ ]]; then
echo "[pulse-wrapper] check_dedup: unrecognised PID file content '${old_pid}' — treating as idle" >>"$LOGFILE"
return 0
fi

# Self-detection (t1482): if the PID file contains our own PID, we wrote
# it in a previous code path (e.g., early PID write at main() entry).
# Never block on ourselves.
if [[ "$old_pid" == "$$" ]]; then
return 0
fi

# Check if the process is still running
if ! ps -p "$old_pid" >/dev/null 2>&1; then
# Process is dead — write IDLE sentinel so the file is never absent
Expand Down Expand Up @@ -487,7 +545,43 @@ prefetch_state() {
idx=$((idx + 1))
done <<<"$repo_entries"

# Wait for all parallel fetches
# Wait for all parallel fetches with a hard timeout (t1482).
# Each repo does 3 gh API calls (pr list, pr list --state all, issue list).
# Normal completion: <30s. Timeout at 120s catches hung gh connections.
# Uses poll-based approach (kill -0) instead of blocking wait — wait $pid
# blocks until the process exits, so a timeout check between waits is
# ineffective when a single wait hangs for minutes.
local wait_elapsed=0
local all_done=false
while [[ "$all_done" != "true" ]] && [[ "$wait_elapsed" -lt 120 ]]; do
all_done=true
for pid in "${pids[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
all_done=false
break
fi
done
if [[ "$all_done" != "true" ]]; then
sleep 2
wait_elapsed=$((wait_elapsed + 2))
fi
done
if [[ "$all_done" != "true" ]]; then
echo "[pulse-wrapper] Parallel gh fetch timeout after ${wait_elapsed}s — killing remaining fetches" >>"$LOGFILE"
for pid in "${pids[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
_kill_tree "$pid" || true
fi
done
sleep 1
# Force-kill any survivors
for pid in "${pids[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
_force_kill_tree "$pid" || true
fi
done
fi
# Reap all child processes (non-blocking since they're dead or killed)
for pid in "${pids[@]}"; do
wait "$pid" 2>/dev/null || true
done
Expand All @@ -509,29 +603,56 @@ prefetch_state() {
# Clean up
rm -rf "$tmpdir"

# Append mission state
# t1482: Sub-helpers that call external scripts (gh API, pr-salvage,
# gh-failure-miner) get individual timeouts via run_cmd_with_timeout.
# If a helper times out, the pulse proceeds without that section —
# degraded but functional. Shell functions that only read local state
# (priority allocations, queue governor, contribution watch) run
# directly since they complete instantly.

# Append mission state (reads local files — fast)
prefetch_missions "$repo_entries" >>"$STATE_FILE"

# Append active worker snapshot for orphaned PR detection (t216)
# Append active worker snapshot for orphaned PR detection (t216, local ps — fast)
prefetch_active_workers >>"$STATE_FILE"

# Append repo hygiene data for LLM triage (t1417)
prefetch_hygiene >>"$STATE_FILE"
# This includes pr-salvage-helper.sh which iterates all repos sequentially
# and can hang on gh API calls. Give it 120s since it does 8 repos.
local hygiene_tmp
hygiene_tmp=$(mktemp)
run_cmd_with_timeout 120 prefetch_hygiene >"$hygiene_tmp" 2>/dev/null || {
echo "[pulse-wrapper] prefetch_hygiene timed out after 120s (non-fatal)" >>"$LOGFILE"
}
cat "$hygiene_tmp" >>"$STATE_FILE"
rm -f "$hygiene_tmp"

# Append CI failure patterns from notification mining (GH#4480)
prefetch_ci_failures >>"$STATE_FILE"
local ci_tmp
ci_tmp=$(mktemp)
run_cmd_with_timeout 90 prefetch_ci_failures >"$ci_tmp" 2>/dev/null || {
echo "[pulse-wrapper] prefetch_ci_failures timed out after 90s (non-fatal)" >>"$LOGFILE"
}
cat "$ci_tmp" >>"$STATE_FILE"
rm -f "$ci_tmp"

# Append priority-class worker allocations (t1423)
# Append priority-class worker allocations (t1423, reads local file — fast)
_append_priority_allocations >>"$STATE_FILE"

# Append adaptive queue-governor guidance (t1455)
# Append adaptive queue-governor guidance (t1455, local computation — fast)
append_adaptive_queue_governor

# Append external contribution watch summary (t1419)
# Append external contribution watch summary (t1419, local state — fast)
prefetch_contribution_watch >>"$STATE_FILE"

# Append failed-notification systemic summary (t3960)
prefetch_gh_failure_notifications >>"$STATE_FILE"
local ghfail_tmp
ghfail_tmp=$(mktemp)
run_cmd_with_timeout 90 prefetch_gh_failure_notifications >"$ghfail_tmp" 2>/dev/null || {
echo "[pulse-wrapper] prefetch_gh_failure_notifications timed out after 90s (non-fatal)" >>"$LOGFILE"
}
cat "$ghfail_tmp" >>"$STATE_FILE"
rm -f "$ghfail_tmp"

# Export PULSE_SCOPE_REPOS — comma-separated list of repo slugs that
# workers are allowed to create PRs/branches on (t1405, GH#2928).
Expand Down Expand Up @@ -1483,6 +1604,49 @@ check_session_count() {
return 0
}

#######################################
# Run a command with a per-call timeout (t1482)
#
# Lighter than run_stage_with_timeout — no logging, no stage semantics.
# Designed for sub-helpers inside prefetch_state that can hang on gh API
# calls. The command is started as a background job of the current shell
# (so shell functions can be passed, and any redirection applied to this
# call is inherited by the command) and is polled with `kill -0` rather
# than a blocking `wait`, so the deadline is honoured even if the command
# never exits on its own.
#
# On timeout the job is handed to _kill_tree; if it is still alive roughly
# one second later it is handed to _force_kill_tree. Both helpers are
# defined elsewhere in this file — what exactly they signal (process tree
# vs. process group) is their contract, not this function's.
#
# Polling runs every 0.2s so a command that finishes quickly returns almost
# immediately instead of after a fixed 2s tick. Where sleep(1) rejects
# fractional arguments the loop falls back to 1s ticks.
#
# Arguments:
#   $1    - timeout in seconds; anything that is not a plain non-negative
#           integer falls back to 60. Zero-padded values ("08") are read as
#           decimal.
#   $2..N - command and arguments
#
# Returns:
#   0    - command completed successfully
#   124  - command timed out and was killed
#   else - command exit code
#######################################
run_cmd_with_timeout() {
	local timeout_secs="$1"
	shift
	[[ "$timeout_secs" =~ ^[0-9]+$ ]] || timeout_secs=60

	# All bookkeeping is in deciseconds. The 10# prefix forces base 10 so a
	# zero-padded timeout such as "08" is not rejected as invalid octal
	# (which would make every deadline comparison error out and never fire).
	local limit_ds=$((10#$timeout_secs * 10))
	local waited_ds=0
	# Value of waited_ds at the moment _kill_tree was invoked; -1 = not yet.
	local term_sent_at=-1

	"$@" &
	local cmd_pid=$!

	while kill -0 "$cmd_pid" 2>/dev/null; do
		if [[ "$term_sent_at" -lt 0 ]]; then
			# Still inside the normal run window: check the deadline.
			if [[ "$waited_ds" -ge "$limit_ds" ]]; then
				_kill_tree "$cmd_pid" || true
				term_sent_at=$waited_ds
			fi
		elif [[ $((waited_ds - term_sent_at)) -ge 10 ]]; then
			# Graceful kill was ignored for ~1s — escalate and stop polling.
			_force_kill_tree "$cmd_pid" || true
			break
		fi

		# One poll tick. Prefer a short fractional sleep; fall back to a
		# whole second on sleep implementations that reject fractions.
		if sleep 0.2 2>/dev/null; then
			waited_ds=$((waited_ds + 2))
		else
			sleep 1
			waited_ds=$((waited_ds + 10))
		fi
	done

	if [[ "$term_sent_at" -ge 0 ]]; then
		# Reap the killed job; its own status is irrelevant after a timeout.
		wait "$cmd_pid" 2>/dev/null || true
		return 124
	fi

	# Job ended on its own: reap it and hand its exit status to the caller.
	wait "$cmd_pid"
}

#######################################
# Run a stage with a wall-clock timeout
#
Expand Down Expand Up @@ -2605,9 +2769,11 @@ main() {
return 0
fi

# t1425: Write PID early to prevent parallel instances during setup.
# run_pulse() overwrites with the opencode PID for watchdog tracking.
echo "$$" >"$PIDFILE"
# t1425, t1482: Write SETUP sentinel during pre-flight stages.
# Uses SETUP:$$ format so check_dedup() can distinguish "wrapper doing
# setup" from "opencode running pulse". run_pulse() overwrites with the
# plain opencode PID for watchdog tracking.
echo "SETUP:$$" >"$PIDFILE"

run_stage_with_timeout "cleanup_orphans" "$PRE_RUN_STAGE_TIMEOUT" cleanup_orphans || true
run_stage_with_timeout "cleanup_worktrees" "$PRE_RUN_STAGE_TIMEOUT" cleanup_worktrees || true
Expand Down
Loading