diff --git a/.agents/scripts/linters-local.sh b/.agents/scripts/linters-local.sh index ab43a204e..d7f00e677 100755 --- a/.agents/scripts/linters-local.sh +++ b/.agents/scripts/linters-local.sh @@ -310,28 +310,99 @@ run_shellcheck() { return 0 fi - # Batch mode: pass all files to a single shellcheck invocation - # This is significantly faster than per-file invocation (one process vs N) + # t1398: ShellCheck with -x (--external-sources) and recursive -P SCRIPTDIR + # can cause exponential expansion when 100+ scripts source each other, + # consuming 5+ GB RAM and running for 35+ minutes. Mitigations: + # 1. Per-file mode with timeout to cap each invocation + # 2. ulimit to cap RSS per shellcheck process — NOTE(review): no ulimit call is visible in this hunk; confirm + # 3. No --external-sources in the no-timeout fallback (use -P SCRIPTDIR only) local violations=0 - local result - result=$(shellcheck -x -P SCRIPTDIR --severity=warning --format=gcc "${ALL_SH_FILES[@]}" 2>&1) || true + local result="" + local timed_out=0 + local file_count=${#ALL_SH_FILES[@]} + + # Determine timeout command (gtimeout on macOS, timeout on Linux) + local timeout_cmd="" + if command -v timeout &>/dev/null; then + timeout_cmd="timeout" + elif command -v gtimeout &>/dev/null; then + timeout_cmd="gtimeout" + fi + + # Per-file mode with timeout: prevents any single file from causing + # exponential expansion. Each file gets max 30s (sc_timeout). NOTE(review): the 1GB RSS cap is not visible in this hunk; confirm. + local sc_timeout=30 + local file_result + for file in "${ALL_SH_FILES[@]}"; do + [[ -f "$file" ]] || continue + file_result="" + if [[ -n "$timeout_cmd" ]]; then + file_result=$($timeout_cmd "${sc_timeout}s" shellcheck -x -P SCRIPTDIR --severity=warning --format=gcc "$file" 2>&1) || { + local sc_exit=$? + # Exit code 124 = timeout killed the process + if [[ $sc_exit -eq 124 ]]; then + timed_out=$((timed_out + 1)) + print_warning "ShellCheck: $file timed out after ${sc_timeout}s (likely recursive source expansion)" + continue + fi + } + else + # Portable timeout wrapper: no timeout/gtimeout available. 
+ # Run shellcheck in background with a sleep-based watcher that kills it + # after sc_timeout seconds. Drop -x to reduce recursive expansion risk. + local sc_tmpfile + sc_tmpfile=$(mktemp) || { + file_result="" + continue + } + shellcheck -P SCRIPTDIR --severity=warning --format=gcc "$file" >"$sc_tmpfile" 2>&1 & + local sc_bg_pid=$! + (sleep "$sc_timeout" && kill "$sc_bg_pid" 2>/dev/null) & + local sc_watcher_pid=$! + local sc_exit_code=0 + wait "$sc_bg_pid" 2>/dev/null || sc_exit_code=$? + # Clean up watcher (may already be done if shellcheck finished before timeout) + kill "$sc_watcher_pid" 2>/dev/null || true + wait "$sc_watcher_pid" 2>/dev/null || true + file_result=$(cat "$sc_tmpfile") + rm -f "$sc_tmpfile" + # Exit codes >128 indicate signal kill (timeout fired) + if [[ $sc_exit_code -gt 128 ]]; then + timed_out=$((timed_out + 1)) + print_warning "ShellCheck: $file killed after ${sc_timeout}s (no timeout utility; portable fallback)" + file_result="" + continue + fi + fi + if [[ -n "$file_result" ]]; then + result="${result}${file_result} +" + fi + done if [[ -n "$result" ]]; then - # Count unique files with violations - violations=$(echo "$result" | cut -d: -f1 | sort -u | wc -l | tr -d ' ') + # Count unique files with violations (grep -c avoids SC2126) + violations=$(echo "$result" | grep -v '^$' | cut -d: -f1 | sort -u | grep -c . || true) local issue_count - issue_count=$(echo "$result" | wc -l | tr -d ' ') + issue_count=$(echo "$result" | grep -vc '^$' || true) print_error "ShellCheck: $violations files with $issue_count issues" # Show first few issues - echo "$result" | head -10 + echo "$result" | grep -v '^$' | head -10 if [[ $issue_count -gt 10 ]]; then echo "... 
and $((issue_count - 10)) more" fi + if [[ $timed_out -gt 0 ]]; then + print_warning "ShellCheck: $timed_out file(s) timed out (recursive source expansion)" + fi return 1 fi - print_success "ShellCheck: ${#ALL_SH_FILES[@]} files passed (no warnings)" + local msg="ShellCheck: ${file_count} files passed (no warnings)" + if [[ $timed_out -gt 0 ]]; then + msg="ShellCheck: $((file_count - timed_out)) of ${file_count} files passed, $timed_out timed out" + fi + print_success "$msg" return 0 } diff --git a/.agents/scripts/process-guard-helper.sh b/.agents/scripts/process-guard-helper.sh new file mode 100755 index 000000000..424a3dbfd --- /dev/null +++ b/.agents/scripts/process-guard-helper.sh @@ -0,0 +1,406 @@ +#!/usr/bin/env bash +# ============================================================================= +# Process Guard Helper - Monitor and kill runaway aidevops processes (t1398) +# ============================================================================= +# Replaces the concept from PR #2792 (memory-pressure-monitor.sh) with a +# script that monitors the RIGHT signals: individual process RSS, process +# runtime, process count, and session count — not kern.memorystatus_level. 
+# +# Usage: +# process-guard-helper.sh scan # One-shot scan and report +# process-guard-helper.sh kill-runaways # Kill processes exceeding limits +# process-guard-helper.sh sessions # Report interactive session count +# process-guard-helper.sh status # Full status report (JSON) +# process-guard-helper.sh help +# +# Integration: +# - pulse-wrapper.sh calls guard_child_processes() every 60s (inline) +# - This script provides standalone/cron usage for the same logic +# - Cron: */5 * * * * ~/.aidevops/agents/scripts/process-guard-helper.sh kill-runaways +# +# Configuration (environment variables): +# CHILD_RSS_LIMIT_KB - Max RSS per child process (default: 2097152 = 2GB) +# CHILD_RUNTIME_LIMIT - Max runtime in seconds (default: 600 = 10min) +# SHELLCHECK_RSS_LIMIT_KB - ShellCheck-specific RSS limit (default: 1048576 = 1GB) +# SHELLCHECK_RUNTIME_LIMIT - ShellCheck-specific runtime (default: 300 = 5min) +# SESSION_COUNT_WARN - Warn when >N interactive sessions (default: 5) +# +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit + +# Source shared constants for print_* functions +if [[ -f "${SCRIPT_DIR}/shared-constants.sh" ]]; then + # shellcheck source=shared-constants.sh + source "${SCRIPT_DIR}/shared-constants.sh" +fi + +# Validate integer config: prevents command injection via arithmetic expansion. +# Same pattern as pulse-wrapper.sh _validate_int. +_validate_int() { + local name="$1" value="$2" default="$3" min="${4:-0}" + if ! 
[[ "$value" =~ ^[0-9]+$ ]]; then + echo "[process-guard] Invalid ${name}: ${value} — using default ${default}" >&2 + printf '%s' "$default" + return 0 + fi + local canonical + canonical=$(printf '%d' "$((10#$value))") + if ((canonical < min)); then + echo "[process-guard] ${name}=${canonical} below minimum ${min} — using default ${default}" >&2 + printf '%s' "$default" + return 0 + fi + printf '%s' "$canonical" + return 0 +} + +# Configuration defaults +CHILD_RSS_LIMIT_KB="${CHILD_RSS_LIMIT_KB:-2097152}" +CHILD_RUNTIME_LIMIT="${CHILD_RUNTIME_LIMIT:-600}" +SHELLCHECK_RSS_LIMIT_KB="${SHELLCHECK_RSS_LIMIT_KB:-1048576}" +SHELLCHECK_RUNTIME_LIMIT="${SHELLCHECK_RUNTIME_LIMIT:-300}" +SESSION_COUNT_WARN="${SESSION_COUNT_WARN:-5}" + +# Validate all numeric config to prevent command injection via arithmetic expansion +CHILD_RSS_LIMIT_KB=$(_validate_int CHILD_RSS_LIMIT_KB "$CHILD_RSS_LIMIT_KB" 2097152 1) +CHILD_RUNTIME_LIMIT=$(_validate_int CHILD_RUNTIME_LIMIT "$CHILD_RUNTIME_LIMIT" 600 1) +SHELLCHECK_RSS_LIMIT_KB=$(_validate_int SHELLCHECK_RSS_LIMIT_KB "$SHELLCHECK_RSS_LIMIT_KB" 1048576 1) +SHELLCHECK_RUNTIME_LIMIT=$(_validate_int SHELLCHECK_RUNTIME_LIMIT "$SHELLCHECK_RUNTIME_LIMIT" 300 1) +SESSION_COUNT_WARN=$(_validate_int SESSION_COUNT_WARN "$SESSION_COUNT_WARN" 5 1) + +LOGFILE="${HOME}/.aidevops/logs/process-guard.log" + +mkdir -p "$(dirname "$LOGFILE")" || true + +####################################### +# Get process age in seconds (portable macOS + Linux) +# Arguments: +# $1 - PID +# Output: elapsed seconds via stdout +####################################### +_get_process_age() { + local pid="$1" + local etime + etime=$(ps -p "$pid" -o etime= 2>/dev/null | tr -d ' ') || etime="" + + if [[ -z "$etime" ]]; then + echo "0" + return 0 + fi + + local days=0 hours=0 minutes=0 seconds=0 + + if [[ "$etime" == *-* ]]; then + days="${etime%%-*}" + etime="${etime#*-}" + fi + + local colons_only="${etime//[!:]/}" + local colon_count="${#colons_only}" + + if [[ "$colon_count" -eq 
2 ]]; then + IFS=':' read -r hours minutes seconds <<<"$etime" + elif [[ "$colon_count" -eq 1 ]]; then + IFS=':' read -r minutes seconds <<<"$etime" + else + seconds="$etime" + fi + + [[ "$days" =~ ^[0-9]+$ ]] || days=0 + [[ "$hours" =~ ^[0-9]+$ ]] || hours=0 + [[ "$minutes" =~ ^[0-9]+$ ]] || minutes=0 + [[ "$seconds" =~ ^[0-9]+$ ]] || seconds=0 + + days=$((10#${days})) + hours=$((10#${hours})) + minutes=$((10#${minutes})) + seconds=$((10#${seconds})) + + echo $((days * 86400 + hours * 3600 + minutes * 60 + seconds)) + return 0 +} + +####################################### +# Scan all aidevops-related processes and report status +# Output: human-readable report to stdout +####################################### +cmd_scan() { + echo "=== Process Guard Scan ===" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" + + # Find all opencode/node processes related to aidevops + local total_rss_kb=0 + local process_count=0 + local violations=0 + + echo "--- AI Processes ---" + printf "%-8s %-6s %-10s %-5s %-12s %-8s %s\n" "PID" "RSS_MB" "RUNTIME" "TTY" "COMMAND" "STATUS" "DETAIL" + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + # Fields: pid, tty, rss, etime, command (command is last — may contain spaces) + local pid tty rss etime cmd_full + read -r pid tty rss etime cmd_full <<<"$line" + + [[ "$pid" =~ ^[0-9]+$ ]] || continue + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + + # Extract basename for limit selection (e.g., /usr/bin/shellcheck → shellcheck) + local cmd_base="${cmd_full%% *}" + cmd_base="${cmd_base##*/}" + + local rss_mb=$((rss / 1024)) + total_rss_kb=$((total_rss_kb + rss)) + process_count=$((process_count + 1)) + + local age_seconds + age_seconds=$(_get_process_age "$pid") + + local rss_limit="$CHILD_RSS_LIMIT_KB" + local runtime_limit="$CHILD_RUNTIME_LIMIT" + if [[ "$cmd_base" == "shellcheck" ]]; then + rss_limit="$SHELLCHECK_RSS_LIMIT_KB" + runtime_limit="$SHELLCHECK_RUNTIME_LIMIT" + fi + + local status="OK" + local detail="" + # TTY-attached 
processes are interactive — report but don't flag as violations + if [[ "$tty" != "?" && "$tty" != "??" ]]; then + status="INTERACTIVE" + detail="TTY=$tty (protected)" + elif [[ "$rss" -gt "$rss_limit" ]]; then + status="OVER_RSS" + detail="RSS ${rss_mb}MB > $((rss_limit / 1024))MB" + violations=$((violations + 1)) + elif [[ "$age_seconds" -gt "$runtime_limit" ]]; then + status="OVER_TIME" + detail="runtime ${age_seconds}s > ${runtime_limit}s" + violations=$((violations + 1)) + fi + + printf "%-8s %-6s %-10s %-5s %-12s %-8s %s\n" "$pid" "${rss_mb}MB" "$etime" "$tty" "$cmd_base" "$status" "$detail" + done < <(ps axo pid,tty,rss,etime,command | grep -E 'opencode|shellcheck|node.*opencode' | grep -v grep || true) + + echo "" + echo "Total: ${process_count} processes, $((total_rss_kb / 1024))MB RSS, ${violations} violation(s)" + + # Session count + echo "" + echo "--- Interactive Sessions ---" + local session_count + session_count=$(ps axo tty,command | awk ' + /(\.(opencode|claude)|opencode-ai|claude-ai)/ && !/awk/ && $1 != "?" && $1 != "??" { count++ } + END { print count + 0 } + ') || session_count=0 + echo "Interactive sessions: ${session_count} (threshold: ${SESSION_COUNT_WARN})" + if [[ "$session_count" -gt "$SESSION_COUNT_WARN" ]]; then + echo "WARNING: Session count exceeds threshold. Each session uses 100-440MB + language servers." 
+ fi + + return 0 +} + +####################################### +# Kill processes exceeding RSS or runtime limits +# Output: report of killed processes +####################################### +cmd_kill_runaways() { + local killed=0 + local total_freed_mb=0 + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + # Fields: pid, tty, rss, etime, command (command is last — may contain spaces) + local pid tty rss etime cmd_full + read -r pid tty rss etime cmd_full <<<"$line" + + [[ "$pid" =~ ^[0-9]+$ ]] || continue + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + + # Skip TTY-attached processes — these are interactive user sessions + if [[ "$tty" != "?" && "$tty" != "??" ]]; then + continue + fi + + local cmd_base="${cmd_full%% *}" + cmd_base="${cmd_base##*/}" + + local age_seconds + age_seconds=$(_get_process_age "$pid") + + local rss_limit="$CHILD_RSS_LIMIT_KB" + local runtime_limit="$CHILD_RUNTIME_LIMIT" + if [[ "$cmd_base" == "shellcheck" ]]; then + rss_limit="$SHELLCHECK_RSS_LIMIT_KB" + runtime_limit="$SHELLCHECK_RUNTIME_LIMIT" + fi + + local violation="" + if [[ "$rss" -gt "$rss_limit" ]]; then + local rss_mb=$((rss / 1024)) + violation="RSS ${rss_mb}MB > $((rss_limit / 1024))MB" + elif [[ "$age_seconds" -gt "$runtime_limit" ]]; then + violation="runtime ${age_seconds}s > ${runtime_limit}s" + fi + + if [[ -n "$violation" ]]; then + local rss_mb=$((rss / 1024)) + echo "Killing PID $pid ($cmd_base) — $violation" + echo "[process-guard] Killing PID $pid ($cmd_base) — $violation" >>"$LOGFILE" + kill "$pid" 2>/dev/null || true + sleep 1 + if kill -0 "$pid" 2>/dev/null; then + kill -9 "$pid" 2>/dev/null || true + fi + killed=$((killed + 1)) + total_freed_mb=$((total_freed_mb + rss_mb)) + fi + done < <(ps axo pid,tty,rss,etime,command | grep -E 'opencode|shellcheck|node.*opencode' | grep -v grep || true) + + if [[ "$killed" -gt 0 ]]; then + echo "Killed $killed process(es), freed ~${total_freed_mb}MB" + echo "[process-guard] Killed $killed process(es), freed 
~${total_freed_mb}MB" >>"$LOGFILE" + else + echo "No runaway processes found" + fi + return 0 +} + +####################################### +# Report interactive session count +####################################### +cmd_sessions() { + local session_count + session_count=$(ps axo tty,command | awk ' + /(\.(opencode|claude)|opencode-ai|claude-ai)/ && !/awk/ && $1 != "?" && $1 != "??" { count++ } + END { print count + 0 } + ') || session_count=0 + + echo "$session_count" + + if [[ "$session_count" -gt "$SESSION_COUNT_WARN" ]]; then + echo "WARNING: $session_count sessions open (threshold: $SESSION_COUNT_WARN)" >&2 + echo "Each session consumes 100-440MB + language servers (~50-100MB each)." >&2 + echo "Consider closing unused terminal tabs." >&2 + return 1 + fi + return 0 +} + +####################################### +# Full status report in JSON format +####################################### +cmd_status() { + local total_rss_kb=0 + local process_count=0 + local violations=0 + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + # Fields: pid, tty, rss, etime, command (command is last — may contain spaces) + local pid tty rss etime cmd_full + read -r pid tty rss etime cmd_full <<<"$line" + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + total_rss_kb=$((total_rss_kb + rss)) + process_count=$((process_count + 1)) + + # Skip TTY-attached processes — interactive user sessions + if [[ "$tty" != "?" && "$tty" != "??" 
]]; then + continue + fi + + local cmd_base="${cmd_full%% *}" + cmd_base="${cmd_base##*/}" + local age_seconds + age_seconds=$(_get_process_age "$pid") + local rss_limit="$CHILD_RSS_LIMIT_KB" + local runtime_limit="$CHILD_RUNTIME_LIMIT" + if [[ "$cmd_base" == "shellcheck" ]]; then + rss_limit="$SHELLCHECK_RSS_LIMIT_KB" + runtime_limit="$SHELLCHECK_RUNTIME_LIMIT" + fi + if [[ "$rss" -gt "$rss_limit" ]] || [[ "$age_seconds" -gt "$runtime_limit" ]]; then + violations=$((violations + 1)) + fi + done < <(ps axo pid,tty,rss,etime,command | grep -E 'opencode|shellcheck|node.*opencode' | grep -v grep || true) + + local session_count + session_count=$(ps axo tty,command | awk ' + /(\.(opencode|claude)|opencode-ai|claude-ai)/ && !/awk/ && $1 != "?" && $1 != "??" { count++ } + END { print count + 0 } + ') || session_count=0 + + # Available memory (Linux: /proc/meminfo; macOS: vm_stat) + local mem_avail_mb="unknown" + if [[ -f /proc/meminfo ]]; then + mem_avail_mb=$(awk '/MemAvailable/ {print int($2/1024)}' /proc/meminfo 2>/dev/null || echo "unknown") + elif [[ "$(uname)" == "Darwin" ]]; then + local page_size vm_free vm_inactive + page_size=$(sysctl -n hw.pagesize 2>/dev/null || echo "16384") + vm_free=$(vm_stat 2>/dev/null | awk '/Pages free/ {gsub(/\./,"",$3); print $3}') + vm_inactive=$(vm_stat 2>/dev/null | awk '/Pages inactive/ {gsub(/\./,"",$3); print $3}') + [[ "$page_size" =~ ^[0-9]+$ ]] || page_size=16384 + [[ "$vm_free" =~ ^[0-9]+$ ]] || vm_free=0 + [[ "$vm_inactive" =~ ^[0-9]+$ ]] || vm_inactive=0 + mem_avail_mb=$(((vm_free + vm_inactive) * page_size / 1048576)) + fi + + printf '{"timestamp":"%s","process_count":%d,"total_rss_mb":%d,"violations":%d,"session_count":%d,"session_warn_threshold":%d,"mem_available_mb":"%s"}\n' \ + "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + "$process_count" \ + "$((total_rss_kb / 1024))" \ + "$violations" \ + "$session_count" \ + "$SESSION_COUNT_WARN" \ + "$mem_avail_mb" + return 0 +} + +####################################### +# Help 
+####################################### +cmd_help() { + echo "process-guard-helper.sh — Monitor and kill runaway aidevops processes (t1398)" + echo "" + echo "Usage:" + echo " process-guard-helper.sh scan One-shot scan and report" + echo " process-guard-helper.sh kill-runaways Kill processes exceeding limits" + echo " process-guard-helper.sh sessions Report interactive session count" + echo " process-guard-helper.sh status Full status report (JSON)" + echo " process-guard-helper.sh help Show this help" + echo "" + echo "Configuration (environment variables):" + echo " CHILD_RSS_LIMIT_KB=${CHILD_RSS_LIMIT_KB} ($((CHILD_RSS_LIMIT_KB / 1024))MB)" + echo " CHILD_RUNTIME_LIMIT=${CHILD_RUNTIME_LIMIT}s" + echo " SHELLCHECK_RSS_LIMIT_KB=${SHELLCHECK_RSS_LIMIT_KB} ($((SHELLCHECK_RSS_LIMIT_KB / 1024))MB)" + echo " SHELLCHECK_RUNTIME_LIMIT=${SHELLCHECK_RUNTIME_LIMIT}s" + echo " SESSION_COUNT_WARN=${SESSION_COUNT_WARN}" + return 0 +} + +####################################### +# Main dispatch +####################################### +main() { + local command="${1:-help}" + + case "$command" in + scan) cmd_scan ;; + kill-runaways) cmd_kill_runaways ;; + sessions) cmd_sessions ;; + status) cmd_status ;; + help | --help | -h) cmd_help ;; + *) + echo "Unknown command: $command" >&2 + cmd_help >&2 + return 1 + ;; + esac +} + +main "$@" diff --git a/.agents/scripts/pulse-wrapper.sh b/.agents/scripts/pulse-wrapper.sh index 5e0517737..86aa5e0af 100755 --- a/.agents/scripts/pulse-wrapper.sh +++ b/.agents/scripts/pulse-wrapper.sh @@ -41,6 +41,13 @@ RAM_RESERVE_MB="${RAM_RESERVE_MB:-8192}" # 8 GB reserved for OS MAX_WORKERS_CAP="${MAX_WORKERS_CAP:-8}" # Hard ceiling regardless of RAM QUALITY_SWEEP_INTERVAL="${QUALITY_SWEEP_INTERVAL:-86400}" # 24 hours between sweeps +# Process guard limits (t1398) +CHILD_RSS_LIMIT_KB="${CHILD_RSS_LIMIT_KB:-2097152}" # 2 GB default — kill child if RSS exceeds this +CHILD_RUNTIME_LIMIT="${CHILD_RUNTIME_LIMIT:-600}" # 10 min default — kill child if runtime 
exceeds this +SHELLCHECK_RSS_LIMIT_KB="${SHELLCHECK_RSS_LIMIT_KB:-1048576}" # 1 GB — ShellCheck-specific (lower due to exponential expansion) +SHELLCHECK_RUNTIME_LIMIT="${SHELLCHECK_RUNTIME_LIMIT:-300}" # 5 min — ShellCheck-specific +SESSION_COUNT_WARN="${SESSION_COUNT_WARN:-5}" # Warn when >N concurrent sessions detected + # Validate numeric configuration — prevent command injection via $(( )) expansion. # Bash arithmetic evaluates variable contents as expressions, so unsanitised strings # like "a[$(cmd)]" would execute arbitrary commands. @@ -70,6 +77,11 @@ RAM_PER_WORKER_MB=$(_validate_int RAM_PER_WORKER_MB "$RAM_PER_WORKER_MB" 1024 1) RAM_RESERVE_MB=$(_validate_int RAM_RESERVE_MB "$RAM_RESERVE_MB" 8192) MAX_WORKERS_CAP=$(_validate_int MAX_WORKERS_CAP "$MAX_WORKERS_CAP" 8) QUALITY_SWEEP_INTERVAL=$(_validate_int QUALITY_SWEEP_INTERVAL "$QUALITY_SWEEP_INTERVAL" 86400) +CHILD_RSS_LIMIT_KB=$(_validate_int CHILD_RSS_LIMIT_KB "$CHILD_RSS_LIMIT_KB" 2097152 1) +CHILD_RUNTIME_LIMIT=$(_validate_int CHILD_RUNTIME_LIMIT "$CHILD_RUNTIME_LIMIT" 600 1) +SHELLCHECK_RSS_LIMIT_KB=$(_validate_int SHELLCHECK_RSS_LIMIT_KB "$SHELLCHECK_RSS_LIMIT_KB" 1048576 1) +SHELLCHECK_RUNTIME_LIMIT=$(_validate_int SHELLCHECK_RUNTIME_LIMIT "$SHELLCHECK_RUNTIME_LIMIT" 300 1) +SESSION_COUNT_WARN=$(_validate_int SESSION_COUNT_WARN "$SESSION_COUNT_WARN" 5 1) # Sanitise untrusted strings before embedding in GitHub markdown comments. 
# Strips @ mentions (prevents unwanted notifications) and backtick sequences @@ -792,7 +804,7 @@ check_permission_failure_pr() { ####################################### prefetch_active_workers() { local worker_lines - worker_lines=$(ps axo pid,etime,command 2>/dev/null | grep '/full-loop' | grep '\.opencode' | grep -v grep || true) + worker_lines=$(ps axo pid,etime,command | grep '/full-loop' | grep '\.opencode' | grep -v grep || true) echo "" echo "# Active Workers" @@ -811,9 +823,7 @@ prefetch_active_workers() { echo "" echo "$worker_lines" | while IFS= read -r line; do local pid etime cmd - pid=$(echo "$line" | awk '{print $1}') - etime=$(echo "$line" | awk '{print $2}') - cmd=$(echo "$line" | cut -d' ' -f3-) + read -r pid etime cmd <<<"$line" # Compute elapsed seconds for struggle ratio local elapsed_seconds @@ -842,6 +852,123 @@ prefetch_active_workers() { return 0 } +####################################### +# Process guard: kill child processes exceeding RSS or runtime limits (t1398) +# +# Scans all child processes of the current pulse (and their descendants) +# for resource violations. ShellCheck processes get stricter limits due +# to their known exponential expansion with --external-sources. +# +# This is the primary defense against the March 3 kernel panic scenario: +# a single shellcheck invocation consuming 5+ GB RAM for 35+ minutes. +# +# Called from the watchdog loop inside run_pulse() every 60s. +# +# Arguments: +# $1 - (optional) PID of the primary pulse process to exempt from +# CHILD_RUNTIME_LIMIT (governed by PULSE_STALE_THRESHOLD instead) +# Returns: 0 always (best-effort, never breaks the pulse) +####################################### +guard_child_processes() { + local pulse_pid="${1:-}" + local killed=0 + local total_freed_mb=0 + + # Get all descendant PIDs of the current shell process. + # Use 'command' (full command line) instead of 'comm' (basename only) + # so that patterns like 'node.*opencode' can match. 
(CodeRabbit review) + local descendants + descendants=$(ps -eo pid,ppid,rss,etime,command | awk -v parent=$$ ' + BEGIN { pids[parent]=1 } + { if ($2 in pids) { pids[$1]=1; print $0 } } + ') || return 0 + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + + # Fields from ps -eo pid,ppid,rss,etime,command + # command is last and may contain spaces — read captures the rest + local pid _ppid rss etime cmd_full + read -r pid _ppid rss etime cmd_full <<<"$line" + + # Validate numeric fields + [[ "$pid" =~ ^[0-9]+$ ]] || continue + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + + local age_seconds + age_seconds=$(_get_process_age "$pid") + + # Extract basename for limit selection (e.g., /usr/bin/shellcheck → shellcheck) + local cmd_base="${cmd_full%% *}" + cmd_base="${cmd_base##*/}" + + # Determine limits: ShellCheck gets stricter limits + local rss_limit="$CHILD_RSS_LIMIT_KB" + local runtime_limit="$CHILD_RUNTIME_LIMIT" + if [[ "$cmd_base" == "shellcheck" ]]; then + rss_limit="$SHELLCHECK_RSS_LIMIT_KB" + runtime_limit="$SHELLCHECK_RUNTIME_LIMIT" + fi + + local violation="" + if [[ "$rss" -gt "$rss_limit" ]]; then + local rss_mb=$((rss / 1024)) + local limit_mb=$((rss_limit / 1024)) + violation="RSS ${rss_mb}MB > ${limit_mb}MB limit" + elif [[ -n "$pulse_pid" && "$pid" == "$pulse_pid" ]]; then + # Primary pulse process — runtime governed by PULSE_STALE_THRESHOLD, + # not CHILD_RUNTIME_LIMIT. Skip runtime check but keep RSS check. 
+ : + elif [[ "$age_seconds" -gt "$runtime_limit" ]]; then + violation="runtime ${age_seconds}s > ${runtime_limit}s limit" + fi + + if [[ -n "$violation" ]]; then + local rss_mb=$((rss / 1024)) + echo "[pulse-wrapper] Process guard: killing PID $pid ($cmd_base) — $violation" >>"$LOGFILE" + _kill_tree "$pid" + sleep 1 + if kill -0 "$pid" 2>/dev/null; then + _force_kill_tree "$pid" + fi + killed=$((killed + 1)) + total_freed_mb=$((total_freed_mb + rss_mb)) + fi + done <<<"$descendants" + + if [[ "$killed" -gt 0 ]]; then + echo "[pulse-wrapper] Process guard: killed $killed process(es), freed ~${total_freed_mb}MB" >>"$LOGFILE" + fi + return 0 +} + +####################################### +# Check concurrent session count and warn (t1398) +# +# Counts running opencode/claude interactive sessions (those with a TTY). +# If count exceeds SESSION_COUNT_WARN, logs a warning. This is informational +# — the pulse doesn't kill user sessions, but the health issue will show it. +# +# Returns: session count via stdout +####################################### +check_session_count() { + local interactive_count=0 + + # Count opencode processes with a real TTY (interactive sessions). + # Filter both '?' (Linux) and '??' (macOS) headless TTY entries. + interactive_count=$(ps axo tty,command | awk ' + /(\.(opencode|claude)|opencode-ai|claude-ai)/ && !/awk/ && $1 != "?" && $1 != "??" { count++ } + END { print count + 0 } + ') || interactive_count=0 + + if [[ "$interactive_count" -gt "$SESSION_COUNT_WARN" ]]; then + echo "[pulse-wrapper] Session warning: $interactive_count interactive sessions open (threshold: $SESSION_COUNT_WARN). Each consumes 100-440MB + language servers. Consider closing unused tabs." 
>>"$LOGFILE" + fi + + echo "$interactive_count" + return 0 +} + ####################################### # Run the pulse — with internal watchdog timeout (t1397) # @@ -850,6 +977,9 @@ prefetch_active_workers() { # it kills the process tree and returns, allowing the wrapper to continue # to the quality sweep and health issue phases. # +# The watchdog also runs guard_child_processes() every 60s to kill any +# child process exceeding RSS or runtime limits (t1398). +# # Previous design relied on the NEXT launchd invocation's check_dedup() # to kill stale processes. This failed because launchd StartInterval only # fires when the previous invocation has exited — and the wrapper blocks @@ -886,8 +1016,9 @@ ${state_content} echo "[pulse-wrapper] opencode PID: $opencode_pid" >>"$LOGFILE" # Watchdog loop: check every 60s if the process is still alive and within - # the stale threshold. This replaces the bare `wait` that blocked the - # wrapper indefinitely when opencode hung. + # the stale threshold. Also runs process guard to kill runaway children (t1398). + # This replaces the bare `wait` that blocked the wrapper indefinitely when + # opencode hung. while kill -0 "$opencode_pid" 2>/dev/null; do local now now=$(date +%s) @@ -902,6 +1033,10 @@ ${state_content} fi break fi + # Process guard: kill children exceeding RSS/runtime limits (t1398) + # Pass opencode_pid so the primary pulse process is exempt from + # CHILD_RUNTIME_LIMIT (it's governed by PULSE_STALE_THRESHOLD above). + guard_child_processes "$opencode_pid" # Sleep 60s then re-check. Portable across bash 3.2+ (macOS default). # The process may exit during sleep — kill -0 at top of loop catches that. 
sleep 60 @@ -1152,19 +1287,17 @@ _update_health_issue_for_repo() { local workers_md="" local worker_count=0 local worker_lines - worker_lines=$(ps axo pid,tty,etime,command 2>/dev/null | grep '\.opencode' | grep -v grep | grep -v 'bash-language-server' || true) + worker_lines=$(ps axo pid,tty,etime,command | grep '\.opencode' | grep -v grep | grep -v 'bash-language-server' || true) if [[ -n "$worker_lines" ]]; then local worker_table="" while IFS= read -r line; do local w_pid w_tty w_etime w_cmd - w_pid=$(echo "$line" | awk '{print $1}') - w_tty=$(echo "$line" | awk '{print $2}') - w_etime=$(echo "$line" | awk '{print $3}') - w_cmd=$(echo "$line" | cut -d' ' -f4-) + read -r w_pid w_tty w_etime w_cmd <<<"$line" - # Only count headless workers (no TTY) - [[ "$w_tty" != "??" ]] && continue + # Only count headless workers (no TTY). + # Exclude both '?' (Linux headless) and '??' (macOS headless). + [[ "$w_tty" != "?" && "$w_tty" != "??" ]] && continue # Extract title if present (--title "...") local w_title="headless" @@ -1285,6 +1418,14 @@ ${worker_table}" max_workers=$(cat "$max_workers_file" 2>/dev/null || echo "?") fi + # Interactive session count (t1398) + local session_count + session_count=$(check_session_count) + local session_warning="" + if [[ "$session_count" -gt "$SESSION_COUNT_WARN" ]]; then + session_warning=" **WARNING: exceeds threshold of ${SESSION_COUNT_WARN}**" + fi + # --- Assemble body --- local body body="## Queue Health Dashboard @@ -1303,6 +1444,7 @@ ${worker_table}" | Active Workers | ${worker_count} | | Max Workers | ${max_workers} | | Worktrees | ${wt_count} | +| Interactive Sessions | ${session_count}${session_warning} | ### Open PRs @@ -1712,49 +1854,71 @@ _quality_sweep_for_repo() { local sc_summary="" local sc_details="" - while IFS= read -r shfile; do - [[ -z "$shfile" ]] && continue - local result - result=$(shellcheck -f gcc "$shfile" 2>/dev/null || true) - if [[ -n "$result" ]]; then - local file_errors - file_errors=$(echo "$result" | 
grep -c ':.*: error:') || file_errors=0 - local file_warnings - file_warnings=$(echo "$result" | grep -c ':.*: warning:') || file_warnings=0 - sc_errors=$((sc_errors + file_errors)) - sc_warnings=$((sc_warnings + file_warnings)) - - # Capture first 3 findings per file for the summary - local rel_path="${shfile#"$repo_path"/}" - local top_findings - top_findings=$(echo "$result" | head -3 | while IFS= read -r line; do - echo " - \`${rel_path}\`: ${line##*: }" - done) - if [[ -n "$top_findings" ]]; then - sc_details="${sc_details}${top_findings} + # Determine timeout command (t1398: prevent runaway shellcheck) + local sc_timeout_cmd="" + if command -v timeout &>/dev/null; then + sc_timeout_cmd="timeout 30s" + elif command -v gtimeout &>/dev/null; then + sc_timeout_cmd="gtimeout 30s" + fi + + # t1398: skip shellcheck entirely when no timeout utility is available. + # Running shellcheck without a timeout risks unbounded execution + # (exponential source expansion can consume 5+ GB RAM for 35+ min). 
+ if [[ -z "$sc_timeout_cmd" ]]; then + echo "[pulse-wrapper] WARNING: no timeout/gtimeout available — skipping ShellCheck to avoid runaway risk" >>"$LOGFILE" + shellcheck_section="### ShellCheck (skipped) + +- **Reason**: no timeout utility available — skipping to prevent runaway expansion risk " + else + + while IFS= read -r shfile; do + [[ -z "$shfile" ]] && continue + local result + # t1398: timeout each shellcheck invocation to prevent exponential expansion + result=$($sc_timeout_cmd shellcheck -f gcc "$shfile" || true) + if [[ -n "$result" ]]; then + local file_errors + file_errors=$(grep -c ':.*: error:' <<<"$result") || file_errors=0 + local file_warnings + file_warnings=$(grep -c ':.*: warning:' <<<"$result") || file_warnings=0 + sc_errors=$((sc_errors + file_errors)) + sc_warnings=$((sc_warnings + file_warnings)) + + # Capture first 3 findings per file for the summary + local rel_path="${shfile#"$repo_path"/}" + local top_findings + top_findings=$(head -3 <<<"$result" | while IFS= read -r line; do + echo " - \`${rel_path}\`: ${line##*: }" + done) + if [[ -n "$top_findings" ]]; then + sc_details="${sc_details}${top_findings} +" + fi fi - fi - done <<<"$sh_files" + done <<<"$sh_files" - local file_count - file_count=$(echo "$sh_files" | wc -l | tr -d ' ') - shellcheck_section="### ShellCheck ($file_count files scanned) + local file_count + file_count=$(echo "$sh_files" | wc -l | tr -d ' ') + shellcheck_section="### ShellCheck ($file_count files scanned) - **Errors**: ${sc_errors} - **Warnings**: ${sc_warnings} " - if [[ -n "$sc_details" ]]; then - shellcheck_section="${shellcheck_section} + if [[ -n "$sc_details" ]]; then + shellcheck_section="${shellcheck_section} **Top findings:** ${sc_details}" - fi - if [[ "$sc_errors" -eq 0 && "$sc_warnings" -eq 0 ]]; then - shellcheck_section="${shellcheck_section} + fi + if [[ "$sc_errors" -eq 0 && "$sc_warnings" -eq 0 ]]; then + shellcheck_section="${shellcheck_section} _All clear — no issues found._ " - fi - 
tool_count=$((tool_count + 1)) + fi + tool_count=$((tool_count + 1)) + + fi # end: sc_timeout_cmd non-empty guard fi fi @@ -2084,6 +2248,7 @@ main() { cleanup_orphans cleanup_worktrees calculate_max_workers + check_session_count >/dev/null prefetch_state run_pulse run_daily_quality_sweep @@ -2110,14 +2275,12 @@ cleanup_orphans() { while IFS= read -r line; do local pid tty etime rss cmd - pid=$(echo "$line" | awk '{print $1}') - tty=$(echo "$line" | awk '{print $2}') - etime=$(echo "$line" | awk '{print $3}') - rss=$(echo "$line" | awk '{print $4}') - cmd=$(echo "$line" | cut -d' ' -f5-) - - # Skip interactive sessions (has a real TTY) - if [[ "$tty" != "??" ]]; then + read -r pid tty etime rss cmd <<<"$line" + + # Skip interactive sessions (has a real TTY). + # Exclude both '?' (Linux headless) and '??' (macOS headless) — only + # those are headless; anything else (pts/N, ttys00N) is interactive. + if [[ "$tty" != "?" && "$tty" != "??" ]]; then continue fi @@ -2144,13 +2307,9 @@ cleanup_orphans() { # Also kill orphaned node launchers (parent of .opencode processes) while IFS= read -r line; do local pid tty etime rss cmd - pid=$(echo "$line" | awk '{print $1}') - tty=$(echo "$line" | awk '{print $2}') - etime=$(echo "$line" | awk '{print $3}') - rss=$(echo "$line" | awk '{print $4}') - cmd=$(echo "$line" | cut -d' ' -f5-) + read -r pid tty etime rss cmd <<<"$line" - [[ "$tty" != "??" ]] && continue + [[ "$tty" != "?" && "$tty" != "??" ]] && continue echo "$cmd" | grep -qE '/full-loop|Supervisor Pulse|Strategic Review|language-server|eslintServer' && continue local age_seconds