Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 89 additions & 34 deletions .agents/scripts/memory-pressure-monitor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
# processes (ShellCheck at 5.7 GB, zombie pulses, session accumulation), not by
# generic OS memory pressure.
#
# Process classification (GH#2992):
# - "App" processes (Claude, Electron, ShipIt, OpenCode): long-running by design.
# Runtime alerts are skipped — only RSS is monitored. These processes run for
# hours/days and runtime warnings are 100% false positives (~5,400/day noise).
# - "Tool" processes (shellcheck, language servers, node workers): short-lived.
# Both RSS and runtime alerts are active.
#
# Auto-kill (GH#2915): ShellCheck processes that hit CRITICAL RSS or exceed
# runtime limits are automatically killed. This is safe because the bash
# language server respawns them. The root cause (source-path=SCRIPTDIR in
Expand All @@ -26,8 +33,9 @@
#
# Process-level thresholds (primary signals):
# RSS per process: > 2 GB → warning, > 4 GB → critical (kill candidate)
# Runtime: > 10 min for shellcheck, > 30 min for other tools
# Session count: > 5 concurrent interactive sessions → warning
# Runtime (tools): > 10 min for shellcheck, > 30 min for other tools
# Runtime (apps): skipped — long-running by design
# Session count: > 8 concurrent interactive sessions → warning
# Total aidevops: > 8 GB aggregate RSS → warning
#
# OS-level thresholds (secondary, informational only):
Expand All @@ -39,7 +47,7 @@
# PROCESS_RSS_CRIT_MB Per-process RSS critical (default: 4096)
# SHELLCHECK_RUNTIME_MAX ShellCheck max runtime in seconds (default: 600)
# TOOL_RUNTIME_MAX Other tool max runtime in seconds (default: 1800)
# SESSION_COUNT_WARN Interactive session warning threshold (default: 5)
# SESSION_COUNT_WARN Interactive session warning threshold (default: 8)
# AGGREGATE_RSS_WARN_MB Total aidevops RSS warning (default: 8192)
# AUTO_KILL_SHELLCHECK Auto-kill runaway ShellCheck (default: true)
# MEMORY_COOLDOWN_SECS Notification cooldown per category (default: 300)
Expand All @@ -51,7 +59,7 @@ set -euo pipefail
# --- Configuration -----------------------------------------------------------

readonly SCRIPT_NAME="memory-pressure-monitor"
readonly SCRIPT_VERSION="1.1.0"
readonly SCRIPT_VERSION="2.0.0"

# Per-process RSS thresholds (MB)
PROCESS_RSS_WARN_MB="${PROCESS_RSS_WARN_MB:-2048}"
Expand All @@ -62,7 +70,7 @@ SHELLCHECK_RUNTIME_MAX="${SHELLCHECK_RUNTIME_MAX:-600}" # 10 min
TOOL_RUNTIME_MAX="${TOOL_RUNTIME_MAX:-1800}" # 30 min

# Session/aggregate thresholds
SESSION_COUNT_WARN="${SESSION_COUNT_WARN:-5}"
SESSION_COUNT_WARN="${SESSION_COUNT_WARN:-8}"
AGGREGATE_RSS_WARN_MB="${AGGREGATE_RSS_WARN_MB:-8192}" # 8 GB total

# Auto-kill: ShellCheck processes are safe to kill (language server respawns them)
Expand Down Expand Up @@ -92,6 +100,16 @@ readonly MONITORED_PATTERNS=(
"bash-language-server"
)

# App processes: long-running by design, runtime alerts are false positives (GH#2992).
# Only RSS is monitored for these. Matched against the short command name (basename).
# Case-insensitive matching via _is_app_process().
# NOTE: _is_app_process() strips one leading dot from the observed command
# name and compares the whole name case-insensitively, so list each app here
# once, without a dot prefix; the casing used in this list does not matter.
readonly APP_PROCESS_NAMES=(
"claude"
"electron"
"shipit"
"opencode"
)

# --- Validation ---------------------------------------------------------------

# Validate numeric configuration — prevent command injection via $(( )) expansion.
Expand All @@ -117,7 +135,7 @@ PROCESS_RSS_WARN_MB=$(_validate_int PROCESS_RSS_WARN_MB "$PROCESS_RSS_WARN_MB" 2
PROCESS_RSS_CRIT_MB=$(_validate_int PROCESS_RSS_CRIT_MB "$PROCESS_RSS_CRIT_MB" 4096 512)
SHELLCHECK_RUNTIME_MAX=$(_validate_int SHELLCHECK_RUNTIME_MAX "$SHELLCHECK_RUNTIME_MAX" 600 60)
TOOL_RUNTIME_MAX=$(_validate_int TOOL_RUNTIME_MAX "$TOOL_RUNTIME_MAX" 1800 120)
SESSION_COUNT_WARN=$(_validate_int SESSION_COUNT_WARN "$SESSION_COUNT_WARN" 5 2)
SESSION_COUNT_WARN=$(_validate_int SESSION_COUNT_WARN "$SESSION_COUNT_WARN" 8 2)
AGGREGATE_RSS_WARN_MB=$(_validate_int AGGREGATE_RSS_WARN_MB "$AGGREGATE_RSS_WARN_MB" 8192 1024)
COOLDOWN_SECS=$(_validate_int COOLDOWN_SECS "$COOLDOWN_SECS" 300 30)
DAEMON_INTERVAL=$(_validate_int DAEMON_INTERVAL "$DAEMON_INTERVAL" 60 10)
Expand Down Expand Up @@ -305,6 +323,28 @@ _format_duration() {
return 0
}

# Check if a command name is an "app" process (long-running by design).
# App processes only get RSS monitoring — runtime alerts are skipped.
#
# The comparison is a whole-name, case-insensitive match against
# APP_PROCESS_NAMES. It is done entirely in-shell (no printf|tr subshells),
# because this function is called once per monitored process on every check.
#
# Globals:   APP_PROCESS_NAMES (read)
# Arguments: $1=command name (short basename); may be empty or omitted
# Returns:   0 if app process, 1 if tool process (or if the name is empty)
_is_app_process() {
  local cmd_name="${1:-}"
  # Strip leading dot (e.g., ".opencode" → "opencode") — some binaries
  # are installed with a dot-prefixed wrapper name
  cmd_name="${cmd_name#.}"

  # An empty name can never be an app; bail out before touching shell options.
  if [[ -z "$cmd_name" ]]; then
    return 1
  fi

  # Case-insensitive matching via nocasematch (available since bash 3.1, so
  # bash 3.2 compatible). Remember the caller's setting so it can be restored.
  local had_nocasematch=1
  shopt -q nocasematch || had_nocasematch=0
  shopt -s nocasematch

  local app_name
  local result=1
  for app_name in "${APP_PROCESS_NAMES[@]}"; do
    # Quoted right-hand side: literal comparison, no glob interpretation.
    if [[ "$cmd_name" == "$app_name" ]]; then
      result=0
      break
    fi
  done

  # Restore the previous nocasematch state so callers are unaffected.
  if [[ "$had_nocasematch" -eq 0 ]]; then
    shopt -u nocasematch
  fi

  return "$result"
}
Comment on lines +330 to +346
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The _is_app_process function can be simplified and made more efficient. The current implementation uses a shell for loop and invokes tr for each item on every call, which is not optimal for performance.

A more idiomatic approach for this kind of set membership test in shell scripting is to use grep. A single grep -qixF command can perform a case-insensitive, fixed-string, whole-line match against the list of app names. This is more concise, easier to maintain, and more performant as it avoids a shell loop and repeated sub-process calls.

Suggested change
# Decide whether a short command name belongs to the long-running "app"
# group listed in APP_PROCESS_NAMES.
# Arguments: $1=command name (short basename)
# Returns: 0 when the name is a known app, 1 otherwise
_is_app_process() {
  # Drop a single leading dot so wrapper names such as ".opencode" match
  local candidate="${1#.}"

  # Fold to lowercase with tr (works on bash 3.2, which lacks ${var,,})
  candidate=$(printf '%s' "$candidate" | tr '[:upper:]' '[:lower:]')

  local idx known
  for idx in "${!APP_PROCESS_NAMES[@]}"; do
    known=$(printf '%s' "${APP_PROCESS_NAMES[$idx]}" | tr '[:upper:]' '[:lower:]')
    # Whole-name equality only; keep scanning until one entry matches
    [[ "$candidate" != "$known" ]] || return 0
  done

  return 1
}
# Classify a short command name as "app" (0) or "tool" (1) by looking it up
# in APP_PROCESS_NAMES.
# Arguments: $1=command name (short basename)
# Returns: grep's exit status — 0 on a match, 1 when nothing matches
_is_app_process() {
  # A single leading dot is removed first (".opencode" → "opencode"),
  # since some installs use a dot-prefixed wrapper name
  local needle="${1#.}"

  # -i ignore case, -x whole line, -F literal string, -q silent.
  # The list is fed one name per line.
  grep -qixF -- "$needle" < <(printf '%s\n' "${APP_PROCESS_NAMES[@]}")
}


# Collect all monitored processes with their RSS and runtime
# Output: one line per process: PID|RSS_MB|RUNTIME_SECS|COMMAND_NAME|FULL_COMMAND
_collect_monitored_processes() {
Expand Down Expand Up @@ -473,22 +513,24 @@ do_check() {
has_warning=true
fi

# Runtime check — different limits for shellcheck vs other tools
local runtime_limit="$TOOL_RUNTIME_MAX"
if [[ "$cmd_name" == "shellcheck" ]]; then
runtime_limit="$SHELLCHECK_RUNTIME_MAX"
fi
# Runtime check — skip for app processes (long-running by design, GH#2992)
if ! _is_app_process "$cmd_name"; then
local runtime_limit="$TOOL_RUNTIME_MAX"
if [[ "$cmd_name" == "shellcheck" ]]; then
runtime_limit="$SHELLCHECK_RUNTIME_MAX"
fi

if [[ "$runtime" -gt "$runtime_limit" ]]; then
local duration
duration=$(_format_duration "$runtime")
local limit_duration
limit_duration=$(_format_duration "$runtime_limit")
findings+=("WARNING|runtime|${pid}|${cmd_name} running for ${duration} (limit: ${limit_duration})")
has_warning=true
# Auto-kill ShellCheck exceeding runtime — stuck in source chain expansion
if [[ "$cmd_name" == "shellcheck" && "$AUTO_KILL_SHELLCHECK" == "true" ]]; then
_auto_kill_process "$pid" "runtime ${duration} exceeds ${limit_duration} limit"
if [[ "$runtime" -gt "$runtime_limit" ]]; then
local duration
duration=$(_format_duration "$runtime")
local limit_duration
limit_duration=$(_format_duration "$runtime_limit")
findings+=("WARNING|runtime|${pid}|${cmd_name} running for ${duration} (limit: ${limit_duration})")
has_warning=true
# Auto-kill ShellCheck exceeding runtime — stuck in source chain expansion
if [[ "$cmd_name" == "shellcheck" && "$AUTO_KILL_SHELLCHECK" == "true" ]]; then
_auto_kill_process "$pid" "runtime ${duration} exceeds ${limit_duration} limit"
fi
fi
fi
done <<<"$processes"
Expand Down Expand Up @@ -593,8 +635,8 @@ cmd_status() {
if [[ -z "$processes" ]]; then
echo " No monitored processes running"
else
printf " %-8s %-8s %-12s %-20s %s\n" "PID" "RSS MB" "Runtime" "Command" "Status"
printf " %-8s %-8s %-12s %-20s %s\n" "---" "------" "-------" "-------" "------"
printf " %-8s %-8s %-12s %-20s %-6s %s\n" "PID" "RSS MB" "Runtime" "Command" "Type" "Status"
printf " %-8s %-8s %-12s %-20s %-6s %s\n" "---" "------" "-------" "-------" "----" "------"

while IFS='|' read -r pid rss_mb runtime cmd_name full_cmd; do
[[ -z "$pid" ]] && continue
Expand All @@ -604,24 +646,32 @@ cmd_status() {
local duration
duration=$(_format_duration "$runtime")

local proc_type="tool"
if _is_app_process "$cmd_name"; then
proc_type="app"
fi

local status="ok"
if [[ "$rss_mb" -ge "$PROCESS_RSS_CRIT_MB" ]]; then
status="CRITICAL (RSS)"
elif [[ "$rss_mb" -ge "$PROCESS_RSS_WARN_MB" ]]; then
status="WARNING (RSS)"
fi

local runtime_limit="$TOOL_RUNTIME_MAX"
[[ "$cmd_name" == "shellcheck" ]] && runtime_limit="$SHELLCHECK_RUNTIME_MAX"
if [[ "$runtime" -gt "$runtime_limit" ]]; then
if [[ "$status" == "ok" ]]; then
status="WARNING (runtime)"
else
status="${status}, WARNING (runtime)"
# Runtime check only for tool processes (apps are long-running by design)
if [[ "$proc_type" == "tool" ]]; then
local runtime_limit="$TOOL_RUNTIME_MAX"
[[ "$cmd_name" == "shellcheck" ]] && runtime_limit="$SHELLCHECK_RUNTIME_MAX"
if [[ "$runtime" -gt "$runtime_limit" ]]; then
if [[ "$status" == "ok" ]]; then
status="WARNING (runtime)"
else
status="${status}, WARNING (runtime)"
fi
fi
fi

printf " %-8s %-8s %-12s %-20s %s\n" "$pid" "$rss_mb" "$duration" "$cmd_name" "$status"
printf " %-8s %-8s %-12s %-20s %-6s %s\n" "$pid" "$rss_mb" "$duration" "$cmd_name" "$proc_type" "$status"
done <<<"$processes"

echo ""
Expand Down Expand Up @@ -784,10 +834,15 @@ Commands:
--uninstall, -u Remove launchd plist and state files
--help, -h Show this help

Process classification (GH#2992):
App processes (claude, electron, shipit, opencode): RSS only, no runtime alerts
Tool processes (shellcheck, language servers): RSS + runtime alerts

Process-level thresholds (primary):
Per-process RSS: warning=${PROCESS_RSS_WARN_MB}MB, critical=${PROCESS_RSS_CRIT_MB}MB
ShellCheck max: $(_format_duration "$SHELLCHECK_RUNTIME_MAX")
Tool runtime max: $(_format_duration "$TOOL_RUNTIME_MAX")
ShellCheck max: $(_format_duration "$SHELLCHECK_RUNTIME_MAX") (tool)
Tool runtime max: $(_format_duration "$TOOL_RUNTIME_MAX") (tool)
App runtime: not checked (long-running by design)
Session count: warning >= ${SESSION_COUNT_WARN}
Aggregate RSS: warning >= ${AGGREGATE_RSS_WARN_MB}MB

Expand All @@ -801,7 +856,7 @@ Environment variables:
PROCESS_RSS_CRIT_MB Per-process RSS critical (default: 4096)
SHELLCHECK_RUNTIME_MAX ShellCheck max runtime in seconds (default: 600)
TOOL_RUNTIME_MAX Other tool max runtime in seconds (default: 1800)
SESSION_COUNT_WARN Interactive session warning threshold (default: 5)
SESSION_COUNT_WARN Interactive session warning threshold (default: 8)
AGGREGATE_RSS_WARN_MB Total aidevops RSS warning (default: 8192)
AUTO_KILL_SHELLCHECK Auto-kill runaway ShellCheck (default: true)
MEMORY_COOLDOWN_SECS Notification cooldown per category (default: 300)
Expand Down
89 changes: 89 additions & 0 deletions tests/test-memory-pressure-monitor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# - Cooldown logic (check_cooldown, set_cooldown, clear_cooldown)
# - OS memory info collection (_get_os_memory_info)
# - Interactive session counting (_count_interactive_sessions)
# - Process classification: app vs tool (_is_app_process) (GH#2992)
# - CLI commands (--help, --status, --check)
#
# Uses isolated temp directories to avoid touching production data.
Expand Down Expand Up @@ -326,6 +327,94 @@ test_session_count_numeric() {

test_session_count_numeric

# ============================================================================
section "Process Classification (GH#2992)"
# ============================================================================

# "claude" must be recognised as an app process.
test_is_app_process_claude() {
  local desc="claude is classified as app"
  if ! _is_app_process "claude"; then
    fail "$desc"
    return
  fi
  pass "$desc"
}

# Capitalised "Electron" must still be recognised as an app process.
test_is_app_process_electron() {
  local desc="Electron is classified as app (case-insensitive)"
  local rc=0
  _is_app_process "Electron" || rc=$?
  case "$rc" in
    0) pass "$desc" ;;
    *) fail "$desc" ;;
  esac
}

# "opencode" must be recognised as an app process.
test_is_app_process_opencode() {
  local desc="opencode is classified as app"
  if ! _is_app_process "opencode"; then
    fail "$desc"
    return
  fi
  pass "$desc"
}

# Mixed-case "ShipIt" must still be recognised as an app process.
test_is_app_process_shipit() {
  local desc="ShipIt is classified as app (case-insensitive)"
  local rc=0
  _is_app_process "ShipIt" || rc=$?
  case "$rc" in
    0) pass "$desc" ;;
    *) fail "$desc" ;;
  esac
}

# "shellcheck" must NOT be treated as an app, i.e. it is a tool process.
test_is_tool_process_shellcheck() {
  local desc="shellcheck is classified as tool"
  if _is_app_process "shellcheck"; then
    fail "$desc" "Was classified as app"
  else
    pass "$desc"
  fi
}

# "node" must NOT be treated as an app, i.e. it is a tool process.
test_is_tool_process_node() {
  local desc="node is classified as tool"
  if _is_app_process "node"; then
    fail "$desc" "Was classified as app"
  else
    pass "$desc"
  fi
}

# A name that is not in the app list must fall through to "tool".
test_is_tool_process_unknown() {
  local desc="Unknown process is classified as tool"
  if _is_app_process "some-random-process"; then
    fail "$desc" "Was classified as app"
  else
    pass "$desc"
  fi
}

# The session-count warning threshold is expected to be 8
# (raised from 5 in GH#2992); report the observed value on mismatch.
test_session_count_default_threshold() {
  local desc="Session count threshold default is 8"
  if ! [[ "$SESSION_COUNT_WARN" -eq 8 ]]; then
    fail "$desc" "Got $SESSION_COUNT_WARN"
    return
  fi
  pass "$desc"
}

# Some installs expose the binary as ".opencode"; the dot-prefixed name
# must be classified as an app just like the plain one.
test_is_app_process_dot_prefix() {
  local desc=".opencode (dot-prefixed) is classified as app"
  local rc=0
  _is_app_process ".opencode" || rc=$?
  case "$rc" in
    0) pass "$desc" ;;
    *) fail "$desc" ;;
  esac
}

# Run the process-classification tests: app-positive cases first (including
# the dot-prefixed name), then tool-negative cases, then the threshold check.
test_is_app_process_claude
test_is_app_process_electron
test_is_app_process_opencode
test_is_app_process_shipit
test_is_app_process_dot_prefix
test_is_tool_process_shellcheck
test_is_tool_process_node
test_is_tool_process_unknown
test_session_count_default_threshold

# ============================================================================
section "CLI Commands"
# ============================================================================
Expand Down
Loading