Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 42 additions & 21 deletions .agents/scripts/supervisor-helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1409,6 +1409,13 @@ check_model_health() {
local ai_cli="$1"
local model="${2:-}"

# Pulse-level fast path: if health was already verified in this pulse
# invocation, skip the probe entirely (avoids 8s per task)
if [[ -n "${_PULSE_HEALTH_VERIFIED:-}" ]]; then
log_info "Model health: pulse-verified OK (skipping probe)"
return 0
fi

# Cache key: cli + model, stored as a file with timestamp
local cache_dir="$SUPERVISOR_DIR/health"
mkdir -p "$cache_dir"
Expand All @@ -1424,6 +1431,7 @@ check_model_health() {
local age=$(( now - cached_at ))
if [[ "$age" -lt 300 ]]; then
log_info "Model health: cached OK ($age seconds ago)"
_PULSE_HEALTH_VERIFIED="true"
return 0
fi
fi
Expand Down Expand Up @@ -1522,6 +1530,7 @@ check_model_health() {

# Healthy - cache the result
date +%s > "$cache_file"
_PULSE_HEALTH_VERIFIED="true"
log_info "Model health: OK (cached for 5m)"
return 0
}
Expand Down Expand Up @@ -1958,13 +1967,17 @@ cmd_dispatch() {
tab_cmd="cd '${worktree_path}' && ${cmd_parts[*]} > '${log_file}' 2>&1; echo \"EXIT:\$?\" >> '${log_file}'"
printf '\e]1337;NewTab=%s\a' "$tab_cmd" 2>/dev/null || true
# Also start background process as fallback (Tabby may not support OSC 1337)
(cd "$worktree_path" && "${cmd_parts[@]}" > "$log_file" 2>&1; echo "EXIT:$?" >> "$log_file") &
# Use nohup + disown to survive parent (cron) exit
nohup bash -c "cd '${worktree_path}' && $(printf '%q ' "${cmd_parts[@]}") > '${log_file}' 2>&1; echo \"EXIT:\$?\" >> '${log_file}'" &>/dev/null &
else
# Headless: background process
(cd "$worktree_path" && "${cmd_parts[@]}" > "$log_file" 2>&1; echo "EXIT:$?" >> "$log_file") &
# Use nohup + disown to survive parent (cron) exit — without this,
# workers die after ~2 minutes when the cron pulse script exits
nohup bash -c "cd '${worktree_path}' && $(printf '%q ' "${cmd_parts[@]}") > '${log_file}' 2>&1; echo \"EXIT:\$?\" >> '${log_file}'" &>/dev/null &
fi
Comment on lines 1969 to 1977

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The command to launch the background worker is duplicated in both the if (line 1971) and else (line 1976) branches. A very similar command is also used in the cmd_reprompt function (line 2560). This makes it harder to maintain if the dispatch logic needs to be changed in the future.

Consider extracting this complex command into a local helper function to avoid repetition and improve readability.

For example, you could create a helper function like this:

#######################################
# Launch a worker command in the background so that it survives the exit of
# the invoking shell (e.g. a cron-driven pulse script).
#
# The worker's stdout/stderr are written to the log file, followed by a
# final "EXIT:<status>" line. The job is started with '&' in the caller's
# shell, so the caller can read the worker PID from $! immediately after
# this function returns (and disown it if desired).
#
# Arguments:
#   $1    - directory to cd into before running the command
#   $2    - log file that receives the command output and the EXIT line
#   $3... - the command and its arguments
# Returns:
#   0 once the background job has been started.
#######################################
_dispatch_background_worker() {
    local work_path="$1"
    local log_path="$2"
    shift 2
    local -a command_parts=("$@")

    # Shell-escape every piece interpolated into the `bash -c` script.
    # Wrapping the paths in literal single quotes breaks (and permits
    # injection) as soon as a path itself contains a single quote, so the
    # paths get the same %q treatment as the command words.
    local quoted_work quoted_log quoted_cmd
    printf -v quoted_work '%q' "$work_path"
    printf -v quoted_log '%q' "$log_path"
    printf -v quoted_cmd '%q ' "${command_parts[@]}"

    # nohup makes the worker ignore SIGHUP when the parent exits.
    nohup bash -c "cd ${quoted_work} && ${quoted_cmd}> ${quoted_log} 2>&1; echo \"EXIT:\$?\" >> ${quoted_log}" &>/dev/null &
}

And then call it from both branches, which would simplify the main function's logic.


local worker_pid=$!
disown "$worker_pid" 2>/dev/null || true

# Store PID for monitoring
echo "$worker_pid" > "$SUPERVISOR_DIR/pids/${task_id}.pid"
Expand Down Expand Up @@ -2543,8 +2556,10 @@ Task description: ${tdesc:-$task_id}"
# Ensure PID directory exists
mkdir -p "$SUPERVISOR_DIR/pids"

(cd "$work_dir" && "${cmd_parts[@]}" > "$new_log_file" 2>&1; echo "EXIT:$?" >> "$new_log_file") &
# Use nohup + disown to survive parent (cron) exit
nohup bash -c "cd '${work_dir}' && $(printf '%q ' "${cmd_parts[@]}") > '${new_log_file}' 2>&1; echo \"EXIT:\$?\" >> '${new_log_file}'" &>/dev/null &
local worker_pid=$!
disown "$worker_pid" 2>/dev/null || true

echo "$worker_pid" > "$SUPERVISOR_DIR/pids/${task_id}.pid"

Expand Down Expand Up @@ -3258,6 +3273,10 @@ cmd_pulse() {

log_info "=== Supervisor Pulse $(date -u +%Y-%m-%dT%H:%M:%SZ) ==="

# Pulse-level health check flag: once health is confirmed in this pulse,
# skip subsequent checks to avoid 8-second probes per task
_PULSE_HEALTH_VERIFIED=""

# Phase 0: Auto-pickup new tasks from TODO.md (t128.5)
# Scans for #auto-dispatch tags and Dispatch Queue section
local all_repos
Expand Down Expand Up @@ -3420,17 +3439,18 @@ cmd_pulse() {

if [[ -n "$next_tasks" ]]; then
while IFS='|' read -r tid trepo tdesc tmodel; do
if cmd_dispatch "$tid" --batch "$batch_id" 2>/dev/null; then
local dispatch_exit=0
cmd_dispatch "$tid" --batch "$batch_id" || dispatch_exit=$?
if [[ "$dispatch_exit" -eq 0 ]]; then
dispatched_count=$((dispatched_count + 1))
elif [[ "$dispatch_exit" -eq 2 ]]; then
log_info "Concurrency limit reached, stopping dispatch"
break
elif [[ "$dispatch_exit" -eq 3 ]]; then
log_warn "Provider unavailable for $tid, stopping dispatch until next pulse"
break
else
local dispatch_exit=$?
if [[ "$dispatch_exit" -eq 2 ]]; then
log_info "Concurrency limit reached, stopping dispatch"
break
elif [[ "$dispatch_exit" -eq 3 ]]; then
log_warn "Provider unavailable, stopping dispatch until next pulse"
break
fi
log_warn "Dispatch failed for $tid (exit $dispatch_exit), trying next task"
fi
done <<< "$next_tasks"
fi
Expand All @@ -3441,17 +3461,18 @@ cmd_pulse() {

if [[ -n "$next_tasks" ]]; then
while IFS='|' read -r tid trepo tdesc tmodel; do
if cmd_dispatch "$tid" 2>/dev/null; then
local dispatch_exit=0
cmd_dispatch "$tid" || dispatch_exit=$?
if [[ "$dispatch_exit" -eq 0 ]]; then
dispatched_count=$((dispatched_count + 1))
elif [[ "$dispatch_exit" -eq 2 ]]; then
log_info "Concurrency limit reached, stopping dispatch"
break
elif [[ "$dispatch_exit" -eq 3 ]]; then
log_warn "Provider unavailable for $tid, stopping dispatch until next pulse"
break
else
local dispatch_exit=$?
if [[ "$dispatch_exit" -eq 2 ]]; then
log_info "Concurrency limit reached, stopping dispatch"
break
elif [[ "$dispatch_exit" -eq 3 ]]; then
log_warn "Provider unavailable, stopping dispatch until next pulse"
break
fi
log_warn "Dispatch failed for $tid (exit $dispatch_exit), trying next task"
fi
done <<< "$next_tasks"
fi
Expand Down
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ Tasks with no open blockers - ready to work on. Use `/ready` to refresh this lis
## Backlog

- [ ] t140 setup.sh: Cisco Skill Scanner install fails on PEP 668 systems (Ubuntu 24.04+) #bugfix #setup #linux ~1h (ai:45m test:15m) logged:2026-02-07
- Notes: GH#415. pip3 install --user blocked by PEP 668 on modern Ubuntu/Debian. Fix fallback chain: uv -> pipx -> venv+symlink -> pip3 --user (legacy). Affects setup.sh lines ~2408-2432. Workaround: manual venv at ~/.aidevops/.agent-workspace/work/cisco-scanner-env/. BLOCKED: Re-prompt dispatch failed: backend_infrastructure_error
- Notes: GH#415. pip3 install --user blocked by PEP 668 on modern Ubuntu/Debian. Fix fallback chain: uv -> pipx -> venv+symlink -> pip3 --user (legacy). Affects setup.sh lines ~2408-2432. Workaround: manual venv at ~/.aidevops/.agent-workspace/work/cisco-scanner-env/. BLOCKED: Re-prompt dispatch failed: backend_infrastructure_error BLOCKED: Re-prompt dispatch failed: ambiguous_ai_unavailable
- [ ] t139 bug: memory-helper.sh recall fails on hyphenated queries #bugfix #memory ~30m (ai:20m test:10m) logged:2026-02-07 started:2026-02-07
- Notes: GH#414. Hyphens in FTS5 queries interpreted as NOT operator. "qs-agency" becomes "qs NOT agency" causing column resolution error. Fix: quote hyphenated terms before passing to FTS5 MATCH clause.
- [x] t138 aidevops update output overwhelms tool buffer on large updates #bugfix #setup ~30m (ai:20m test:10m) logged:2026-02-07 completed:2026-02-07
Expand Down
Loading