Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .agents/scripts/supervisor/cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,18 @@ cleanup_worker_processes() {
rm -f "$pid_file"
# t1222: Clean up hang warning marker to prevent stale markers from killing re-dispatched workers
rm -f "$SUPERVISOR_DIR/pids/${task_id}.hang-warned" 2>/dev/null || true
# t1190: Clean up timestamped dispatch/wrapper scripts for this task.
# These are created with timestamps (e.g., t001-dispatch-20260213142302.sh) to
# prevent overwrite race conditions. Remove them when the task is cleaned up.
local script
for script in "$SUPERVISOR_DIR/pids/${task_id}"-dispatch-*.sh \
"$SUPERVISOR_DIR/pids/${task_id}"-wrapper-*.sh \
"$SUPERVISOR_DIR/pids/${task_id}"-reprompt-*.sh \
"$SUPERVISOR_DIR/pids/${task_id}"-reprompt-wrapper-*.sh \
"$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-*.sh \
"$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-wrapper-*.sh; do
[[ -f "$script" ]] && rm -f "$script" || true
done

if [[ "$killed" -gt 0 ]]; then
log_info "Cleaned up worker process for $task_id (PID: $pid)"
Expand Down
83 changes: 58 additions & 25 deletions .agents/scripts/supervisor/dispatch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,10 @@ do_prompt_repeat() {
worker_xdg_config=$(generate_worker_mcp_config "$task_id" 2>/dev/null) || true

# Write dispatch script
local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat.sh"
# t1190: Use timestamped filename to prevent overwrite race condition.
local pr_dispatch_ts
pr_dispatch_ts=$(date +%Y%m%d%H%M%S)
local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-${pr_dispatch_ts}.sh"
{
echo '#!/usr/bin/env bash'
echo "echo 'WORKER_STARTED task_id=${task_id} strategy=prompt_repeat pid=\$\$ timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ)"
Expand All @@ -1065,9 +1068,12 @@ do_prompt_repeat() {
chmod +x "$dispatch_script"

# Wrapper script with cleanup handlers (t253)
local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-wrapper.sh"
# t1190: Use timestamped filename to prevent overwrite race condition.
local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-wrapper-${pr_dispatch_ts}.sh"
{
echo '#!/usr/bin/env bash'
# t1190: Wrapper-level sentinel written before running dispatch script.
echo "echo 'WRAPPER_STARTED task_id=${task_id} strategy=prompt_repeat wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${new_log_file}' 2>/dev/null || true"
echo '_kill_descendants_recursive() {'
echo ' local parent_pid="$1"'
echo ' local children'
Expand Down Expand Up @@ -1118,10 +1124,11 @@ do_prompt_repeat() {
chmod +x "$wrapper_script"

# Dispatch
# t1190: Redirect wrapper stderr to log file (not /dev/null) for diagnosis.
if command -v setsid &>/dev/null; then
nohup setsid bash "${wrapper_script}" &>/dev/null &
nohup setsid bash "${wrapper_script}" >>"${new_log_file}" 2>&1 &
else
nohup bash "${wrapper_script}" &>/dev/null &
nohup bash "${wrapper_script}" >>"${new_log_file}" 2>&1 &
fi
local worker_pid=$!
disown "$worker_pid" 2>/dev/null || true
Expand Down Expand Up @@ -2588,7 +2595,15 @@ cmd_dispatch() {

# Write dispatch script to a temp file to avoid bash -c quoting issues
# with multi-line prompts (newlines in printf '%q' break bash -c strings)
local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-dispatch.sh"
# t1190: Use timestamped filenames to prevent overwrite race condition when
# multiple dispatches run for the same task within a short window. Previously,
# a second dispatch would overwrite the dispatch/wrapper scripts before the
# first wrapper process had a chance to read them, causing the first wrapper
# to execute the second dispatch's script (which writes to a different log file),
# leaving the first log file with only the metadata header (no WORKER_STARTED).
local dispatch_ts
dispatch_ts=$(date +%Y%m%d%H%M%S)
local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-dispatch-${dispatch_ts}.sh"
{
echo '#!/usr/bin/env bash'
echo "# Startup sentinel (t183): if this line appears in the log, the script started"
Expand All @@ -2611,9 +2626,15 @@ cmd_dispatch() {
# errors when the dispatch script failed to start (e.g., opencode not found).
# Now errors are appended to the log file for diagnosis.
# t253: Add cleanup handlers to prevent orphaned children when wrapper exits
local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-wrapper.sh"
# t1190: Use timestamped filename (matches dispatch_ts) to prevent overwrite race.
local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-wrapper-${dispatch_ts}.sh"
{
echo '#!/usr/bin/env bash'
# t1190: Wrapper-level sentinel — written before running the dispatch script.
# If WRAPPER_STARTED appears in the log but WORKER_STARTED does not, the
# wrapper ran but the dispatch script failed to start (exec failure, bad shebang,
# permission error). This distinguishes "wrapper never ran" from "dispatch failed".
echo "echo 'WRAPPER_STARTED task_id=${task_id} wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${log_file}' 2>/dev/null || true"
echo '# t253: Recursive cleanup to kill all descendant processes'
echo '_kill_descendants_recursive() {'
echo ' local parent_pid="$1"'
Expand Down Expand Up @@ -2678,23 +2699,10 @@ cmd_dispatch() {
log_info "Opening Tabby tab for $task_id..."
printf '\e]1337;NewTab=%s\a' "'${wrapper_script}'" 2>/dev/null || true
# Also start background process as fallback (Tabby may not support OSC 1337)
# t253: Use setsid if available (Linux) for process group isolation
# Use nohup + disown to survive parent (cron) exit
if command -v setsid &>/dev/null; then
nohup setsid bash "${wrapper_script}" &>/dev/null &
else
nohup bash "${wrapper_script}" &>/dev/null &
fi
_launch_wrapper_script "${wrapper_script}" "${log_file}"
else
# Headless: background process
# t253: Use setsid if available (Linux) for process group isolation
# Use nohup + disown to survive parent (cron) exit — without this,
# workers die after ~2 minutes when the cron pulse script exits
if command -v setsid &>/dev/null; then
nohup setsid bash "${wrapper_script}" &>/dev/null &
else
nohup bash "${wrapper_script}" &>/dev/null &
fi
_launch_wrapper_script "${wrapper_script}" "${log_file}"
fi

local worker_pid=$!
Expand All @@ -2714,6 +2722,24 @@ cmd_dispatch() {
return 0
}

#######################################
# Start a wrapper script as a detached background job.
#
# The script is run through `nohup bash` so it keeps running after the
# parent (e.g. a cron pulse) exits. When `setsid` is on PATH it is inserted
# in front of `bash` for process group isolation (t253).
# Both stdout and stderr of the wrapper are appended to the log file so
# startup errors remain available for diagnosis (t1190).
#
# The function must be called in the current shell (not a subshell):
# cmd_dispatch reads `$!` right after the call to obtain the worker PID.
#
# Arguments:
#   $1 - path of the wrapper script to run
#   $2 - log file that receives the wrapper's stdout and stderr (appended)
# Returns:
#   0 always
#######################################
_launch_wrapper_script() {
	local script_path="$1"
	local log_path="$2"

	# Command prefix: always nohup, plus setsid where the platform provides it.
	local -a launcher=(nohup)
	if command -v setsid &>/dev/null; then
		launcher+=(setsid)
	fi

	"${launcher[@]}" bash "${script_path}" >>"${log_path}" 2>&1 &
	return 0
}

#######################################
# Check the status of a running worker
# Reads log file and PID to determine state
Expand Down Expand Up @@ -3054,7 +3080,10 @@ Task description: ${tdesc:-$task_id}"
worker_xdg_config=$(generate_worker_mcp_config "$task_id" 2>/dev/null) || true

# Write dispatch script with startup sentinel (t183)
local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt.sh"
# t1190: Use timestamped filename to prevent overwrite race condition.
local reprompt_dispatch_ts
reprompt_dispatch_ts=$(date +%Y%m%d%H%M%S)
local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-${reprompt_dispatch_ts}.sh"
{
echo '#!/usr/bin/env bash'
echo "echo 'WORKER_STARTED task_id=${task_id} retry=${tretries} pid=\$\$ timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ)"
Expand All @@ -3071,9 +3100,12 @@ Task description: ${tdesc:-$task_id}"

# Wrapper script (t183): captures dispatch errors in log file
# t253: Add cleanup handlers to prevent orphaned children when wrapper exits
local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-wrapper.sh"
# t1190: Use timestamped filename to prevent overwrite race condition.
local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-wrapper-${reprompt_dispatch_ts}.sh"
{
echo '#!/usr/bin/env bash'
# t1190: Wrapper-level sentinel written before running dispatch script.
echo "echo 'WRAPPER_STARTED task_id=${task_id} retry=${tretries} wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${new_log_file}' 2>/dev/null || true"
echo '# t253: Recursive cleanup to kill all descendant processes'
echo '_kill_descendants_recursive() {'
echo ' local parent_pid="$1"'
Expand Down Expand Up @@ -3118,10 +3150,11 @@ Task description: ${tdesc:-$task_id}"

# t253: Use setsid if available (Linux) for process group isolation
# Use nohup + disown to survive parent (cron) exit
# t1190: Redirect wrapper stderr to log file (not /dev/null) for diagnosis.
if command -v setsid &>/dev/null; then
nohup setsid bash "${wrapper_script}" &>/dev/null &
nohup setsid bash "${wrapper_script}" >>"${new_log_file}" 2>&1 &
else
nohup bash "${wrapper_script}" &>/dev/null &
nohup bash "${wrapper_script}" >>"${new_log_file}" 2>&1 &
fi
local worker_pid=$!
disown "$worker_pid" 2>/dev/null || true
Expand Down
20 changes: 18 additions & 2 deletions .agents/scripts/supervisor/evaluate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ extract_log_metadata() {
echo "worker_started=false"
fi

# Wrapper startup sentinel (t1190): distinguishes wrapper-never-ran from dispatch-exec-failed
if grep -q 'WRAPPER_STARTED' "$log_file"; then
echo "wrapper_started=true"
else
echo "wrapper_started=false"
fi

# Dispatch error sentinel (t183)
if grep -q 'WORKER_DISPATCH_ERROR\|WORKER_FAILED' "$log_file" 2>/dev/null; then
local dispatch_error
Expand Down Expand Up @@ -1014,13 +1021,22 @@ evaluate_worker() {
fi

# Check if worker never started (only dispatch metadata, no WORKER_STARTED sentinel)
if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog" 2>/dev/null; then
# t1190: Distinguish between wrapper-never-ran vs dispatch-script-failed:
# - no WRAPPER_STARTED: wrapper process never ran (nohup spawn failed, OS killed it)
# - WRAPPER_STARTED but no WORKER_STARTED: wrapper ran but dispatch script failed
# (exec failure, bad shebang, permission error, dispatch script overwritten)
if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog"; then
# Log has metadata but worker never started — extract any error from log
local startup_error=""
startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" 2>/dev/null | head -1 | head -c 200 || echo "")
startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" | head -1 | head -c 200 || echo "")
if [[ -n "$startup_error" ]]; then
echo "failed:worker_never_started:$(echo "$startup_error" | tr ' ' '_' | tr -cd '[:alnum:]_:-')"
elif grep -q 'WRAPPER_STARTED' "$tlog"; then
# t1190: Wrapper ran but dispatch script failed to produce WORKER_STARTED.
# This means the dispatch script exec failed (e.g., CLI not found, bad args).
echo "failed:worker_never_started:dispatch_exec_failed"
else
# t1190: Neither wrapper nor worker started — nohup spawn likely failed.
echo "failed:worker_never_started:no_sentinel"
fi
return 0
Expand Down
Loading