diff --git a/.agents/scripts/supervisor/cleanup.sh b/.agents/scripts/supervisor/cleanup.sh index 51d6280aa..a45017b65 100755 --- a/.agents/scripts/supervisor/cleanup.sh +++ b/.agents/scripts/supervisor/cleanup.sh @@ -284,6 +284,18 @@ cleanup_worker_processes() { rm -f "$pid_file" # t1222: Clean up hang warning marker to prevent stale markers from killing re-dispatched workers rm -f "$SUPERVISOR_DIR/pids/${task_id}.hang-warned" 2>/dev/null || true + # t1190: Clean up timestamped dispatch/wrapper scripts for this task. + # These are created with timestamps (e.g., t001-dispatch-20260213142302.sh) to + # prevent overwrite race conditions. Remove them when the task is cleaned up. + local script + for script in "$SUPERVISOR_DIR/pids/${task_id}"-dispatch-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-wrapper-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-wrapper-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-wrapper-*.sh; do + [[ -f "$script" ]] && rm -f "$script" || true + done if [[ "$killed" -gt 0 ]]; then log_info "Cleaned up worker process for $task_id (PID: $pid)" diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh index f46de2a0a..0ab8eee5d 100755 --- a/.agents/scripts/supervisor/dispatch.sh +++ b/.agents/scripts/supervisor/dispatch.sh @@ -1049,7 +1049,10 @@ do_prompt_repeat() { worker_xdg_config=$(generate_worker_mcp_config "$task_id" 2>/dev/null) || true # Write dispatch script - local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local pr_dispatch_ts + pr_dispatch_ts=$(date +%Y%m%d%H%M%S) + local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-${pr_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' echo "echo 'WORKER_STARTED task_id=${task_id} strategy=prompt_repeat pid=\$\$ timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ)" @@ -1065,9 +1068,12 @@ do_prompt_repeat() { chmod +x "$dispatch_script" # Wrapper script with cleanup handlers (t253) - local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-wrapper.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-wrapper-${pr_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' + # t1190: Wrapper-level sentinel written before running dispatch script. + echo "echo 'WRAPPER_STARTED task_id=${task_id} strategy=prompt_repeat wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${new_log_file}' 2>/dev/null || true" echo '_kill_descendants_recursive() {' echo ' local parent_pid="$1"' echo ' local children' @@ -1118,10 +1124,11 @@ do_prompt_repeat() { chmod +x "$wrapper_script" # Dispatch + # t1190: Redirect wrapper stderr to log file (not /dev/null) for diagnosis. if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & + nohup setsid bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & else - nohup bash "${wrapper_script}" &>/dev/null & + nohup bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & fi local worker_pid=$! disown "$worker_pid" 2>/dev/null || true @@ -2588,7 +2595,15 @@ cmd_dispatch() { # Write dispatch script to a temp file to avoid bash -c quoting issues # with multi-line prompts (newlines in printf '%q' break bash -c strings) - local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-dispatch.sh" + # t1190: Use timestamped filenames to prevent overwrite race condition when + # multiple dispatches run for the same task within a short window. Previously, + # a second dispatch would overwrite the dispatch/wrapper scripts before the + # first wrapper process had a chance to read them, causing the first wrapper + # to execute the second dispatch's script (which writes to a different log file), + # leaving the first log file with only the metadata header (no WORKER_STARTED). + local dispatch_ts + dispatch_ts=$(date +%Y%m%d%H%M%S) + local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-dispatch-${dispatch_ts}.sh" { echo '#!/usr/bin/env bash' echo "# Startup sentinel (t183): if this line appears in the log, the script started" @@ -2611,9 +2626,15 @@ cmd_dispatch() { # errors when the dispatch script failed to start (e.g., opencode not found). # Now errors are appended to the log file for diagnosis. # t253: Add cleanup handlers to prevent orphaned children when wrapper exits - local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-wrapper.sh" + # t1190: Use timestamped filename (matches dispatch_ts) to prevent overwrite race. + local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-wrapper-${dispatch_ts}.sh" { echo '#!/usr/bin/env bash' + # t1190: Wrapper-level sentinel — written before running the dispatch script. + # If WRAPPER_STARTED appears in the log but WORKER_STARTED does not, the + # wrapper ran but the dispatch script failed to start (exec failure, bad shebang, + # permission error). This distinguishes "wrapper never ran" from "dispatch failed". + echo "echo 'WRAPPER_STARTED task_id=${task_id} wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${log_file}' 2>/dev/null || true" echo '# t253: Recursive cleanup to kill all descendant processes' echo '_kill_descendants_recursive() {' echo ' local parent_pid="$1"' @@ -2678,23 +2699,10 @@ cmd_dispatch() { log_info "Opening Tabby tab for $task_id..." printf '\e]1337;NewTab=%s\a' "'${wrapper_script}'" 2>/dev/null || true # Also start background process as fallback (Tabby may not support OSC 1337) - # t253: Use setsid if available (Linux) for process group isolation - # Use nohup + disown to survive parent (cron) exit - if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & - else - nohup bash "${wrapper_script}" &>/dev/null & - fi + _launch_wrapper_script "${wrapper_script}" "${log_file}" else # Headless: background process - # t253: Use setsid if available (Linux) for process group isolation - # Use nohup + disown to survive parent (cron) exit — without this, - # workers die after ~2 minutes when the cron pulse script exits - if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & - else - nohup bash "${wrapper_script}" &>/dev/null & - fi + _launch_wrapper_script "${wrapper_script}" "${log_file}" fi local worker_pid=$! @@ -2714,6 +2722,24 @@ cmd_dispatch() { return 0 } +####################################### +# Launch a wrapper script in the background, surviving parent (cron) exit. +# t253: Uses setsid if available (Linux) for process group isolation. +# t1190: Redirects wrapper stderr to log file for startup error diagnosis. +# Args: wrapper_script log_file +####################################### +_launch_wrapper_script() { + local wrapper_script="$1" + local log_file="$2" + + if command -v setsid &>/dev/null; then + nohup setsid bash "${wrapper_script}" >>"${log_file}" 2>&1 & + else + nohup bash "${wrapper_script}" >>"${log_file}" 2>&1 & + fi + return 0 +} + ####################################### # Check the status of a running worker # Reads log file and PID to determine state @@ -3054,7 +3080,10 @@ Task description: ${tdesc:-$task_id}" worker_xdg_config=$(generate_worker_mcp_config "$task_id" 2>/dev/null) || true # Write dispatch script with startup sentinel (t183) - local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local reprompt_dispatch_ts + reprompt_dispatch_ts=$(date +%Y%m%d%H%M%S) + local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-${reprompt_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' echo "echo 'WORKER_STARTED task_id=${task_id} retry=${tretries} pid=\$\$ timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ)" @@ -3071,9 +3100,12 @@ Task description: ${tdesc:-$task_id}" # Wrapper script (t183): captures dispatch errors in log file # t253: Add cleanup handlers to prevent orphaned children when wrapper exits - local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-wrapper.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-wrapper-${reprompt_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' + # t1190: Wrapper-level sentinel written before running dispatch script. + echo "echo 'WRAPPER_STARTED task_id=${task_id} retry=${tretries} wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${new_log_file}' 2>/dev/null || true" echo '# t253: Recursive cleanup to kill all descendant processes' echo '_kill_descendants_recursive() {' echo ' local parent_pid="$1"' @@ -3118,10 +3150,11 @@ Task description: ${tdesc:-$task_id}" # t253: Use setsid if available (Linux) for process group isolation # Use nohup + disown to survive parent (cron) exit + # t1190: Redirect wrapper stderr to log file (not /dev/null) for diagnosis. if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & + nohup setsid bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & else - nohup bash "${wrapper_script}" &>/dev/null & + nohup bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & fi local worker_pid=$! disown "$worker_pid" 2>/dev/null || true diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index b1e5b0b57..1a04d6681 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -52,6 +52,13 @@ extract_log_metadata() { echo "worker_started=false" fi + # Wrapper startup sentinel (t1190): distinguishes wrapper-never-ran from dispatch-exec-failed + if grep -q 'WRAPPER_STARTED' "$log_file"; then + echo "wrapper_started=true" + else + echo "wrapper_started=false" + fi + # Dispatch error sentinel (t183) if grep -q 'WORKER_DISPATCH_ERROR\|WORKER_FAILED' "$log_file" 2>/dev/null; then local dispatch_error @@ -1014,13 +1021,22 @@ evaluate_worker() { fi # Check if worker never started (only dispatch metadata, no WORKER_STARTED sentinel) - if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog" 2>/dev/null; then + # t1190: Distinguish between wrapper-never-ran vs dispatch-script-failed: + # - no WRAPPER_STARTED: wrapper process never ran (nohup spawn failed, OS killed it) + # - WRAPPER_STARTED but no WORKER_STARTED: wrapper ran but dispatch script failed + # (exec failure, bad shebang, permission error, dispatch script overwritten) + if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog"; then # Log has metadata but worker never started — extract any error from log local startup_error="" - startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" 2>/dev/null | head -1 | head -c 200 || echo "") + startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" | head -1 | head -c 200 || echo "") if [[ -n "$startup_error" ]]; then echo "failed:worker_never_started:$(echo "$startup_error" | tr ' ' '_' | tr -cd '[:alnum:]_:-')" + elif grep -q 'WRAPPER_STARTED' "$tlog"; then + # t1190: Wrapper ran but dispatch script failed to produce WORKER_STARTED. + # This means the dispatch script exec failed (e.g., CLI not found, bad args). + echo "failed:worker_never_started:dispatch_exec_failed" else + # t1190: Neither wrapper nor worker started — nohup spawn likely failed. echo "failed:worker_never_started:no_sentinel" fi return 0