From 13ed85ff0803979fcfca0a03e27f8efa758ec772 Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 19 Feb 2026 22:52:02 +0000 Subject: [PATCH 1/2] fix: prevent worker_never_started:no_sentinel via timestamped scripts and wrapper sentinel (t1190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: dispatch/wrapper scripts used fixed filenames (e.g., t001-dispatch.sh), causing a race condition where a second dispatch overwrites the scripts before the first wrapper process reads them. The first wrapper then executes the new dispatch script which writes WORKER_STARTED to a different log file, leaving the original log with only the metadata header (no WORKER_STARTED → no_sentinel failure). Fixes: 1. Timestamped dispatch/wrapper filenames (t001-dispatch-20260213142302.sh) prevent overwrite race — each dispatch gets unique scripts that cannot be clobbered. 2. WRAPPER_STARTED sentinel written to log before running dispatch script, enabling distinguish between 'wrapper never ran' vs 'dispatch exec failed'. 3. Wrapper stderr redirected to log file (was /dev/null) so wrapper startup errors (bash not found, permission denied) are captured for diagnosis. 4. evaluate.sh: improved no_sentinel sub-classification using WRAPPER_STARTED signal. 5. cleanup.sh: remove timestamped scripts when task is cleaned up. Chose timestamped filenames over temp files (mktemp) to preserve debuggability — the timestamp matches the log file timestamp, making correlation easy. --- .agents/scripts/supervisor/cleanup.sh | 12 ++++++ .agents/scripts/supervisor/dispatch.sh | 60 ++++++++++++++++++++------ .agents/scripts/supervisor/evaluate.sh | 16 +++++++ 3 files changed, 74 insertions(+), 14 deletions(-) diff --git a/.agents/scripts/supervisor/cleanup.sh b/.agents/scripts/supervisor/cleanup.sh index 51d6280aa..ce06aede9 100755 --- a/.agents/scripts/supervisor/cleanup.sh +++ b/.agents/scripts/supervisor/cleanup.sh @@ -284,6 +284,18 @@ cleanup_worker_processes() { rm -f "$pid_file" # t1222: Clean up hang warning marker to prevent stale markers from killing re-dispatched workers rm -f "$SUPERVISOR_DIR/pids/${task_id}.hang-warned" 2>/dev/null || true + # t1190: Clean up timestamped dispatch/wrapper scripts for this task. + # These are created with timestamps (e.g., t001-dispatch-20260213142302.sh) to + # prevent overwrite race conditions. Remove them when the task is cleaned up. + local script + for script in "$SUPERVISOR_DIR/pids/${task_id}"-dispatch-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-wrapper-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-[0-9]*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-wrapper-*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-[0-9]*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-wrapper-*.sh; do + [[ -f "$script" ]] && rm -f "$script" 2>/dev/null || true + done if [[ "$killed" -gt 0 ]]; then log_info "Cleaned up worker process for $task_id (PID: $pid)" diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh index f46de2a0a..b503c637f 100755 --- a/.agents/scripts/supervisor/dispatch.sh +++ b/.agents/scripts/supervisor/dispatch.sh @@ -1049,7 +1049,10 @@ do_prompt_repeat() { worker_xdg_config=$(generate_worker_mcp_config "$task_id" 2>/dev/null) || true # Write dispatch script - local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local pr_dispatch_ts + pr_dispatch_ts=$(date +%Y%m%d%H%M%S) + local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-${pr_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' echo "echo 'WORKER_STARTED task_id=${task_id} strategy=prompt_repeat pid=\$\$ timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ)" @@ -1065,9 +1068,12 @@ do_prompt_repeat() { chmod +x "$dispatch_script" # Wrapper script with cleanup handlers (t253) - local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-wrapper.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-prompt-repeat-wrapper-${pr_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' + # t1190: Wrapper-level sentinel written before running dispatch script. + echo "echo 'WRAPPER_STARTED task_id=${task_id} strategy=prompt_repeat wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${new_log_file}' 2>/dev/null || true" echo '_kill_descendants_recursive() {' echo ' local parent_pid="$1"' echo ' local children' @@ -1118,10 +1124,11 @@ do_prompt_repeat() { chmod +x "$wrapper_script" # Dispatch + # t1190: Redirect wrapper stderr to log file (not /dev/null) for diagnosis. if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & + nohup setsid bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & else - nohup bash "${wrapper_script}" &>/dev/null & + nohup bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & fi local worker_pid=$! disown "$worker_pid" 2>/dev/null || true @@ -2588,7 +2595,15 @@ cmd_dispatch() { # Write dispatch script to a temp file to avoid bash -c quoting issues # with multi-line prompts (newlines in printf '%q' break bash -c strings) - local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-dispatch.sh" + # t1190: Use timestamped filenames to prevent overwrite race condition when + # multiple dispatches run for the same task within a short window. Previously, + # a second dispatch would overwrite the dispatch/wrapper scripts before the + # first wrapper process had a chance to read them, causing the first wrapper + # to execute the second dispatch's script (which writes to a different log file), + # leaving the first log file with only the metadata header (no WORKER_STARTED). + local dispatch_ts + dispatch_ts=$(date +%Y%m%d%H%M%S) + local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-dispatch-${dispatch_ts}.sh" { echo '#!/usr/bin/env bash' echo "# Startup sentinel (t183): if this line appears in the log, the script started" @@ -2611,9 +2626,15 @@ cmd_dispatch() { # errors when the dispatch script failed to start (e.g., opencode not found). # Now errors are appended to the log file for diagnosis. # t253: Add cleanup handlers to prevent orphaned children when wrapper exits - local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-wrapper.sh" + # t1190: Use timestamped filename (matches dispatch_ts) to prevent overwrite race. + local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-wrapper-${dispatch_ts}.sh" { echo '#!/usr/bin/env bash' + # t1190: Wrapper-level sentinel — written before running the dispatch script. + # If WRAPPER_STARTED appears in the log but WORKER_STARTED does not, the + # wrapper ran but the dispatch script failed to start (exec failure, bad shebang, + # permission error). This distinguishes "wrapper never ran" from "dispatch failed". + echo "echo 'WRAPPER_STARTED task_id=${task_id} wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${log_file}' 2>/dev/null || true" echo '# t253: Recursive cleanup to kill all descendant processes' echo '_kill_descendants_recursive() {' echo ' local parent_pid="$1"' @@ -2680,20 +2701,24 @@ cmd_dispatch() { # Also start background process as fallback (Tabby may not support OSC 1337) # t253: Use setsid if available (Linux) for process group isolation # Use nohup + disown to survive parent (cron) exit + # t1190: Redirect wrapper stderr to log file (not /dev/null) so wrapper startup + # errors (e.g., bash not found, permission denied) are captured for diagnosis. if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & + nohup setsid bash "${wrapper_script}" >>"${log_file}" 2>&1 & else - nohup bash "${wrapper_script}" &>/dev/null & + nohup bash "${wrapper_script}" >>"${log_file}" 2>&1 & fi else # Headless: background process # t253: Use setsid if available (Linux) for process group isolation # Use nohup + disown to survive parent (cron) exit — without this, # workers die after ~2 minutes when the cron pulse script exits + # t1190: Redirect wrapper stderr to log file (not /dev/null) so wrapper startup + # errors (e.g., bash not found, permission denied) are captured for diagnosis. if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & + nohup setsid bash "${wrapper_script}" >>"${log_file}" 2>&1 & else - nohup bash "${wrapper_script}" &>/dev/null & + nohup bash "${wrapper_script}" >>"${log_file}" 2>&1 & fi fi @@ -3054,7 +3079,10 @@ Task description: ${tdesc:-$task_id}" worker_xdg_config=$(generate_worker_mcp_config "$task_id" 2>/dev/null) || true # Write dispatch script with startup sentinel (t183) - local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local reprompt_dispatch_ts + reprompt_dispatch_ts=$(date +%Y%m%d%H%M%S) + local dispatch_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-${reprompt_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' echo "echo 'WORKER_STARTED task_id=${task_id} retry=${tretries} pid=\$\$ timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ)" @@ -3071,9 +3099,12 @@ Task description: ${tdesc:-$task_id}" # Wrapper script (t183): captures dispatch errors in log file # t253: Add cleanup handlers to prevent orphaned children when wrapper exits - local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-wrapper.sh" + # t1190: Use timestamped filename to prevent overwrite race condition. + local wrapper_script="${SUPERVISOR_DIR}/pids/${task_id}-reprompt-wrapper-${reprompt_dispatch_ts}.sh" { echo '#!/usr/bin/env bash' + # t1190: Wrapper-level sentinel written before running dispatch script. + echo "echo 'WRAPPER_STARTED task_id=${task_id} retry=${tretries} wrapper_pid=\$\$ dispatch_script=${dispatch_script} timestamp='\$(date -u +%Y-%m-%dT%H:%M:%SZ) >> '${new_log_file}' 2>/dev/null || true" echo '# t253: Recursive cleanup to kill all descendant processes' echo '_kill_descendants_recursive() {' echo ' local parent_pid="$1"' @@ -3118,10 +3149,11 @@ Task description: ${tdesc:-$task_id}" # t253: Use setsid if available (Linux) for process group isolation # Use nohup + disown to survive parent (cron) exit + # t1190: Redirect wrapper stderr to log file (not /dev/null) for diagnosis. if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" &>/dev/null & + nohup setsid bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & else - nohup bash "${wrapper_script}" &>/dev/null & + nohup bash "${wrapper_script}" >>"${new_log_file}" 2>&1 & fi local worker_pid=$! disown "$worker_pid" 2>/dev/null || true diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index b1e5b0b57..73b089f35 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -52,6 +52,13 @@ extract_log_metadata() { echo "worker_started=false" fi + # Wrapper startup sentinel (t1190): distinguishes wrapper-never-ran from dispatch-exec-failed + if grep -q 'WRAPPER_STARTED' "$log_file" 2>/dev/null; then + echo "wrapper_started=true" + else + echo "wrapper_started=false" + fi + # Dispatch error sentinel (t183) if grep -q 'WORKER_DISPATCH_ERROR\|WORKER_FAILED' "$log_file" 2>/dev/null; then local dispatch_error @@ -1014,13 +1021,22 @@ evaluate_worker() { fi # Check if worker never started (only dispatch metadata, no WORKER_STARTED sentinel) + # t1190: Distinguish between wrapper-never-ran vs dispatch-script-failed: + # - no WRAPPER_STARTED: wrapper process never ran (nohup spawn failed, OS killed it) + # - WRAPPER_STARTED but no WORKER_STARTED: wrapper ran but dispatch script failed + # (exec failure, bad shebang, permission error, dispatch script overwritten) if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog" 2>/dev/null; then # Log has metadata but worker never started — extract any error from log local startup_error="" startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" 2>/dev/null | head -1 | head -c 200 || echo "") if [[ -n "$startup_error" ]]; then echo "failed:worker_never_started:$(echo "$startup_error" | tr ' ' '_' | tr -cd '[:alnum:]_:-')" + elif grep -q 'WRAPPER_STARTED' "$tlog" 2>/dev/null; then + # t1190: Wrapper ran but dispatch script failed to produce WORKER_STARTED. + # This means the dispatch script exec failed (e.g., CLI not found, bad args). + echo "failed:worker_never_started:dispatch_exec_failed" else + # t1190: Neither wrapper nor worker started — nohup spawn likely failed. echo "failed:worker_never_started:no_sentinel" fi return 0 From 6fa38ed18b5a8944a1d4d80b84d863fd9197a172 Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 19 Feb 2026 23:35:49 +0000 Subject: [PATCH 2/2] fix: address review feedback for t1190 - cleanup.sh: normalize glob patterns ([0-9]* -> *) for reprompt/prompt-repeat scripts; remove redundant 2>/dev/null from rm -f (already guarded by [[ -f ]]) - dispatch.sh: extract duplicated setsid/nohup wrapper launch into _launch_wrapper_script() helper to eliminate code duplication - evaluate.sh: remove blanket 2>/dev/null suppression from grep calls per style guide (log file existence already checked before these calls) --- .agents/scripts/supervisor/cleanup.sh | 6 ++-- .agents/scripts/supervisor/dispatch.sh | 39 +++++++++++++------------- .agents/scripts/supervisor/evaluate.sh | 8 +++--- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/.agents/scripts/supervisor/cleanup.sh b/.agents/scripts/supervisor/cleanup.sh index ce06aede9..a45017b65 100755 --- a/.agents/scripts/supervisor/cleanup.sh +++ b/.agents/scripts/supervisor/cleanup.sh @@ -290,11 +290,11 @@ cleanup_worker_processes() { local script for script in "$SUPERVISOR_DIR/pids/${task_id}"-dispatch-*.sh \ "$SUPERVISOR_DIR/pids/${task_id}"-wrapper-*.sh \ - "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-[0-9]*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-*.sh \ "$SUPERVISOR_DIR/pids/${task_id}"-reprompt-wrapper-*.sh \ - "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-[0-9]*.sh \ + "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-*.sh \ "$SUPERVISOR_DIR/pids/${task_id}"-prompt-repeat-wrapper-*.sh; do - [[ -f "$script" ]] && rm -f "$script" 2>/dev/null || true + [[ -f "$script" ]] && rm -f "$script" || true done if [[ "$killed" -gt 0 ]]; then diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh index b503c637f..0ab8eee5d 100755 --- a/.agents/scripts/supervisor/dispatch.sh +++ b/.agents/scripts/supervisor/dispatch.sh @@ -2699,27 +2699,10 @@ cmd_dispatch() { log_info "Opening Tabby tab for $task_id..." printf '\e]1337;NewTab=%s\a' "'${wrapper_script}'" 2>/dev/null || true # Also start background process as fallback (Tabby may not support OSC 1337) - # t253: Use setsid if available (Linux) for process group isolation - # Use nohup + disown to survive parent (cron) exit - # t1190: Redirect wrapper stderr to log file (not /dev/null) so wrapper startup - # errors (e.g., bash not found, permission denied) are captured for diagnosis. - if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" >>"${log_file}" 2>&1 & - else - nohup bash "${wrapper_script}" >>"${log_file}" 2>&1 & - fi + _launch_wrapper_script "${wrapper_script}" "${log_file}" else # Headless: background process - # t253: Use setsid if available (Linux) for process group isolation - # Use nohup + disown to survive parent (cron) exit — without this, - # workers die after ~2 minutes when the cron pulse script exits - # t1190: Redirect wrapper stderr to log file (not /dev/null) so wrapper startup - # errors (e.g., bash not found, permission denied) are captured for diagnosis. - if command -v setsid &>/dev/null; then - nohup setsid bash "${wrapper_script}" >>"${log_file}" 2>&1 & - else - nohup bash "${wrapper_script}" >>"${log_file}" 2>&1 & - fi + _launch_wrapper_script "${wrapper_script}" "${log_file}" fi local worker_pid=$! @@ -2739,6 +2722,24 @@ cmd_dispatch() { return 0 } +####################################### +# Launch a wrapper script in the background, surviving parent (cron) exit. +# t253: Uses setsid if available (Linux) for process group isolation. +# t1190: Redirects wrapper stderr to log file for startup error diagnosis. +# Args: wrapper_script log_file +####################################### +_launch_wrapper_script() { + local wrapper_script="$1" + local log_file="$2" + + if command -v setsid &>/dev/null; then + nohup setsid bash "${wrapper_script}" >>"${log_file}" 2>&1 & + else + nohup bash "${wrapper_script}" >>"${log_file}" 2>&1 & + fi + return 0 +} + ####################################### # Check the status of a running worker # Reads log file and PID to determine state diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index 73b089f35..1a04d6681 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -53,7 +53,7 @@ extract_log_metadata() { fi # Wrapper startup sentinel (t1190): distinguishes wrapper-never-ran from dispatch-exec-failed - if grep -q 'WRAPPER_STARTED' "$log_file" 2>/dev/null; then + if grep -q 'WRAPPER_STARTED' "$log_file"; then echo "wrapper_started=true" else echo "wrapper_started=false" @@ -1025,13 +1025,13 @@ evaluate_worker() { # - no WRAPPER_STARTED: wrapper process never ran (nohup spawn failed, OS killed it) # - WRAPPER_STARTED but no WORKER_STARTED: wrapper ran but dispatch script failed # (exec failure, bad shebang, permission error, dispatch script overwritten) - if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog" 2>/dev/null; then + if [[ "$log_size" -lt 500 ]] && ! grep -q 'WORKER_STARTED' "$tlog"; then # Log has metadata but worker never started — extract any error from log local startup_error="" - startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" 2>/dev/null | head -1 | head -c 200 || echo "") + startup_error=$(grep -i 'WORKER_FAILED\|WORKER_DISPATCH_ERROR\|command not found\|No such file\|Permission denied' "$tlog" | head -1 | head -c 200 || echo "") if [[ -n "$startup_error" ]]; then echo "failed:worker_never_started:$(echo "$startup_error" | tr ' ' '_' | tr -cd '[:alnum:]_:-')" - elif grep -q 'WRAPPER_STARTED' "$tlog" 2>/dev/null; then + elif grep -q 'WRAPPER_STARTED' "$tlog"; then # t1190: Wrapper ran but dispatch script failed to produce WORKER_STARTED. # This means the dispatch script exec failed (e.g., CLI not found, bad args). echo "failed:worker_never_started:dispatch_exec_failed"