diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh
index cbd961832..fa51a189c 100755
--- a/.agents/scripts/supervisor-helper.sh
+++ b/.agents/scripts/supervisor-helper.sh
@@ -34,6 +34,8 @@
 # supervisor-helper.sh pr-check Check PR CI/review status
 # supervisor-helper.sh pr-merge [--dry-run] Merge PR (squash)
 # supervisor-helper.sh self-heal Create diagnostic subtask for failed/blocked task
+# supervisor-helper.sh backup [reason] Backup supervisor database (t162)
+# supervisor-helper.sh restore [backup_file] Restore from backup (lists if no file) (t162)
 # supervisor-helper.sh db [sql] Direct SQLite access
 # supervisor-helper.sh help
 #
@@ -449,6 +451,152 @@ db() {
     sqlite3 -cmd ".timeout 5000" "$@"
 }
 
+#######################################
+# Backup supervisor database before destructive operations (t162)
+# Creates timestamped copy in supervisor dir. Keeps last 5 backups.
+# Usage: backup_db [reason]
+# Arguments: $1 - label embedded in the backup filename (default: manual);
+#            characters outside [A-Za-z0-9._-] are replaced with "_"
+# Outputs: path of the new backup file on stdout
+# Returns: 0 on success, 1 if there is no database or the copy failed
+#######################################
+backup_db() {
+    local reason="${1:-manual}"
+    # The reason lands in a filename and inside a single-quoted sqlite3
+    # dot-command argument, so neutralise "/", quotes, spaces and the like.
+    reason="${reason//[^A-Za-z0-9._-]/_}"
+
+    if [[ ! -f "$SUPERVISOR_DB" ]]; then
+        log_warn "No database to backup at: $SUPERVISOR_DB"
+        return 1
+    fi
+
+    local timestamp
+    timestamp=$(date -u +%Y%m%dT%H%M%SZ)
+    local backup_file="$SUPERVISOR_DIR/supervisor-backup-${timestamp}-${reason}.db"
+
+    # Use SQLite .backup for consistency (handles WAL correctly). Going through
+    # db() applies the same 5s busy timeout as every other query in this file.
+    if db "$SUPERVISOR_DB" ".backup '$backup_file'" 2>/dev/null; then
+        log_success "Database backed up: $backup_file"
+    else
+        # Fallback to file copy if .backup fails
+        if cp "$SUPERVISOR_DB" "$backup_file" 2>/dev/null; then
+            # Also copy WAL/SHM if present
+            [[ -f "${SUPERVISOR_DB}-wal" ]] && cp "${SUPERVISOR_DB}-wal" "${backup_file}-wal" 2>/dev/null || true
+            [[ -f "${SUPERVISOR_DB}-shm" ]] && cp "${SUPERVISOR_DB}-shm" "${backup_file}-shm" 2>/dev/null || true
+            log_success "Database backed up (file copy): $backup_file"
+        else
+            # Do not leave a partial file behind to be counted as a backup
+            rm -f "$backup_file" 2>/dev/null || true
+            log_error "Failed to backup database"
+            return 1
+        fi
+    fi
+
+    # Prune old backups: keep last 5. ls -t lists newest first by mtime, so
+    # tail -n +6 yields exactly the files beyond the fifth.
+    local stale
+    # shellcheck disable=SC2012
+    stale=$(ls -1t "$SUPERVISOR_DIR"/supervisor-backup-*.db 2>/dev/null | tail -n +6) || stale=""
+    if [[ -n "$stale" ]]; then
+        local pruned=0 old_backup
+        # Here-string keeps the loop in this shell so the counter survives
+        while IFS= read -r old_backup; do
+            rm -f "$old_backup" "${old_backup}-wal" "${old_backup}-shm" 2>/dev/null || true
+            pruned=$((pruned + 1))
+        done <<< "$stale"
+        log_info "Pruned $pruned old backup(s)"
+    fi
+
+    echo "$backup_file"
+    return 0
+}
+
+#######################################
+# Restore supervisor database from backup (t162)
+# Usage: restore_db [backup_file]
+# If no file specified, lists available backups
+# Returns: 0 on success or after listing; 1 if the backup is missing,
+#          is not a supervisor database, or cannot be copied into place
+#######################################
+restore_db() {
+    local backup_file="${1:-}"
+
+    if [[ -z "$backup_file" ]]; then
+        log_info "Available backups:"
+        # "|| true": with no backups ls exits non-zero, which must not abort
+        # the listing should the script run under errexit/pipefail.
+        # shellcheck disable=SC2012
+        ls -1t "$SUPERVISOR_DIR"/supervisor-backup-*.db 2>/dev/null | while IFS= read -r f; do
+            local size
+            size=$(du -h "$f" 2>/dev/null | cut -f1)
+            local task_count
+            task_count=$(sqlite3 "$f" "SELECT count(*) FROM tasks;" 2>/dev/null || echo "?")
+            echo " $f ($size, $task_count tasks)"
+        done || true
+        return 0
+    fi
+
+    if [[ ! -f "$backup_file" ]]; then
+        log_error "Backup file not found: $backup_file"
+        return 1
+    fi
+
+    # Verify backup is valid SQLite
+    if ! sqlite3 "$backup_file" "SELECT count(*) FROM tasks;" >/dev/null 2>&1; then
+        log_error "Backup file is not a valid supervisor database: $backup_file"
+        return 1
+    fi
+
+    # Stage a private copy first: the pre-restore backup below prunes old
+    # backups and could otherwise delete the very file being restored.
+    local staged ext
+    if ! staged=$(mktemp "${SUPERVISOR_DB}.restore.XXXXXX"); then
+        log_error "Failed to create staging file for restore"
+        return 1
+    fi
+    if ! cp "$backup_file" "$staged"; then
+        log_error "Failed to read backup file: $backup_file"
+        rm -f "$staged"
+        return 1
+    fi
+    for ext in -wal -shm; do
+        if [[ -f "${backup_file}${ext}" ]]; then
+            cp "${backup_file}${ext}" "${staged}${ext}" 2>/dev/null || true
+        fi
+    done
+
+    # Backup current DB before overwriting
+    if [[ -f "$SUPERVISOR_DB" ]]; then
+        backup_db "pre-restore" >/dev/null 2>&1 || log_warn "Pre-restore backup failed, proceeding with restore"
+    fi
+
+    if ! cp "$staged" "$SUPERVISOR_DB"; then
+        log_error "Failed to restore database from: $backup_file"
+        rm -f "$staged" "${staged}-wal" "${staged}-shm"
+        return 1
+    fi
+    # The old WAL/SHM belong to the database that was just replaced; left in
+    # place, SQLite would pair them with the restored file.
+    rm -f "${SUPERVISOR_DB}-wal" "${SUPERVISOR_DB}-shm"
+    for ext in -wal -shm; do
+        if [[ -f "${staged}${ext}" ]]; then
+            mv -f "${staged}${ext}" "${SUPERVISOR_DB}${ext}" 2>/dev/null || true
+        fi
+    done
+    rm -f "$staged" "${staged}-wal" "${staged}-shm"
+
+    local task_count
+    task_count=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM tasks;") || task_count="?"
+    local batch_count
+    batch_count=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM batches;") || batch_count="?"
+
+    log_success "Database restored from: $backup_file"
+    log_info "Tasks: $task_count | Batches: $batch_count"
+    return 0
+}
+
 #######################################
 # Ensure supervisor directory and DB exist
 #######################################
@@ -475,6 +623,7 @@ ensure_db() {
     check_sql=$(db "$SUPERVISOR_DB" "SELECT sql FROM sqlite_master WHERE type='table' AND name='tasks';" 2>/dev/null || echo "")
     if [[ -n "$check_sql" ]] && ! echo "$check_sql" | grep -q 'pr_review'; then
         log_info "Migrating database schema for post-PR lifecycle states (t128.8)..."
+ backup_db "pre-migrate-t128.8" >/dev/null 2>&1 || log_warn "Backup failed, proceeding with migration" db "$SUPERVISOR_DB" << 'MIGRATE' PRAGMA foreign_keys=OFF; BEGIN TRANSACTION; @@ -499,7 +604,13 @@ CREATE TABLE tasks ( completed_at TEXT, updated_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')) ); -INSERT INTO tasks SELECT * FROM tasks_old; +INSERT INTO tasks (id, repo, description, status, session_id, worktree, branch, + log_file, retries, max_retries, model, error, pr_url, + created_at, started_at, completed_at, updated_at) +SELECT id, repo, description, status, session_id, worktree, branch, + log_file, retries, max_retries, model, error, pr_url, + created_at, started_at, completed_at, updated_at +FROM tasks_old; DROP TABLE tasks_old; CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status); CREATE INDEX IF NOT EXISTS idx_tasks_repo ON tasks(repo); @@ -565,6 +676,7 @@ MIGRATE check_sql_t148=$(db "$SUPERVISOR_DB" "SELECT sql FROM sqlite_master WHERE type='table' AND name='tasks';" 2>/dev/null || echo "") if [[ -n "$check_sql_t148" ]] && ! echo "$check_sql_t148" | grep -q 'review_triage'; then log_info "Migrating database schema for review_triage state (t148)..." 
+        backup_db "pre-migrate-t148" >/dev/null 2>&1 || log_warn "Backup failed, proceeding with migration"
         db "$SUPERVISOR_DB" << 'MIGRATE_T148'
 PRAGMA foreign_keys=OFF;
 BEGIN TRANSACTION;
@@ -740,6 +852,22 @@ cmd_init() {
     return 0
 }
 
+#######################################
+# CLI entry point: back up the supervisor database (t162)
+#######################################
+cmd_backup() {
+    # The backup label falls back to "manual" when no argument is given
+    backup_db "${1:-manual}"
+}
+
+#######################################
+# CLI entry point: restore the supervisor database from a backup (t162)
+#######################################
+cmd_restore() {
+    # With no argument an empty path is forwarded to restore_db
+    restore_db "${1:-}"
+}
+
 #######################################
 # Add a task to the supervisor
 #######################################
@@ -7132,6 +7260,8 @@ Usage:
   supervisor-helper.sh running-count [batch_id] Count active tasks
   supervisor-helper.sh reset Reset task to queued
   supervisor-helper.sh cancel Cancel task or batch
+  supervisor-helper.sh backup [reason] Backup supervisor database
+  supervisor-helper.sh restore [backup_file] Restore from backup (lists if no file)
   supervisor-helper.sh auto-pickup [--repo path] Scan TODO.md for auto-dispatch tasks
   supervisor-helper.sh cron [install|uninstall|status] Manage cron-based pulse scheduling
   supervisor-helper.sh watch [--repo path] Watch TODO.md for changes (fswatch)
@@ -7416,6 +7546,8 @@ main() {
         running-count) cmd_running_count "$@" ;;
         reset) cmd_reset "$@" ;;
         cancel) cmd_cancel "$@" ;;
+        backup) cmd_backup "$@" ;;
+        restore) cmd_restore "$@" ;;
         db) cmd_db "$@" ;;
         help|--help|-h) show_usage ;;
         *) log_error "Unknown command: $command"; show_usage; return 1 ;;
diff --git a/TODO.md b/TODO.md
index 889b4f93d..7dd94d4ae 100644
--- a/TODO.md
+++ b/TODO.md
@@ -59,6 +59,8 @@ Tasks with no open blockers - ready to work on. Use `/ready` to refresh this lis
   - Notes: Problem: supervisor adds tasks to its DB but not to TODO.md, so workers running /full-loop cannot find the task description.
Fix: 1) build_dispatch_cmd now passes task description inline in the prompt (` -- description`), 2) full-loop.md Step 0 has 3-tier resolution: inline desc > TODO.md > supervisor DB, 3) full-loop-helper.sh PR creation queries supervisor DB as fallback. Also adds headless worker rules (no user prompts, no TODO.md edits, graceful auth failure handling) and PR creation hardening (gh auth check, rebase before push, proper title/body). - [ ] t160 fix: supervisor TODO.md push fails under concurrent workers, add reconcile-todo command #bugfix #supervisor ~1h (ai:45m test:15m) logged:2026-02-08 started:2026-02-08 - Notes: Root cause: update_todo_on_complete() commits TODO.md locally but git push fails when multiple concurrent workers push to main simultaneously (non-fast-forward). The function logs a warning but treats it as non-blocking, so tasks transition to deployed even though TODO.md was never pushed. Fix: add commit_and_push_todo() helper with pull-rebase retry logic (3 attempts with backoff). Add reconcile-todo command for bulk reconciliation. Wire reconcile into pulse cycle Phase 7 (runs when no workers active). Affected 12 tasks from batch-20260208. +- [ ] t162 fix: supervisor DB safety - add backup-before-migrate and explicit column migrations #bugfix #supervisor ~30m (ai:25m test:5m) logged:2026-02-08 started:2026-02-08 + - Notes: Root cause: t128.8 migration uses INSERT INTO tasks SELECT * FROM tasks_old which fails on column count mismatch if migrations run out of order. No backup before destructive table rename/recreate migrations. Fix: 1) Add backup_db() helper with SQLite .backup, timestamped copies, auto-prune to 5. 2) Add backup before t128.8 and t148 destructive migrations. 3) Fix t128.8 SELECT * to use explicit column list. 4) Add backup/restore commands for manual recovery. 
- [x] t152 Fix `((cleaned++))` arithmetic exit code bug in setup.sh causing silent abort under `set -e` #bug #setup ~30m actual:15m (ai:15m) ref:GH#548 logged:2026-02-08 started:2026-02-08 completed:2026-02-08