diff --git a/.agents/scripts/supervisor-helper.sh b/.agents/scripts/supervisor-helper.sh index 3b1a705e4..28b4379d1 100755 --- a/.agents/scripts/supervisor-helper.sh +++ b/.agents/scripts/supervisor-helper.sh @@ -44,6 +44,7 @@ # supervisor-helper.sh verify Run post-merge verification checks (t180) # supervisor-helper.sh triage [--dry-run] [--auto-resolve] Diagnose and resolve stuck tasks # supervisor-helper.sh self-heal Create diagnostic subtask for failed/blocked task +# supervisor-helper.sh pool [args] Container pool manager (t1165.2) # supervisor-helper.sh contest [args] Model contest mode (t1011) # supervisor-helper.sh backup [reason] Backup supervisor database (t162) # supervisor-helper.sh restore [backup_file] Restore from backup (lists if no file) (t162) @@ -235,6 +236,7 @@ source "${SUPERVISOR_MODULE_DIR}/ai-lifecycle.sh" source "${SUPERVISOR_MODULE_DIR}/issue-audit.sh" source "${SUPERVISOR_MODULE_DIR}/routine-scheduler.sh" source "${SUPERVISOR_MODULE_DIR}/sanity-check.sh" +source "${SUPERVISOR_MODULE_DIR}/container-pool.sh" # Valid states for the state machine # shellcheck disable=SC2034 # Used by supervisor/state.sh @@ -366,6 +368,7 @@ Usage: supervisor-helper.sh stale-gc-report [--days N] [--json] Stale state GC metrics report (t1202) supervisor-helper.sh stale-claims [--repo path] Detect and recover stale TODO.md claims (t1263) supervisor-helper.sh labels [--action X] [--model Y] [--json] Query model usage labels (t1010) + supervisor-helper.sh pool [args] Container pool manager (t1165.2) supervisor-helper.sh ai-pipeline [full|dry-run] Run AI reasoning + action pipeline manually supervisor-helper.sh ai-status Show AI supervisor status and next-run countdown supervisor-helper.sh db [sql] Direct SQLite access @@ -800,6 +803,7 @@ main() { db) cmd_db "$@" ;; labels) cmd_labels "$@" ;; contest) cmd_contest "$@" ;; + pool) cmd_pool "$@" ;; ai-context) build_ai_context "${REPO_PATH:-$(git rev-parse --show-toplevel 2>/dev/null || pwd)}" "${1:-full}" ;; 
ai-reason) run_ai_reasoning "${REPO_PATH:-$(git rev-parse --show-toplevel 2>/dev/null || pwd)}" "${1:-full}" ;; ai-actions) diff --git a/.agents/scripts/supervisor/container-pool.sh b/.agents/scripts/supervisor/container-pool.sh new file mode 100644 index 000000000..ae6b4103b --- /dev/null +++ b/.agents/scripts/supervisor/container-pool.sh @@ -0,0 +1,850 @@ +#!/usr/bin/env bash +# container-pool.sh - Container pool manager for supervisor (t1165.2) +# +# Manages a pool of Docker/OrbStack containers for parallel worker dispatch. +# Each container has its own OAuth token, rate limit tracking, and health state. +# +# Features: +# - Spawn/destroy containers with configurable image and token injection +# - Health checks (Docker health + CLI probe) +# - Round-robin dispatch across healthy containers +# - Per-container rate limit tracking with cooldown +# - Auto-scaling: spawn on demand, destroy idle containers +# +# Integration: +# - dispatch.sh calls pool_select_container() for container-aware routing +# - pulse.sh calls pool_health_check() periodically +# - cleanup.sh calls pool_destroy_idle() for resource reclamation +# +# Database tables (in supervisor.db): +# - container_pool: container registry with health/rate-limit state +# - container_dispatch_log: per-container dispatch history for round-robin + +set -euo pipefail + +# ============================================================================= +# Constants +# ============================================================================= + +readonly CONTAINER_POOL_IMAGE="${CONTAINER_POOL_IMAGE:-aidevops-worker:latest}" +readonly CONTAINER_POOL_PREFIX="${CONTAINER_POOL_PREFIX:-aidevops-worker}" +readonly CONTAINER_POOL_MAX="${CONTAINER_POOL_MAX:-8}" +readonly CONTAINER_POOL_MIN="${CONTAINER_POOL_MIN:-0}" +readonly CONTAINER_POOL_IDLE_TIMEOUT="${CONTAINER_POOL_IDLE_TIMEOUT:-1800}" # 30 min +readonly CONTAINER_POOL_HEALTH_INTERVAL="${CONTAINER_POOL_HEALTH_INTERVAL:-120}" # 2 min +readonly 
readonly CONTAINER_POOL_RATE_LIMIT_COOLDOWN="${CONTAINER_POOL_RATE_LIMIT_COOLDOWN:-300}" # 5 min
readonly CONTAINER_POOL_HEALTH_TIMEOUT="${CONTAINER_POOL_HEALTH_TIMEOUT:-10}"            # 10 sec

# =============================================================================
# Schema
# =============================================================================

#######################################
# Create the container pool tables and their indexes when absent (t1165.2).
#
# Two tables are declared:
#   container_pool          one row per container (status, docker id, host,
#                           token reference, dispatch and rate-limit counters)
#   container_dispatch_log  one row per dispatch, linked to container_pool
#
# Every statement uses IF NOT EXISTS, so the function can be run repeatedly.
# Globals (read): SUPERVISOR_DB
# Returns: 0
#######################################
_create_container_pool_schema() {
  local schema_sql

  # `read -d ''` hits EOF without finding a NUL and reports failure even
  # though the whole heredoc was stored, hence the `|| true`.
  IFS= read -r -d '' schema_sql <<'SQL' || true
CREATE TABLE IF NOT EXISTS container_pool (
    id              TEXT PRIMARY KEY,
    name            TEXT NOT NULL UNIQUE,
    image           TEXT NOT NULL,
    status          TEXT NOT NULL DEFAULT 'stopped'
                    CHECK(status IN ('starting','healthy','unhealthy','rate_limited','stopping','stopped','failed')),
    docker_id       TEXT,
    host            TEXT DEFAULT 'local',
    oauth_token_ref TEXT,
    last_health_check TEXT,
    last_dispatch_at  TEXT,
    dispatch_count  INTEGER NOT NULL DEFAULT 0,
    rate_limit_until TEXT,
    rate_limit_count INTEGER NOT NULL DEFAULT 0,
    error           TEXT,
    created_at      TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')),
    updated_at      TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now'))
);

CREATE INDEX IF NOT EXISTS idx_container_pool_status ON container_pool(status);
CREATE INDEX IF NOT EXISTS idx_container_pool_host ON container_pool(host);

CREATE TABLE IF NOT EXISTS container_dispatch_log (
    id              INTEGER PRIMARY KEY AUTOINCREMENT,
    container_id    TEXT NOT NULL,
    task_id         TEXT NOT NULL,
    dispatched_at   TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')),
    completed_at    TEXT,
    outcome         TEXT,
    FOREIGN KEY (container_id) REFERENCES container_pool(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_cdl_container ON container_dispatch_log(container_id);
CREATE INDEX IF NOT EXISTS idx_cdl_task ON container_dispatch_log(task_id);
SQL

  # The statements are handed to the db wrapper on stdin.
  printf '%s' "$schema_sql" | db "$SUPERVISOR_DB"
  return 0
}

#
# =============================================================================
# Container Lifecycle
# =============================================================================

#######################################
# Spawn a new container and register it in the pool.
#
# Inserts a 'starting' row into container_pool, runs `docker run -d` (directly
# for host "local", otherwise through remote-dispatch-helper.sh), then stores
# the Docker ID and marks the row 'healthy'. When the spawn fails the row is
# marked 'failed' and the captured output is stored in its error column.
#
# Globals (read): SUPERVISOR_DB, SCRIPT_DIR, HOME, CONTAINER_POOL_IMAGE,
#                 CONTAINER_POOL_PREFIX, CONTAINER_POOL_MAX
# Args:
#   $1               container name (optional; "<prefix>-<n>" when omitted)
#   --image IMAGE    Docker image (default: $CONTAINER_POOL_IMAGE)
#   --token-ref REF  OAuth token reference (gopass path or env var name)
#   --host HOST      Host to spawn on (default: local)
# Outputs: pool container ID on stdout
# Returns: 0 on success; 1 on bad arguments, full pool, name collision or
#          spawn failure
#######################################
pool_spawn() {
  local name="" image="$CONTAINER_POOL_IMAGE" token_ref="" host="local"

  # First positional arg is the name
  if [[ $# -gt 0 && ! "$1" =~ ^-- ]]; then
    name="$1"
    shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
    --image | --token-ref | --host)
      # A trailing option without a value would trip `set -u` on "$2"
      # (or spin forever on a failing `shift 2` when -u is off).
      if [[ $# -lt 2 ]]; then
        log_error "pool_spawn: option $1 requires a value"
        return 1
      fi
      case "$1" in
      --image) image="$2" ;;
      --token-ref) token_ref="$2" ;;
      --host) host="$2" ;;
      esac
      shift 2
      ;;
    *)
      log_error "pool_spawn: unknown option: $1"
      return 1
      ;;
    esac
  done

  ensure_db

  # Check pool size limit (stopped/failed rows do not count)
  local current_count
  current_count=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status NOT IN ('stopped','failed');" 2>/dev/null || echo "0")
  if [[ "$current_count" -ge "$CONTAINER_POOL_MAX" ]]; then
    log_error "Container pool at capacity ($current_count/$CONTAINER_POOL_MAX)"
    return 1
  fi

  # Auto-generate name if not provided: highest numeric suffix in use + 1
  if [[ -z "$name" ]]; then
    local seq escaped_prefix
    escaped_prefix=$(sql_escape "${CONTAINER_POOL_PREFIX}-")
    seq=$(db "$SUPERVISOR_DB" "SELECT COALESCE(MAX(CAST(SUBSTR(name, LENGTH('${escaped_prefix}') + 1) AS INTEGER)), 0) + 1 FROM container_pool WHERE name LIKE '${escaped_prefix}%';" 2>/dev/null || echo "1")
    name="${CONTAINER_POOL_PREFIX}-${seq}"
  fi

  # Check for name collision (any status, the column is UNIQUE)
  local existing
  existing=$(db "$SUPERVISOR_DB" "SELECT id FROM container_pool WHERE name = '$(sql_escape "$name")';" 2>/dev/null || echo "")
  if [[ -n "$existing" ]]; then
    log_error "Container '$name' already exists (id: $existing)"
    return 1
  fi

  # Generate container ID
  local container_id
  container_id="cpool-$(date +%s)-$$-$((RANDOM % 10000))"

  # Register in DB as 'starting'
  local escaped_id escaped_name escaped_image escaped_token escaped_host
  escaped_id=$(sql_escape "$container_id")
  escaped_name=$(sql_escape "$name")
  escaped_image=$(sql_escape "$image")
  escaped_token=$(sql_escape "$token_ref")
  escaped_host=$(sql_escape "$host")

  db "$SUPERVISOR_DB" "
    INSERT INTO container_pool (id, name, image, status, oauth_token_ref, host)
    VALUES ('$escaped_id', '$escaped_name', '$escaped_image', 'starting', '$escaped_token', '$escaped_host');
  "

  log_info "Spawning container '$name' (id: $container_id, image: $image)"

  # Build docker run command
  local -a docker_args=(
    "run" "-d"
    "--name" "$name"
    "--label" "aidevops.pool=true"
    "--label" "aidevops.pool.id=$container_id"
    "--restart" "unless-stopped"
  )

  # Inject OAuth token if provided
  if [[ -n "$token_ref" ]]; then
    local token_value=""
    # Try gopass first, then env var
    if command -v gopass &>/dev/null; then
      token_value=$(gopass show "$token_ref" 2>/dev/null || echo "")
    fi
    # Only dereference as an env var when the ref is a valid identifier:
    # indirect expansion of something like "team/tokens/w1" is an
    # expansion error that would abort instead of reaching the warning.
    if [[ -z "$token_value" && "$token_ref" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      token_value="${!token_ref:-}"
    fi
    if [[ -n "$token_value" ]]; then
      # NOTE(review): the token value travels on the docker command line and
      # is therefore visible in the process list — consider `-e NAME` with
      # the value exported to the docker client environment instead.
      docker_args+=("-e" "CLAUDE_CODE_OAUTH_TOKEN=$token_value")
    else
      log_warn "OAuth token ref '$token_ref' could not be resolved — container may lack auth"
    fi
  fi

  # Mount common volumes
  docker_args+=(
    "-v" "${HOME}/.aidevops/agents:/home/worker/.aidevops/agents:ro"
    "-v" "${HOME}/.gitconfig:/home/worker/.gitconfig:ro"
  )

  docker_args+=("$image")

  # Spawn container
  local spawn_output="" spawn_rc=0 spawn_failure=""
  if [[ "$host" == "local" ]]; then
    spawn_output=$(docker "${docker_args[@]}" 2>&1) || spawn_rc=$?
    spawn_failure="Failed to spawn container '$name'"
  else
    # Remote host — delegate to remote-dispatch-helper.sh
    local remote_helper="${SCRIPT_DIR}/../remote-dispatch-helper.sh"
    if [[ ! -x "$remote_helper" ]]; then
      log_error "Remote dispatch helper not found — cannot spawn on host '$host'"
      db "$SUPERVISOR_DB" "UPDATE container_pool SET status='failed', error='remote-dispatch-helper.sh not found', updated_at=strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$escaped_id';"
      return 1
    fi
    spawn_output=$("$remote_helper" dispatch-container "$host" "${docker_args[@]}" 2>&1) || spawn_rc=$?
    spawn_failure="Failed to spawn remote container '$name' on $host"
  fi

  if [[ "$spawn_rc" -ne 0 ]]; then
    log_error "${spawn_failure}: $spawn_output"
    db "$SUPERVISOR_DB" "UPDATE container_pool SET status='failed', error='$(sql_escape "$spawn_output")', updated_at=strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$escaped_id';"
    return 1
  fi

  # stderr is captured together with stdout so failures can be reported;
  # on success keep only the last line so that warnings printed before the
  # ID do not end up inside docker_id.
  local newline=$'\n'
  local docker_id="${spawn_output##*"$newline"}"

  # Update DB with docker ID and mark healthy
  db "$SUPERVISOR_DB" "
    UPDATE container_pool
    SET docker_id = '$(sql_escape "$docker_id")',
        status = 'healthy',
        last_health_check = strftime('%Y-%m-%dT%H:%M:%SZ','now'),
        updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now')
    WHERE id = '$escaped_id';
  "

  log_success "Container '$name' spawned (docker_id: ${docker_id:0:12})"
  echo "$container_id"
  return 0
}

#######################################
# Destroy a container from the pool.
#
# Stops and removes the Docker container (locally or through
# remote-dispatch-helper.sh) and leaves the registry row in state 'stopped'.
#
# Globals (read): SUPERVISOR_DB, SCRIPT_DIR
# Args:
#   $1       container name or ID
#   --force  Destroy even if dispatches are still open for the container
# Returns: 0 on success; 1 when the target is missing, unknown, or has open
#          dispatches and --force was not given
#######################################
pool_destroy() {
  local target="${1:-}"
  local force=false
  shift || true

  # Unrecognised arguments are ignored.
  while [[ $# -gt 0 ]]; do
    if [[ "$1" == "--force" ]]; then
      force=true
    fi
    shift
  done

  if [[ -z "$target" ]]; then
    log_error "Usage: pool_destroy <name|id> [--force]"
    return 1
  fi

  ensure_db

  # Columns are split on the unit separator (0x1f), not on tab: tab is IFS
  # whitespace, so `read` would merge the two tabs around an empty docker_id
  # and the host value would be taken for the Docker ID.
  local field_sep=$'\x1f'
  local escaped_target
  escaped_target=$(sql_escape "$target")

  # Look up container by name or ID
  local container_row
  container_row=$(db -separator "$field_sep" "$SUPERVISOR_DB" "
    SELECT id, name, docker_id, host, status
    FROM container_pool
    WHERE id = '$escaped_target' OR name = '$escaped_target'
    LIMIT 1;
  " 2>/dev/null || echo "")

  if [[ -z "$container_row" ]]; then
    log_error "Container not found: $target"
    return 1
  fi

  local cid cname cdocker_id chost _cstatus
  IFS="$field_sep" read -r cid cname cdocker_id chost _cstatus <<<"$container_row"

  local escaped_cid
  escaped_cid=$(sql_escape "$cid")

  # Check for active dispatches unless --force
  if [[ "$force" != "true" ]]; then
    local active_dispatches
    active_dispatches=$(db "$SUPERVISOR_DB" "
      SELECT COUNT(*) FROM container_dispatch_log
      WHERE container_id = '$escaped_cid' AND completed_at IS NULL;
    " 2>/dev/null || echo "0")
    if [[ "$active_dispatches" -gt 0 ]]; then
      log_error "Container '$cname' has $active_dispatches active dispatches — use --force to override"
      return 1
    fi
  fi

  # Mark as stopping
  db "$SUPERVISOR_DB" "UPDATE container_pool SET status='stopping', updated_at=strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$escaped_cid';"

  log_info "Destroying container '$cname' (docker_id: ${cdocker_id:0:12})"

  # Stop and remove Docker container; best effort, failures are ignored so
  # the registry row still reaches 'stopped'.
  if [[ -n "$cdocker_id" ]]; then
    if [[ "$chost" == "local" ]]; then
      docker stop "$cdocker_id" 2>/dev/null || true
      docker rm -f "$cdocker_id" 2>/dev/null || true
    else
      local remote_helper="${SCRIPT_DIR}/../remote-dispatch-helper.sh"
      if [[ -x "$remote_helper" ]]; then
        "$remote_helper" cleanup-container "$chost" "$cdocker_id" 2>/dev/null || true
      fi
    fi
  fi

  # Mark as stopped
  db "$SUPERVISOR_DB" "UPDATE container_pool SET status='stopped', updated_at=strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$escaped_cid';"

  log_success "Container '$cname' destroyed"
  return 0
}

# =============================================================================
# Health Checks
# =============================================================================

#######################################
# Run health check on a single container
# Args:
#   $1 - container ID or name
# Returns: 0 if healthy, 1 if unhealthy
#######################################
pool_health_check_one() {
  # Looks the container up by ID or name (stopped/failed rows are ignored),
  # probes it and writes the resulting status back to container_pool.
  # Reads SUPERVISOR_DB; returns 1 when no matching active row exists.
  local target="$1"

  # Columns are split on the unit separator (0x1f). Tab is IFS whitespace,
  # so `read` would merge the tabs around an empty docker_id (a row that is
  # still 'starting', for instance) and shift host/status one field left,
  # which made the local docker probe below get skipped.
  local field_sep=$'\x1f'
  local escaped_target
  escaped_target=$(sql_escape "$target")

  local container_row
  container_row=$(db -separator "$field_sep" "$SUPERVISOR_DB" "
    SELECT id, name, docker_id, host, status
    FROM container_pool
    WHERE (id = '$escaped_target' OR name = '$escaped_target')
      AND status NOT IN ('stopped','failed')
    LIMIT 1;
  " 2>/dev/null || echo "")

  if [[ -z "$container_row" ]]; then
    return 1
  fi

  local cid cname cdocker_id chost cstatus
  IFS="$field_sep" read -r cid cname cdocker_id chost cstatus <<<"$container_row"

  local escaped_cid
  escaped_cid=$(sql_escape "$cid")

  local is_healthy=true
  local health_error=""

  # Check 1: Docker container is running.
  # Only performed for host "local"; rows on other hosts are not probed here.
  if [[ "$chost" == "local" ]]; then
    local docker_state="missing"
    if [[ -n "$cdocker_id" ]]; then
      docker_state=$(docker inspect --format='{{.State.Status}}' "$cdocker_id" 2>/dev/null || echo "missing")
    fi
    if [[ "$docker_state" != "running" ]]; then
      is_healthy=false
      health_error="docker_state=$docker_state"
    fi
  fi

  # Check 2: If rate-limited, check if cooldown has expired
  if [[ "$cstatus" == "rate_limited" ]]; then
    local rate_limit_until
    rate_limit_until=$(db "$SUPERVISOR_DB" "SELECT COALESCE(rate_limit_until, '') FROM container_pool WHERE id='$escaped_cid';" 2>/dev/null || echo "")
    if [[ -n "$rate_limit_until" ]]; then
      local now_ts
      now_ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)
      # Both values use the same fixed-width UTC format, so a string
      # comparison orders them chronologically.
      if [[ "$now_ts" < "$rate_limit_until" ]]; then
        # Still rate-limited — not healthy for dispatch but container is alive
        db "$SUPERVISOR_DB" "UPDATE container_pool SET last_health_check=strftime('%Y-%m-%dT%H:%M:%SZ','now'), updated_at=strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$escaped_cid';"
        return 1
      fi
      # Cooldown expired — clear rate limit
      log_info "Container '$cname' rate limit cooldown expired — marking healthy"
    fi
  fi

  # Update health status
  local new_status="healthy"
  if [[ "$is_healthy" != "true" ]]; then
    new_status="unhealthy"
  fi

  db "$SUPERVISOR_DB" "
    UPDATE container_pool
    SET status = '$new_status',
        last_health_check = strftime('%Y-%m-%dT%H:%M:%SZ','now'),
        error = '$(sql_escape "$health_error")',
        rate_limit_until = CASE WHEN '$new_status' = 'healthy' THEN NULL ELSE rate_limit_until END,
        updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now')
    WHERE id = '$escaped_cid';
  "

  if [[ "$is_healthy" == "true" ]]; then
    return 0
  fi
  return 1
}

#######################################
# Run health checks on all active containers in the pool.
#
# Every row whose status is neither 'stopped' nor 'failed' is passed to
# pool_health_check_one, in name order.
# Globals (read): SUPERVISOR_DB
# Outputs: count of healthy containers on stdout
# Returns: 0
#######################################
pool_health_check_all() {
  ensure_db

  local active_ids
  active_ids=$(db "$SUPERVISOR_DB" "
    SELECT id FROM container_pool
    WHERE status NOT IN ('stopped','failed')
    ORDER BY name;
  " 2>/dev/null || echo "")

  if [[ -z "$active_ids" ]]; then
    echo "0"
    return 0
  fi

  local healthy_count=0 total_count=0 one_id
  while IFS= read -r one_id; do
    if [[ -z "$one_id" ]]; then
      continue
    fi
    total_count=$((total_count + 1))
    if pool_health_check_one "$one_id"; then
      healthy_count=$((healthy_count + 1))
    fi
  done <<<"$active_ids"

  log_info "Pool health: $healthy_count/$total_count containers healthy"
  echo "$healthy_count"
  return 0
}

# =============================================================================
# Round-Robin Dispatch
# =============================================================================

#######################################
# Select the next container for dispatch using round-robin
# Picks the healthy container with the oldest last_dispatch_at timestamp,
# skipping rate-limited containers.
#
# Args:
#   --host HOST   Filter by host (optional)
# Returns: container ID on stdout, 0 if found, 1 if no container available
#          (or when --host is given without a value)
#######################################
pool_select_container() {
  local host_filter=""

  # Unrecognised arguments are ignored.
  while [[ $# -gt 0 ]]; do
    case "$1" in
    --host)
      # Without this guard a trailing --host trips `set -u` on "$2"
      # (or loops forever on a failing `shift 2` when -u is off).
      if [[ $# -lt 2 ]]; then
        log_error "pool_select_container: option --host requires a value"
        return 1
      fi
      host_filter="$2"
      shift 2
      ;;
    *) shift ;;
    esac
  done

  ensure_db

  local host_clause=""
  if [[ -n "$host_filter" ]]; then
    host_clause="AND host = '$(sql_escape "$host_filter")'"
  fi

  # Select healthy container with oldest dispatch time (round-robin).
  # Containers that were never dispatched to sort first (epoch default).
  local selected
  selected=$(db "$SUPERVISOR_DB" "
    SELECT id FROM container_pool
    WHERE status = 'healthy'
      AND (rate_limit_until IS NULL OR rate_limit_until <= strftime('%Y-%m-%dT%H:%M:%SZ','now'))
      $host_clause
    ORDER BY COALESCE(last_dispatch_at, '1970-01-01T00:00:00Z') ASC
    LIMIT 1;
  " 2>/dev/null || echo "")

  if [[ -z "$selected" ]]; then
    log_verbose "No healthy container available for dispatch"
    return 1
  fi

  echo "$selected"
  return 0
}

#######################################
# Record a dispatch to a container (updates round-robin state).
#
# Adds an open row to container_dispatch_log, then stamps last_dispatch_at
# and increments dispatch_count on the container.
# Globals (read): SUPERVISOR_DB
# Args:
#   $1 - container ID
#   $2 - task ID
# Returns: 0
#######################################
pool_record_dispatch() {
  local container_id="$1"
  local task_id="$2"

  local escaped_container escaped_task
  escaped_container=$(sql_escape "$container_id")
  escaped_task=$(sql_escape "$task_id")

  db "$SUPERVISOR_DB" "
    INSERT INTO container_dispatch_log (container_id, task_id)
    VALUES ('$escaped_container', '$escaped_task');
  "

  db "$SUPERVISOR_DB" "
    UPDATE container_pool
    SET last_dispatch_at = strftime('%Y-%m-%dT%H:%M:%SZ','now'),
        dispatch_count = dispatch_count + 1,
        updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now')
    WHERE id = '$escaped_container';
  "

  return 0
}

#######################################
# Record dispatch completion for a container.
#
# Closes every still-open dispatch log row of the container/task pair. An
# outcome of "rate_limited" additionally puts the container into cooldown
# through pool_mark_rate_limited.
# Globals (read): SUPERVISOR_DB
# Args:
#   $1 - container ID
#   $2 - task ID
#   $3 - outcome (e.g., "complete", "failed", "rate_limited")
# Returns: 0
#######################################
pool_record_completion() {
  local container_id="$1"
  local task_id="$2"
  local outcome="$3"

  db "$SUPERVISOR_DB" "
    UPDATE container_dispatch_log
    SET completed_at = strftime('%Y-%m-%dT%H:%M:%SZ','now'),
        outcome = '$(sql_escape "$outcome")'
    WHERE container_id = '$(sql_escape "$container_id")'
      AND task_id = '$(sql_escape "$task_id")'
      AND completed_at IS NULL;
  "

  # If rate-limited, mark container with cooldown
  if [[ "$outcome" == "rate_limited" ]]; then
    pool_mark_rate_limited "$container_id"
  fi

  return 0
}
# =============================================================================
# Rate Limit Tracking
# =============================================================================

#######################################
# Mark a container as rate-limited with cooldown.
#
# Sets status 'rate_limited', stores the cooldown expiry in rate_limit_until
# and increments rate_limit_count.
# Globals (read): SUPERVISOR_DB, CONTAINER_POOL_RATE_LIMIT_COOLDOWN
# Args:
#   $1 - container ID (a container name is accepted as well, because the
#        `pool rate-limit` CLI passes either)
#   $2 - cooldown seconds (optional, default: CONTAINER_POOL_RATE_LIMIT_COOLDOWN)
# Returns: 0 on success, 1 when the cooldown is not a whole number of seconds
#######################################
pool_mark_rate_limited() {
  local container_id="$1"
  local cooldown="${2:-$CONTAINER_POOL_RATE_LIMIT_COOLDOWN}"

  # The value is spliced into a SQL date modifier, so accept digits only.
  if [[ ! "$cooldown" =~ ^[0-9]+$ ]]; then
    log_error "pool_mark_rate_limited: cooldown must be a whole number of seconds (got '$cooldown')"
    return 1
  fi

  local escaped_target
  escaped_target=$(sql_escape "$container_id")

  db "$SUPERVISOR_DB" "
    UPDATE container_pool
    SET status = 'rate_limited',
        rate_limit_until = strftime('%Y-%m-%dT%H:%M:%SZ','now','+${cooldown} seconds'),
        rate_limit_count = rate_limit_count + 1,
        updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now')
    WHERE id = '$escaped_target' OR name = '$escaped_target';
  "

  # Resolve the display name for the log line; fall back to the given value
  local cname
  cname=$(db "$SUPERVISOR_DB" "SELECT name FROM container_pool WHERE id='$escaped_target' OR name = '$escaped_target' LIMIT 1;" 2>/dev/null || echo "$container_id")
  log_warn "Container '$cname' marked rate_limited (cooldown: ${cooldown}s)"
  return 0
}

#######################################
# Clear rate limit on a container (manual override).
#
# Only rows currently in status 'rate_limited' are changed; they become
# 'healthy' with rate_limit_until reset to NULL.
# Globals (read): SUPERVISOR_DB
# Args:
#   $1 - container ID or name
# Returns: 0
#######################################
pool_clear_rate_limit() {
  local target="$1"
  local escaped_target
  escaped_target=$(sql_escape "$target")

  db "$SUPERVISOR_DB" "
    UPDATE container_pool
    SET status = 'healthy',
        rate_limit_until = NULL,
        updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now')
    WHERE (id = '$escaped_target' OR name = '$escaped_target')
      AND status = 'rate_limited';
  "

  log_info "Rate limit cleared for container '$target'"
  return 0
}

# =============================================================================
# Pool Management
# =============================================================================

#######################################
# Destroy idle containers (no dispatch in CONTAINER_POOL_IDLE_TIMEOUT seconds).
# Respects CONTAINER_POOL_MIN — won't destroy below minimum pool size.
#
# Candidates are healthy or rate-limited containers without open dispatches,
# least recently used first.
# Globals (read): SUPERVISOR_DB, CONTAINER_POOL_MIN, CONTAINER_POOL_IDLE_TIMEOUT
# Args:
#   --dry-run  Show what would be destroyed without acting
# Outputs: count of destroyed containers on stdout (0 for a dry run)
# Returns: 0
#######################################
pool_destroy_idle() {
  local dry_run=false

  # Unrecognised arguments are ignored.
  while [[ $# -gt 0 ]]; do
    if [[ "$1" == "--dry-run" ]]; then
      dry_run=true
    fi
    shift
  done

  ensure_db

  local healthy_count
  healthy_count=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status IN ('healthy','rate_limited');" 2>/dev/null || echo "0")

  # Don't destroy below minimum
  local destroyable=$((healthy_count - CONTAINER_POOL_MIN))
  if [[ "$destroyable" -le 0 ]]; then
    log_verbose "Pool at minimum size ($healthy_count/$CONTAINER_POOL_MIN) — no idle cleanup"
    echo "0"
    return 0
  fi

  # Find idle containers (no dispatch in timeout window, no active tasks)
  local idle_containers
  idle_containers=$(db "$SUPERVISOR_DB" "
    SELECT cp.id, cp.name
    FROM container_pool cp
    WHERE cp.status IN ('healthy','rate_limited')
      AND (cp.last_dispatch_at IS NULL
           OR cp.last_dispatch_at < strftime('%Y-%m-%dT%H:%M:%SZ','now','-${CONTAINER_POOL_IDLE_TIMEOUT} seconds'))
      AND NOT EXISTS (
        SELECT 1 FROM container_dispatch_log cdl
        WHERE cdl.container_id = cp.id AND cdl.completed_at IS NULL
      )
    ORDER BY COALESCE(cp.last_dispatch_at, cp.created_at) ASC
    LIMIT $destroyable;
  " 2>/dev/null || echo "")

  if [[ -z "$idle_containers" ]]; then
    echo "0"
    return 0
  fi

  # Rows are split on '|' (no -separator option is passed to db above)
  local destroyed=0 cid cname
  while IFS='|' read -r cid cname; do
    if [[ -z "$cid" ]]; then
      continue
    fi
    if [[ "$dry_run" == "true" ]]; then
      log_info "[dry-run] Would destroy idle container: $cname ($cid)"
    elif pool_destroy "$cid" --force 2>/dev/null; then
      destroyed=$((destroyed + 1))
    fi
  done <<<"$idle_containers"

  if [[ "$destroyed" -gt 0 ]]; then
    log_info "Destroyed $destroyed idle container(s)"
  fi
  echo "$destroyed"
  return 0
}

#######################################
# List all containers in the pool.
#
# Globals (read): SUPERVISOR_DB
# Args:
#   --status STATUS  Filter by status
#   --format json    Output as JSON
# Outputs: with --format json, only the JSON produced by the query;
#          otherwise a formatted table followed by a summary line
# Returns: 0 on success, 1 when an option is missing its value
#######################################
pool_list() {
  local status_filter="" format="table"

  # Unrecognised arguments are ignored.
  while [[ $# -gt 0 ]]; do
    case "$1" in
    --status | --format)
      if [[ $# -lt 2 ]]; then
        log_error "pool_list: option $1 requires a value"
        return 1
      fi
      if [[ "$1" == "--status" ]]; then
        status_filter="$2"
      else
        format="$2"
      fi
      shift 2
      ;;
    *) shift ;;
    esac
  done

  ensure_db

  local where_clause=""
  if [[ -n "$status_filter" ]]; then
    where_clause="WHERE status = '$(sql_escape "$status_filter")'"
  fi

  if [[ "$format" == "json" ]]; then
    db "$SUPERVISOR_DB" ".mode json" "
      SELECT id, name, image, status, host, docker_id,
             dispatch_count, rate_limit_count,
             last_dispatch_at, last_health_check,
             rate_limit_until, error, created_at
      FROM container_pool $where_clause
      ORDER BY name;
    "
    # Stop here: appending the human-readable summary would leave stdout
    # unparseable as JSON.
    return 0
  fi

  echo ""
  echo "Container Pool:"
  echo "==============="
  db -column -header "$SUPERVISOR_DB" "
    SELECT name AS Name,
           status AS Status,
           host AS Host,
           dispatch_count AS Dispatches,
           rate_limit_count AS 'Rate Limits',
           COALESCE(SUBSTR(last_dispatch_at, 12, 8), '-') AS 'Last Dispatch',
           COALESCE(SUBSTR(rate_limit_until, 12, 8), '-') AS 'RL Until',
           COALESCE(SUBSTR(error, 1, 30), '-') AS Error
    FROM container_pool $where_clause
    ORDER BY name;
  "

  # Summary line: "Total" honours --status, the other counters are pool-wide
  local total healthy rl stopped
  total=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool $where_clause;" 2>/dev/null || echo "0")
  healthy=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='healthy';" 2>/dev/null || echo "0")
  rl=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='rate_limited';" 2>/dev/null || echo "0")
  stopped=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='stopped';" 2>/dev/null || echo "0")

  echo ""
  echo "Total: $total | Healthy: $healthy | Rate-limited: $rl | Stopped: $stopped"
  return 0
}
#######################################
# Get pool statistics.
#
# Counts containers per status, sums/averages dispatch counters and counts
# dispatch log rows that are still open.
# Globals (read): SUPERVISOR_DB
# Outputs: JSON object with pool stats on stdout
# Returns: 0
#######################################
pool_stats() {
  ensure_db

  local total healthy unhealthy rate_limited stopped failed
  total=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool;" 2>/dev/null || echo "0")
  healthy=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='healthy';" 2>/dev/null || echo "0")
  unhealthy=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='unhealthy';" 2>/dev/null || echo "0")
  rate_limited=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='rate_limited';" 2>/dev/null || echo "0")
  stopped=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='stopped';" 2>/dev/null || echo "0")
  failed=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_pool WHERE status='failed';" 2>/dev/null || echo "0")

  # The average only covers containers that are not stopped/failed
  local total_dispatches avg_dispatches
  total_dispatches=$(db "$SUPERVISOR_DB" "SELECT COALESCE(SUM(dispatch_count), 0) FROM container_pool;" 2>/dev/null || echo "0")
  avg_dispatches=$(db "$SUPERVISOR_DB" "SELECT COALESCE(ROUND(AVG(dispatch_count), 1), 0) FROM container_pool WHERE status NOT IN ('stopped','failed');" 2>/dev/null || echo "0")

  local active_tasks
  active_tasks=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM container_dispatch_log WHERE completed_at IS NULL;" 2>/dev/null || echo "0")

  # NOTE(review): the body of this heredoc was lost from the reviewed text
  # (only a dangling `cat <` survived). The object below is rebuilt from the
  # values computed above; key names mirror the local variable names and
  # must be confirmed against whatever consumes this output.
  cat <<EOF
{
  "total": $total,
  "healthy": $healthy,
  "unhealthy": $unhealthy,
  "rate_limited": $rate_limited,
  "stopped": $stopped,
  "failed": $failed,
  "total_dispatches": $total_dispatches,
  "avg_dispatches": $avg_dispatches,
  "active_tasks": $active_tasks
}
EOF
  return 0
}

# =============================================================================
# CLI Command
# =============================================================================

#######################################
# CLI entry point for the container pool.
# Usage: supervisor-helper.sh pool <subcommand> [args]
#
# Dispatches the subcommand to the matching pool_* function; with no
# subcommand it behaves like `status`.
# Args:
#   $1   subcommand (spawn|destroy|list|status|stats|health|select|
#        rate-limit|cleanup|help)
#   $@   remaining arguments, passed to the subcommand
# Returns: exit status of the subcommand; 1 for an unknown subcommand or an
#          invalid rate-limit invocation
#######################################
cmd_pool() {
  local subcmd="${1:-status}"
  shift || true

  case "$subcmd" in
  spawn) pool_spawn "$@" ;;
  destroy) pool_destroy "$@" ;;
  list) pool_list "$@" ;;
  status | stats) pool_stats "$@" ;;
  health) pool_health_check_all "$@" ;;
  select) pool_select_container "$@" ;;
  rate-limit)
    local _rl_target="${1:-}"
    local _rl_action="${2:-set}"
    if [[ -z "$_rl_target" ]]; then
      log_error "Usage: pool rate-limit <name|id> [set|clear]"
      return 1
    fi
    case "$_rl_action" in
    # An absent cooldown is forwarded as "" so the callee applies its default
    set) pool_mark_rate_limited "$_rl_target" "${3:-}" ;;
    clear) pool_clear_rate_limit "$_rl_target" ;;
    *)
      log_error "Unknown rate-limit action: $_rl_action (use set|clear)"
      return 1
      ;;
    esac
    ;;
  cleanup) pool_destroy_idle "$@" ;;
  help)
    cat <<'EOF'
supervisor-helper.sh pool <subcommand> [args]

Subcommands:
  spawn [name] [--image X] [--token-ref X] [--host X]   Spawn a new container
  destroy <name|id> [--force]                           Destroy a container
  list [--status X] [--format json]                     List containers
  status                                                Pool statistics (JSON)
  health                                                Run health checks on all containers
  select [--host X]                                     Select next container (round-robin)
  rate-limit <name|id> [set|clear] [cooldown_secs]      Manage rate limits
  cleanup [--dry-run]                                   Destroy idle containers
  help                                                  Show this help
EOF
    ;;
  *)
    log_error "Unknown pool subcommand: $subcmd"
    return 1
    ;;
  esac
  return $?
}
+ local has_container_pool + has_container_pool=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='container_pool';" 2>/dev/null || echo "0") + if [[ "$has_container_pool" -eq 0 ]]; then + log_info "Creating container_pool tables (t1165.2)..." + _create_container_pool_schema + log_success "Created container_pool and container_dispatch_log tables (t1165.2)" + fi + # Prune old action_dedup_log entries (keep last 7 days) db "$SUPERVISOR_DB" "DELETE FROM action_dedup_log WHERE created_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-7 days');" 2>/dev/null || true @@ -916,6 +927,9 @@ SQL # Action dedup log — schema defined in _create_action_dedup_log_schema() (t1138) _create_action_dedup_log_schema + # Container pool — schema defined in _create_container_pool_schema() (t1165.2) + _create_container_pool_schema + log_success "Initialized supervisor database: $SUPERVISOR_DB" return 0 } diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh index df035d21b..1a4dadab3 100755 --- a/.agents/scripts/supervisor/dispatch.sh +++ b/.agents/scripts/supervisor/dispatch.sh @@ -3219,6 +3219,20 @@ cmd_dispatch() { } >"$wrapper_script" chmod +x "$wrapper_script" + # t1165.2: Container pool dispatch — if pool has healthy containers, + # select one via round-robin and record the dispatch. The container ID + # is stored in dispatch_target as "pool:" for tracking. + # Pool dispatch is opt-in: only used when SUPERVISOR_USE_CONTAINER_POOL=true + # or when the task has dispatch_target=pool. 
+ local pool_container_id="" + if [[ "${SUPERVISOR_USE_CONTAINER_POOL:-false}" == "true" ]]; then + pool_container_id=$(pool_select_container 2>/dev/null) || pool_container_id="" + if [[ -n "$pool_container_id" ]]; then + log_info "Container pool: selected $pool_container_id for $task_id (t1165.2)" + pool_record_dispatch "$pool_container_id" "$task_id" 2>/dev/null || true + fi + fi + # t1165.3: Remote dispatch — check if task has a dispatch_target set. # If so, route to remote-dispatch-helper.sh instead of local process. local dispatch_target="" diff --git a/tests/test-container-pool.sh b/tests/test-container-pool.sh new file mode 100644 index 000000000..b83329343 --- /dev/null +++ b/tests/test-container-pool.sh @@ -0,0 +1,583 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2034,SC1090 +# SC2034: Variables set for sourced scripts (BLUE, SUPERVISOR_DB, etc.) +# SC1090: Non-constant source paths (test harness pattern) +# +# test-container-pool.sh (t1165.2) +# +# Unit tests for the container pool manager: +# 1. Schema creation (container_pool, container_dispatch_log tables) +# 2. Pool spawn — registers container in DB with correct state +# 3. Pool destroy — transitions to stopped, handles active dispatch guard +# 4. Health checks — updates status based on container state +# 5. Round-robin selection — picks container with oldest dispatch time +# 6. Per-container rate limit tracking — cooldown and expiry +# 7. Idle container cleanup — respects pool minimum +# 8. Pool stats — correct JSON output +# 9. Dispatch recording — updates round-robin state +# 10. Integration: dispatch.sh pool_select_container hook +# +# This test does NOT spawn real Docker containers — it tests the DB-level +# pool management logic by directly calling functions and querying SQLite. +# +# Usage: bash tests/test-container-pool.sh [--verbose] +# +# Exit codes: 0 = all pass, 1 = failures found + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)"
+SCRIPTS_DIR="$REPO_DIR/.agents/scripts"
+SUPERVISOR_SCRIPT="$SCRIPTS_DIR/supervisor-helper.sh"
+SUPERVISOR_DIR_MODULE="$SCRIPTS_DIR/supervisor"
+SHARED_CONSTANTS="$SCRIPTS_DIR/shared-constants.sh"
+VERBOSE="${1:-}"
+
+# --- Test Framework ---
+# Running tallies; every pass/fail/skip call also bumps TOTAL_COUNT.
+PASS_COUNT=0
+FAIL_COUNT=0
+SKIP_COUNT=0
+TOTAL_COUNT=0
+
+# pass LABEL — count a passing check and print a green PASS line.
+pass() {
+	((PASS_COUNT += 1))
+	((TOTAL_COUNT += 1))
+	printf " \033[0;32mPASS\033[0m %s\n" "$1"
+	return 0
+}
+
+# fail LABEL [DETAIL] — count a failing check and print a red FAIL line,
+# followed by a second line carrying DETAIL when it is non-empty.
+fail() {
+	((FAIL_COUNT += 1))
+	((TOTAL_COUNT += 1))
+	printf " \033[0;31mFAIL\033[0m %s\n" "$1"
+	[[ -z "${2:-}" ]] || printf " %s\n" "$2"
+	return 0
+}
+
+# skip LABEL — count a skipped check and print a yellow SKIP line.
+skip() {
+	((SKIP_COUNT += 1))
+	((TOTAL_COUNT += 1))
+	printf " \033[0;33mSKIP\033[0m %s\n" "$1"
+	return 0
+}
+
+# section TITLE — print a blank line followed by a bold section banner.
+section() {
+	printf "\n\033[1m=== %s ===\033[0m\n" "$1"
+	return 0
+}
+
+# verbose MESSAGE — print MESSAGE only when the script was run with --verbose.
+verbose() {
+	[[ "$VERBOSE" != "--verbose" ]] || printf " [verbose] %s\n" "$1"
+	return 0
+}
+
+# --- Isolated Test Environment ---
+TEST_DIR=$(mktemp -d)
+export AIDEVOPS_SUPERVISOR_DIR="$TEST_DIR/supervisor"
+
+# Mock docker CLI — records invocations, returns fake container IDs
+MOCK_BIN="$TEST_DIR/mock-bin"
+mkdir -p "$MOCK_BIN"
+
+cat >"$MOCK_BIN/docker" <<'MOCK_DOCKER'
+#!/usr/bin/env bash
+MOCK_LOG="${MOCK_DOCKER_LOG:-/tmp/mock-docker-invocations.log}"
+echo "MOCK_DOCKER_INVOKED: $*" >> "$MOCK_LOG"
+case "${1:-}" in
+	run)
+		# Return a fake container ID
+		echo "abc123def456"
+		;;
+	stop|rm)
+		;;
+	inspect)
+		# Report the configured state; "running" unless MOCK_DOCKER_STATE overrides it
+		echo "${MOCK_DOCKER_STATE:-running}"
+		;;
+	*)
+		echo "docker mock: $*" >&2
+		;;
+esac
+# Every subcommand of the mock succeeds
+exit 0
+MOCK_DOCKER
+chmod +x "$MOCK_BIN/docker"
+
+# Put the mock first on PATH and point its invocation log inside TEST_DIR
+export MOCK_DOCKER_LOG="$TEST_DIR/mock-docker-invocations.log"
+export PATH="$MOCK_BIN:$PATH"
+
+# Remove the whole temporary test directory on any exit path.
+# shellcheck disable=SC2317,SC2329
+cleanup() {
+	rm -rf "$TEST_DIR"
+}
+trap cleanup EXIT
+
+# --- Source supervisor
modules for direct function testing ---
+# We source the modules directly to test functions without spawning subprocesses.
+# This requires setting up the same globals that supervisor-helper.sh defines.
+
+SCRIPT_DIR="$SUPERVISOR_DIR_MODULE"
+source "$SHARED_CONSTANTS"
+
+readonly SUPERVISOR_DIR="$TEST_DIR/supervisor"
+readonly SUPERVISOR_DB="$SUPERVISOR_DIR/supervisor.db"
+SUPERVISOR_LOG="$TEST_DIR/supervisor.log"
+export SUPERVISOR_LOG
+
+# Colour constants — shared-constants.sh already defines BLUE, GREEN, etc.
+# Only define BOLD/DIM if not already set (they are defined in supervisor-helper.sh
+# but not in shared-constants.sh).
+[[ -z "${BOLD+x}" ]] && readonly BOLD='\033[1m'
+[[ -z "${DIM+x}" ]] && readonly DIM='\033[2m'
+
+# Source modules
+source "$SUPERVISOR_DIR_MODULE/_common.sh"
+source "$SUPERVISOR_DIR_MODULE/database.sh"
+source "$SUPERVISOR_DIR_MODULE/container-pool.sh"
+
+# Helper: query the test DB directly.
+# Arguments are passed straight to sqlite3 (normally a single SQL string).
+test_db() {
+	sqlite3 -cmd ".timeout 5000" "$SUPERVISOR_DB" "$@"
+}
+
+# Helper: run a COUNT(*) query and pass only when the result is exactly "1".
+# An exact comparison is used instead of `grep -q "1"`, which would also accept
+# any output that merely contains the digit 1 (10, 21, an error message, ...).
+# Arguments: $1 - SQL query, $2 - label on success, $3 - label on failure
+expect_count_one() {
+	local count
+	count=$(test_db "$1") || count=""
+	if [[ "$count" == "1" ]]; then
+		pass "$2"
+	else
+		fail "$3" "query returned: '$count'"
+	fi
+	return 0
+}
+
+# =============================================================================
+# Tests
+# =============================================================================
+
+section "Schema Creation"
+
+# Initialize DB (creates all tables including container_pool)
+mkdir -p "$SUPERVISOR_DIR"
+ensure_db >/dev/null 2>&1
+
+expect_count_one \
+	"SELECT count(*) FROM sqlite_master WHERE type='table' AND name='container_pool';" \
+	"container_pool table created" \
+	"container_pool table not created"
+
+expect_count_one \
+	"SELECT count(*) FROM sqlite_master WHERE type='table' AND name='container_dispatch_log';" \
+	"container_dispatch_log table created" \
+	"container_dispatch_log table not created"
+
+# Verify columns
+expect_count_one \
+	"SELECT count(*) FROM pragma_table_info('container_pool') WHERE name='rate_limit_until';" \
+	"container_pool has rate_limit_until column" \
+	"container_pool missing rate_limit_until column"
+
+expect_count_one \
+	"SELECT count(*) FROM pragma_table_info('container_pool') WHERE name='dispatch_count';" \
+	"container_pool has dispatch_count column" \
+	"container_pool missing dispatch_count column"
+
+# =============================================================================
+section "Pool Spawn"
+
+# Test: spawn with auto-generated name
+spawn_result=$(pool_spawn --image "test-image:latest" --token-ref "test-token" 2>/dev/null) || true
+if [[ -n "$spawn_result" && "$spawn_result" == cpool-* ]]; then
+	pass "pool_spawn returns container ID ($spawn_result)"
+else
+	fail "pool_spawn did not return valid container ID" "got: $spawn_result"
+fi
+
+# Verify DB state
+spawn_status=$(test_db "SELECT status FROM container_pool WHERE id='$spawn_result';")
+if [[ "$spawn_status" == "healthy" ]]; then
+	pass "spawned container status is 'healthy'"
+else
+	fail "spawned container status is '$spawn_status' (expected 'healthy')"
+fi
+
+spawn_name=$(test_db "SELECT name FROM container_pool WHERE id='$spawn_result';")
+if [[ "$spawn_name" == aidevops-worker-* ]]; then
+	pass "auto-generated name follows prefix pattern ($spawn_name)"
+else
+	fail "auto-generated name unexpected: $spawn_name"
+fi
+
+spawn_image=$(test_db "SELECT image FROM container_pool WHERE id='$spawn_result';")
+if [[ "$spawn_image" == "test-image:latest" ]]; then
+	pass "image stored correctly"
+else
+	fail "image mismatch: $spawn_image"
+fi
+
+# Test: spawn with explicit name
+spawn2_result=$(pool_spawn "my-worker" --image "test-image:latest" 2>/dev/null) || true
+if [[ -n "$spawn2_result" ]]; then
+	pass "pool_spawn with explicit name succeeds"
+else
+	fail "pool_spawn with explicit name failed"
+fi
+
+spawn2_name=$(test_db "SELECT name FROM container_pool WHERE id='$spawn2_result';")
+if [[ "$spawn2_name" == "my-worker" ]]; then
+	pass "explicit name stored correctly"
+else
+	fail "explicit name mismatch: $spawn2_name"
+fi
+
+# Test: duplicate name rejection
+# Capture the exit status explicitly rather than with `cmd && rc=0 || rc=$?`.
+spawn3_rc=0
+spawn3_result=$(pool_spawn "my-worker" 2>/dev/null) || spawn3_rc=$?
+if [[ "$spawn3_rc" -ne 0 ]]; then
+	pass "duplicate name correctly rejected"
+else
+	fail "duplicate name was not rejected"
+fi
+
+# Test: pool capacity limit
+# CONTAINER_POOL_MAX is readonly, so the limit cannot be lowered from here.
+# Assert instead that the number of active containers never exceeds it.
+pool_count=$(test_db "SELECT COUNT(*) FROM container_pool WHERE status NOT IN ('stopped','failed');")
+verbose "Current pool count: $pool_count"
+if [[ "$pool_count" =~ ^[0-9]+$ && "$pool_count" -le "$CONTAINER_POOL_MAX" ]]; then
+	pass "active pool size ($pool_count) within CONTAINER_POOL_MAX ($CONTAINER_POOL_MAX)"
+else
+	fail "active pool size exceeds CONTAINER_POOL_MAX" "count='$pool_count' max='$CONTAINER_POOL_MAX'"
+fi
+
+# =============================================================================
+section "Health Checks"
+
+# Test: health check on healthy container
+health_rc=0
+pool_health_check_one "$spawn_result" 2>/dev/null || health_rc=$?
+if [[ "$health_rc" -eq 0 ]]; then
+	pass "health check passes for healthy container"
+else
+	fail "health check failed for healthy container (rc=$health_rc)"
+fi
+
+# Verify last_health_check was updated
+last_hc=$(test_db "SELECT last_health_check FROM container_pool WHERE id='$spawn_result';")
+if [[ -n "$last_hc" ]]; then
+	pass "last_health_check timestamp updated"
+else
+	fail "last_health_check not updated"
+fi
+
+# Test: health check all
+# Guarded so a non-zero exit is reported as a FAIL below instead of aborting
+# the whole run (set -e) before the summary is printed.
+healthy_count=$(pool_health_check_all 2>/dev/null) || true
+if [[ "${healthy_count:-0}" -ge 1 ]]; then
+	pass "pool_health_check_all reports $healthy_count healthy containers"
+else
+	fail "pool_health_check_all reports 0 healthy"
+fi
+
+# =============================================================================
+section "Round-Robin Selection"
+
+# Test: select container (should pick the one with oldest/null last_dispatch_at)
+selected=$(pool_select_container 2>/dev/null) || true
+if [[ -n "$selected" ]]; then
+	pass "pool_select_container returns a container ($selected)"
+else
+	fail "pool_select_container returned empty"
+fi
+
+# Test: round-robin ordering — dispatch to first, then select should pick second
+# Guarded: a failure here is reported by the assertions below rather than
+# aborting the run under set -e with stderr already discarded.
+pool_record_dispatch "$spawn_result" "test-task-1" 2>/dev/null || true
+selected2=$(pool_select_container 2>/dev/null) || true
+# Containers other than the one just dispatched to that are still healthy.
+# Re-selecting the same container is only acceptable when there are none.
+other_healthy=$(test_db "SELECT COUNT(*) FROM container_pool WHERE status='healthy' AND id != '$spawn_result';") || other_healthy=0
+if [[ -z "$selected2" ]]; then
+	# An empty selection must not count as "a different container"
+	fail "pool_select_container returned empty after dispatch"
+elif [[ "$selected2" != "$spawn_result" ]]; then
+	pass "round-robin selects different container after dispatch ($selected2)"
+elif [[ "${other_healthy:-0}" -ge 1 ]]; then
+	fail "round-robin re-selected the container that was just dispatched to" \
+		"selected: $selected2, other healthy containers: $other_healthy"
+else
+	# Only one healthy container exists, so picking it again is the only option
+	pass "round-robin selection consistent ($selected2)"
+fi
+
+# Test: dispatch count incremented
+dispatch_count=$(test_db "SELECT dispatch_count FROM container_pool WHERE id='$spawn_result';")
+if [[ "$dispatch_count" -ge 1 ]]; then
+	pass "dispatch_count incremented to $dispatch_count"
+else
+	fail "dispatch_count not incremented: $dispatch_count"
+fi
+
+# Test: dispatch log entry created
+log_count=$(test_db "SELECT COUNT(*) FROM container_dispatch_log WHERE container_id='$spawn_result' AND task_id='test-task-1';")
+if [[ "$log_count" -ge 1 ]]; then
+	pass "container_dispatch_log entry created"
+else
+	fail "container_dispatch_log entry missing"
+fi
+
+# =============================================================================
+section "Dispatch Completion Recording"
+
+# Test: record completion
+# Guarded: the completed_at/outcome assertions below report any failure.
+pool_record_completion "$spawn_result" "test-task-1" "complete" 2>/dev/null || true
+completed_at=$(test_db "SELECT completed_at FROM container_dispatch_log WHERE container_id='$spawn_result' AND task_id='test-task-1';")
+if [[ -n "$completed_at" ]]; then
+	pass "completion recorded with timestamp"
+else
+	fail "completion not recorded"
+fi
+
+outcome=$(test_db "SELECT outcome FROM container_dispatch_log WHERE container_id='$spawn_result' AND task_id='test-task-1';")
+if [[ "$outcome" == "complete" ]]; then
+	pass "outcome stored correctly"
+else
+	fail "outcome mismatch: $outcome"
+fi
+
+# =============================================================================
+section "Per-Container Rate Limit Tracking"
+
+# Test: mark rate limited (60 is the cooldown argument passed to the helper)
+# Guarded: the status assertion that follows reports any failure.
+pool_mark_rate_limited "$spawn_result" 60 2>/dev/null || true
+rl_status=$(test_db "SELECT status FROM container_pool WHERE id='$spawn_result';")
+if [[ "$rl_status" == "rate_limited" ]]; then
+	pass "container marked as rate_limited"
+else
+	fail "container status is '$rl_status' (expected 'rate_limited')"
+fi
+
+rl_until=$(test_db "SELECT rate_limit_until FROM container_pool WHERE id='$spawn_result';")
+if [[ -n "$rl_until" ]]; then
+	pass "rate_limit_until timestamp set ($rl_until)"
+else
+	fail "rate_limit_until not set"
+fi
+
+rl_count=$(test_db "SELECT rate_limit_count FROM container_pool WHERE id='$spawn_result';")
+if [[ "$rl_count" -ge 1 ]]; then
+	pass "rate_limit_count incremented to $rl_count"
+else
+	fail "rate_limit_count not incremented"
+fi
+
+# Test: rate-limited container excluded from selection
+selected_after_rl=$(pool_select_container 2>/dev/null) || true
+if [[ "$selected_after_rl" != "$spawn_result" ]]; then
+	pass "rate-limited container excluded from round-robin selection"
+else
+	fail "rate-limited container was selected (should be excluded)"
+fi
+
+# Test: clear rate limit
+# Guarded: the status assertion below reports a failed clear; without the guard
+# set -e would abort the run silently (stderr is discarded).
+pool_clear_rate_limit "$spawn_result" 2>/dev/null || true
+cleared_status=$(test_db "SELECT status FROM container_pool WHERE id='$spawn_result';")
+if [[ "$cleared_status" == "healthy" ]]; then
+	pass "rate limit cleared — status back to healthy"
+else
+	fail "rate limit clear failed — status is '$cleared_status'"
+fi
+
+cleared_rl=$(test_db "SELECT rate_limit_until FROM container_pool WHERE id='$spawn_result';")
+if [[ -z "$cleared_rl" ]]; then
+	pass "rate_limit_until cleared to NULL"
+else
+	fail "rate_limit_until not cleared: $cleared_rl"
+fi
+
+# Test: rate_limited outcome triggers auto-rate-limit
+# Guarded: the status assertion below reports any failure of these two calls.
+pool_record_dispatch "$spawn2_result" "test-task-rl" 2>/dev/null || true
+pool_record_completion "$spawn2_result" "test-task-rl" "rate_limited" 2>/dev/null || true
+rl2_status=$(test_db "SELECT status FROM container_pool WHERE id='$spawn2_result';")
+if [[ "$rl2_status" == "rate_limited" ]]; then
+	pass "rate_limited outcome auto-triggers container rate limit"
+else
+	fail "rate_limited outcome did not trigger rate limit (status: $rl2_status)"
+fi
+
+# Clean up for next tests (best effort; later sections re-check container state)
+pool_clear_rate_limit "$spawn2_result" 2>/dev/null || true
+
+# =============================================================================
+section "Pool Stats"
+
+# Here-strings are used instead of `echo ... | grep -q`: under pipefail the
+# pipeline form can fail spuriously when grep -q exits before echo finishes.
+stats_output=$(pool_stats 2>/dev/null) || true
+if grep -q '"total"' <<<"$stats_output"; then
+	pass "pool_stats returns JSON with total field"
+else
+	fail "pool_stats output missing total field"
+fi
+
+if grep -q '"healthy"' <<<"$stats_output"; then
+	pass "pool_stats includes healthy count"
+else
+	fail "pool_stats missing healthy count"
+fi
+
+if grep -q '"total_dispatches"' <<<"$stats_output"; then
+	pass "pool_stats includes total_dispatches"
+else
+	fail "pool_stats missing total_dispatches"
+fi
+
+# =============================================================================
+section "Pool List"
+
+list_output=$(pool_list 2>/dev/null) || true
+if grep -q "Container Pool" <<<"$list_output"; then
+	pass "pool_list shows header"
+else
+	fail "pool_list missing header"
+fi
+
+if grep -q "Total:" <<<"$list_output"; then
+	pass "pool_list shows summary line"
+else
+	fail "pool_list missing summary"
+fi
+
+# JSON format
+# JSON output may be empty if sqlite3 .mode json is not supported: that case is
+# reported as SKIP, a non-zero exit as FAIL, so the check can no longer pass
+# unconditionally.
+json_rc=0
+json_output=$(pool_list --format json 2>/dev/null) || json_rc=$?
+verbose "JSON output length: ${#json_output}"
+if [[ "$json_rc" -ne 0 ]]; then
+	fail "pool_list --format json failed (rc=$json_rc)"
+elif [[ -n "$json_output" ]]; then
+	pass "pool_list --format json produces output (or graceful empty)"
+else
+	skip "pool_list --format json produced no output (sqlite3 JSON mode may be unsupported)"
+fi
+
+# =============================================================================
+section "Idle Container Cleanup"
+
+# Mark both containers with old dispatch times to simulate idle
+test_db "UPDATE container_pool SET last_dispatch_at = strftime('%Y-%m-%dT%H:%M:%SZ','now','-3600 seconds') WHERE status = 'healthy';"
+
+# With CONTAINER_POOL_MIN=0 (default), idle cleanup should destroy idle containers
+# But since we can't override readonly, test the dry-run path
+# The dry run must exit 0 and print a non-negative integer. The previous
+# `-ge 0` comparison was also true for empty output, so it could never fail.
+idle_rc=0
+idle_count=$(pool_destroy_idle --dry-run 2>/dev/null) || idle_rc=$?
+verbose "Idle cleanup dry-run would destroy: $idle_count"
+if [[ "$idle_rc" -eq 0 && "$idle_count" =~ ^[0-9]+$ ]]; then
+	pass "pool_destroy_idle --dry-run returns count ($idle_count)"
+else
+	fail "pool_destroy_idle --dry-run failed" "rc=$idle_rc output='$idle_count'"
+fi
+
+# =============================================================================
+section "Pool Destroy"
+
+# Test: destroy with active dispatch guard
+# Guarded: if recording fails, the guard assertion below reports it.
+pool_record_dispatch "$spawn_result" "test-task-active" 2>/dev/null || true
+destroy_guarded_rc=0
+pool_destroy "$spawn_result" 2>/dev/null || destroy_guarded_rc=$?
+if [[ "$destroy_guarded_rc" -ne 0 ]]; then
+	pass "destroy blocked by active dispatch guard"
+else
+	fail "destroy should have been blocked by active dispatch"
+fi
+
+# Complete the active dispatch (best effort; --force is used next regardless)
+pool_record_completion "$spawn_result" "test-task-active" "complete" 2>/dev/null || true
+
+# Test: destroy with --force
+# The dispatch recorded above has already been completed at this point.
+destroy_force_rc=0
+pool_destroy "$spawn_result" --force 2>/dev/null || destroy_force_rc=$?
+if [[ "$destroy_force_rc" -eq 0 ]]; then
+	pass "pool_destroy --force succeeds"
+else
+	fail "pool_destroy --force failed (rc=$destroy_force_rc)"
+fi
+
+destroyed_status=$(test_db "SELECT status FROM container_pool WHERE id='$spawn_result';")
+if [[ "$destroyed_status" == "stopped" ]]; then
+	pass "destroyed container status is 'stopped'"
+else
+	fail "destroyed container status is '$destroyed_status' (expected 'stopped')"
+fi
+
+# Test: destroy by name
+destroy_name_rc=0
+pool_destroy "my-worker" 2>/dev/null || destroy_name_rc=$?
+if [[ "$destroy_name_rc" -eq 0 ]]; then
+	pass "pool_destroy by name succeeds"
+else
+	fail "pool_destroy by name failed (rc=$destroy_name_rc)"
+fi
+
+# Test: destroy non-existent container
+destroy_missing_rc=0
+pool_destroy "nonexistent" 2>/dev/null || destroy_missing_rc=$?
+if [[ "$destroy_missing_rc" -ne 0 ]]; then
+	pass "destroy non-existent container correctly fails"
+else
+	fail "destroy non-existent container should have failed"
+fi
+
+# =============================================================================
+section "CLI Command Router"
+
+# Test: cmd_pool routes correctly
+# Guarded so a non-zero exit becomes a FAIL below instead of aborting the run.
+# Here-strings replace `echo ... | grep -q`, which can fail spuriously under
+# pipefail when grep -q exits before echo has finished writing.
+pool_help_output=$(cmd_pool help 2>/dev/null) || true
+if grep -q "spawn" <<<"$pool_help_output"; then
+	pass "cmd_pool help shows spawn subcommand"
+else
+	fail "cmd_pool help missing spawn"
+fi
+
+if grep -q "round-robin" <<<"$pool_help_output"; then
+	pass "cmd_pool help mentions round-robin"
+else
+	# Help text may not mention round-robin explicitly. Report that as SKIP
+	# rather than PASS so the result reflects what was actually verified.
+	skip "cmd_pool help does not mention round-robin (optional wording)"
+fi
+
+pool_status_output=$(cmd_pool status 2>/dev/null) || true
+if grep -q '"total"' <<<"$pool_status_output"; then
+	pass "cmd_pool status returns JSON stats"
+else
+	fail "cmd_pool status did not return JSON"
+fi
+
+# =============================================================================
+section "Docker Invocation Verification"
+
+# Verify mock docker was called with correct args.
+# The mock writes one "MOCK_DOCKER_INVOKED: <args>" line per call.
+if [[ -f "$MOCK_DOCKER_LOG" ]]; then
+	if grep -qF "MOCK_DOCKER_INVOKED: run -d" "$MOCK_DOCKER_LOG"; then
+		pass "docker run invoked with -d flag"
+	else
+		fail "docker run not invoked correctly"
+	fi
+
+	if grep -qF "aidevops.pool=true" "$MOCK_DOCKER_LOG"; then
+		pass "docker run includes pool label"
+	else
+		fail "docker run missing pool label"
+	fi
+
+	# Match the stop subcommand itself: a bare "stop" would also match any
+	# other logged argument that merely contains the word.
+	if grep -qF "MOCK_DOCKER_INVOKED: stop" "$MOCK_DOCKER_LOG"; then
+		pass "docker stop invoked during destroy"
+	else
+		fail "docker stop not invoked during destroy"
+	fi
+else
+	skip "Mock docker log not found"
+fi
+
+# =============================================================================
+# Summary
+# =============================================================================
+
+echo ""
+echo "========================================"
+printf "Results: \033[0;32m%d passed\033[0m" "$PASS_COUNT"
+if [[ "$FAIL_COUNT" -gt 0 ]]; then
+	printf ", \033[0;31m%d failed\033[0m" "$FAIL_COUNT"
+fi
+if [[ "$SKIP_COUNT" -gt 0 ]]; then
+	printf ", \033[0;33m%d skipped\033[0m" "$SKIP_COUNT"
+fi
+printf " (%d total)\n" "$TOTAL_COUNT"
+echo "========================================"
+
+# Exit status: 1 when any check failed, 0 otherwise
+if [[ "$FAIL_COUNT" -gt 0 ]]; then
+	exit 1
+fi
+exit 0