diff --git a/.agents/scripts/commands/pulse.md b/.agents/scripts/commands/pulse.md index caf3f4bff..f038da9db 100644 --- a/.agents/scripts/commands/pulse.md +++ b/.agents/scripts/commands/pulse.md @@ -371,6 +371,8 @@ gh pr close --repo \ Check `ps axo pid,etime,command | grep '/full-loop' | grep '\.opencode'`. Any worker running 3+ hours with no open PR is likely stuck. Kill it: `kill <PID>`. Comment on the issue with the full audit-quality fields (model, branch, reason, diagnosis, next action — see "Audit-quality state in issue and PR comments" below). This frees a slot. If the worker has recent commits or an open PR with activity, leave it alone — it's making progress. +Before killing a worker for thrash, read the latest worker transcript/log tail and attempt one targeted coaching intervention unless the worker is clearly hard-stuck (for example: repeated identical fatal error, no commits for many hours, or provider backoff exhaustion). Coaching intervention means: post a concise issue comment with the exact blocker pattern, then re-dispatch with a narrower acceptance target and explicit checkpoint deadline. If that coached retry still fails to produce a checkpoint, then kill/requeue and comment why completion was not possible. + ### Struggle-ratio check (t1367) The "Active Workers" section in the pre-fetched state includes a `struggle_ratio` for each worker that has a worktree. This metric is `messages / max(1, commits)` — a high ratio means the worker is sending many messages but producing few commits (thrashing). @@ -554,6 +556,8 @@ gh issue edit --repo --add-assignee "$RUNNER_USER" --add-label " sleep 2 ``` +9. **Fill-to-cap post-condition (t1449):** before ending the pulse cycle, compare active workers vs `MAX_WORKERS`. If below cap and runnable scoped issues/PR work exists in any repo class, continue dispatching until cap is reached or no runnable candidates remain. Do not leave slots idle because of class reservations when one class is PR-capped or empty. 
+ ### Candidate discovery baseline (t1443 + t1448) Do NOT treat `auto-dispatch` or `status:available` as hard gates. They are hints only. @@ -585,6 +589,7 @@ If you dispatch an unassigned issue without `auto-dispatch`/`status:available`, - Use `--dir <path>` from repos.json - Route non-code tasks with `--agent`: SEO, Content, Marketing, Business, Research (see AGENTS.md "Agent Routing") - If a dispatched worker later looks stalled, `worker-watchdog.sh` now inspects the recent OpenCode transcript tail before killing it, includes that diagnostic evidence in the retry trail, and gives provider-wait evidence one extra timeout window before re-queueing the issue. +- Product/tooling reservations are soft optimization targets. When product repos are at daily PR cap (or otherwise non-dispatchable), immediately reallocate those slots to tooling/system work. - **Bundle-aware agent routing (t1364.6):** Before dispatching, check if the target repo has a bundle with `agent_routing` overrides. Run `bundle-helper.sh get agent_routing <repo-path>` — if the task domain (code, seo, content, marketing) has a non-default agent, use `--agent <agent-name>`. Example: a content-site bundle routes `marketing` tasks to the Marketing agent instead of Build+. Explicit `--agent` flags in the issue body always override bundle defaults. - **Scope boundary (t1405, GH#2928):** ONLY dispatch workers for repos in the pre-fetched state (i.e., repos with `pulse: true` in repos.json). The `PULSE_SCOPE_REPOS` env var (set by `pulse-wrapper.sh`) contains the comma-separated list of in-scope repo slugs. Workers inherit this env var and use it to restrict code changes (branches, PRs) to scoped repos. Workers CAN still file issues on any repo (cross-repo self-improvement), but the pulse must NEVER dispatch a worker to implement a fix on a repo outside this scope — even if an issue exists there. Issues on non-pulse repos enter that repo's queue for their own maintainers to handle. 
- **Lineage context for subtasks (t1408.3):** When dispatching a subtask (task ID contains a dot, e.g., `t1408.3`), include a lineage context block in the dispatch prompt. This tells the worker what the parent task is, what sibling tasks exist, and to focus only on its specific scope. See `tools/ai-assistants/headless-dispatch.md` "Lineage Context for Subtask Workers" for the full format and assembly instructions. Example dispatch with lineage: diff --git a/.agents/scripts/pulse-wrapper.sh b/.agents/scripts/pulse-wrapper.sh index 392a7cbad..715d74883 100755 --- a/.agents/scripts/pulse-wrapper.sh +++ b/.agents/scripts/pulse-wrapper.sh @@ -73,7 +73,7 @@ fi # Configuration ####################################### PULSE_STALE_THRESHOLD="${PULSE_STALE_THRESHOLD:-3600}" # 60 min hard ceiling (raised from 30 min — GH#2958) -PULSE_IDLE_TIMEOUT="${PULSE_IDLE_TIMEOUT:-300}" # 5 min idle = process completed, sitting in file watcher (t1398.3) +PULSE_IDLE_TIMEOUT="${PULSE_IDLE_TIMEOUT:-600}" # 10 min idle before kill (reduces false positives during active triage) PULSE_IDLE_CPU_THRESHOLD="${PULSE_IDLE_CPU_THRESHOLD:-5}" # CPU% below this = idle (0-100 scale) PULSE_PROGRESS_TIMEOUT="${PULSE_PROGRESS_TIMEOUT:-600}" # 10 min no log output = stuck (GH#2958) ORPHAN_MAX_AGE="${ORPHAN_MAX_AGE:-7200}" # 2 hours — kill orphans older than this @@ -945,23 +945,25 @@ _append_priority_allocations() { fi # Read allocation values - local max_workers product_repos tooling_repos product_min tooling_max reservation_pct quality_debt_cap_pct + local max_workers product_repos tooling_repos dispatchable_product_repos product_min tooling_max reservation_pct quality_debt_cap_pct max_workers=$(grep '^MAX_WORKERS=' "$alloc_file" | cut -d= -f2) || max_workers=4 product_repos=$(grep '^PRODUCT_REPOS=' "$alloc_file" | cut -d= -f2) || product_repos=0 tooling_repos=$(grep '^TOOLING_REPOS=' "$alloc_file" | cut -d= -f2) || tooling_repos=0 + dispatchable_product_repos=$(grep '^DISPATCHABLE_PRODUCT_REPOS=' 
"$alloc_file" | cut -d= -f2) || dispatchable_product_repos="$product_repos" product_min=$(grep '^PRODUCT_MIN=' "$alloc_file" | cut -d= -f2) || product_min=0 tooling_max=$(grep '^TOOLING_MAX=' "$alloc_file" | cut -d= -f2) || tooling_max=0 reservation_pct=$(grep '^PRODUCT_RESERVATION_PCT=' "$alloc_file" | cut -d= -f2) || reservation_pct=60 quality_debt_cap_pct=$(grep '^QUALITY_DEBT_CAP_PCT=' "$alloc_file" | cut -d= -f2) || quality_debt_cap_pct=30 echo "Worker pool: **${max_workers}** total slots" - echo "Product repos (${product_repos}): **${product_min}** reserved slots (${reservation_pct}% minimum)" + echo "Product repos (${product_repos}, dispatchable now: ${dispatchable_product_repos}): **${product_min}** reserved slots (${reservation_pct}% target minimum)" echo "Tooling repos (${tooling_repos}): **${tooling_max}** slots (remainder)" echo "Quality-debt cap: **${quality_debt_cap_pct}%** of worker pool" echo "" echo "**Enforcement rules:**" - echo "- Before dispatching a tooling-repo worker, check: are product-repo workers using fewer than ${product_min} slots? If yes, the remaining product slots are reserved — do NOT fill them with tooling work." - echo "- If product repos have no pending work (no open issues, no failing PRs), their reserved slots become available for tooling." + echo "- Reservations are soft targets, not hard gates. If one class has no dispatchable candidates, immediately reassign its unused slots to the other class." + echo "- Product repos at daily PR cap are treated as temporarily non-dispatchable for reservation purposes." + echo "- Do not leave slots idle when runnable scoped work exists in any class." echo "- If all ${max_workers} slots are needed for product work, tooling gets 0 (product reservation is a minimum, not a maximum)." echo "- Merges (priority 1) and CI fixes (priority 2) are exempt — they always proceed regardless of class." 
echo "" @@ -1882,11 +1884,34 @@ calculate_priority_allocations() { [[ "$product_repos" =~ ^[0-9]+$ ]] || product_repos=0 [[ "$tooling_repos" =~ ^[0-9]+$ ]] || tooling_repos=0 + # Count product repos that can actually dispatch now (not blocked by daily PR cap) + local dispatchable_product_repos today_utc + dispatchable_product_repos=0 + today_utc=$(date -u +%Y-%m-%d) + if [[ "$product_repos" -gt 0 && "$DAILY_PR_CAP" -gt 0 ]]; then + while IFS= read -r slug; do + [[ -n "$slug" ]] || continue + local pr_json daily_pr_count + pr_json=$(gh pr list --repo "$slug" --state open --json createdAt --limit 100 2>/dev/null) || pr_json="[]" + daily_pr_count=$(echo "$pr_json" | jq --arg today "$today_utc" '[.[] | select(.createdAt | startswith($today))] | length' 2>/dev/null) || daily_pr_count=0 + [[ "$daily_pr_count" =~ ^[0-9]+$ ]] || daily_pr_count=0 + if [[ "$daily_pr_count" -lt "$DAILY_PR_CAP" ]]; then + dispatchable_product_repos=$((dispatchable_product_repos + 1)) + fi + done < <(jq -r '.initialized_repos[] | select(.pulse == true and (.local_only // false) == false and .slug != "" and .priority == "product") | .slug' "$repos_json" 2>/dev/null) + else + dispatchable_product_repos="$product_repos" + fi + [[ "$dispatchable_product_repos" =~ ^[0-9]+$ ]] || dispatchable_product_repos="$product_repos" + if [[ "$dispatchable_product_repos" -lt "$product_repos" ]]; then + echo "[pulse-wrapper] Product dispatchability reduced by daily PR caps: ${dispatchable_product_repos}/${product_repos} repos can accept new workers" >>"$LOGFILE" + fi + # Calculate reservations # product_min = ceil(max_workers * PRODUCT_RESERVATION_PCT / 100) # Using integer arithmetic: ceil(a/b) = (a + b - 1) / b local product_min tooling_max - if [[ "$product_repos" -eq 0 ]]; then + if [[ "$dispatchable_product_repos" -eq 0 ]]; then # No product repos — all slots available for tooling product_min=0 tooling_max="$max_workers" @@ -1914,6 +1939,7 @@ calculate_priority_allocations() { echo 
"MAX_WORKERS=${max_workers}" echo "PRODUCT_REPOS=${product_repos}" echo "TOOLING_REPOS=${tooling_repos}" + echo "DISPATCHABLE_PRODUCT_REPOS=${dispatchable_product_repos}" echo "PRODUCT_MIN=${product_min}" echo "TOOLING_MAX=${tooling_max}" echo "PRODUCT_RESERVATION_PCT=${PRODUCT_RESERVATION_PCT}"