diff --git a/.agents/scripts/contributor-activity-helper.sh b/.agents/scripts/contributor-activity-helper.sh index 1091514af..18951af23 100755 --- a/.agents/scripts/contributor-activity-helper.sh +++ b/.agents/scripts/contributor-activity-helper.sh @@ -1023,25 +1023,23 @@ print(','.join(sorted(logins))) # Query GitHub Search API for each login. # Uses gh api with search/issues endpoint — returns total_count without pagination. # Rate limit: 30 requests/min for search API. With 4 queries per user, - # we can handle ~7 users per minute. For larger teams, the function - # checks remaining rate limit and sleeps until reset if needed. + # we can handle ~7 users per minute. If budget is exhausted, bail out + # with partial results instead of blocking (t1429). local results_json="[" local first=true + local _ps_partial=false local IFS=',' for login in $logins_csv; do # Check search API rate limit before each batch of 4 queries per user local remaining remaining=$(gh api rate_limit --jq '.resources.search.remaining' 2>/dev/null) || remaining=30 if [[ "$remaining" -lt 5 ]]; then - local reset_at - reset_at=$(gh api rate_limit --jq '.resources.search.reset' 2>/dev/null) || reset_at=0 - local now_epoch - now_epoch=$(date +%s) - local wait_secs=$((reset_at - now_epoch + 1)) - if [[ "$wait_secs" -gt 0 && "$wait_secs" -lt 120 ]]; then - echo "Rate limit low (${remaining} remaining), waiting ${wait_secs}s..." >&2 - sleep "$wait_secs" - fi + # t1429: bail out with partial results instead of sleeping. + # The old code slept until reset, creating an infinite blocking + # loop when multiple users × repos exhausted the 30 req/min budget. 
+ echo "Rate limit exhausted (${remaining} remaining), returning partial results" >&2 + _ps_partial=true + break fi # Issues created by this user in this repo since the date @@ -1070,13 +1068,14 @@ print(','.join(sorted(logins))) unset IFS results_json+="]" - # Format output + # Format output (pass partial flag so callers can detect truncated data) echo "$results_json" | python3 -c " import sys import json format_type = sys.argv[1] period_name = sys.argv[2] +is_partial = sys.argv[3] == 'true' data = json.load(sys.stdin) @@ -1086,7 +1085,8 @@ for d in data: data.sort(key=lambda x: x['total_output'], reverse=True) if format_type == 'json': - print(json.dumps(data, indent=2)) + result = {'data': data, 'partial': is_partial} + print(json.dumps(result, indent=2)) else: if not data: print(f'_No GitHub activity for the last {period_name}._') @@ -1097,7 +1097,10 @@ else: for d in data: pct = round(d['total_output'] / grand_total * 100, 1) print(f'| {d[\"login\"]} | {d[\"issues_created\"]} | {d[\"prs_created\"]} | {d[\"prs_merged\"]} | {d[\"commented_on\"]} | {pct}% |') -" "$format" "$period" + if is_partial: + print() + print('_Partial results — GitHub Search API rate limit exhausted._') +" "$format" "$period" "$_ps_partial" return 0 } diff --git a/.agents/scripts/pulse-wrapper.sh b/.agents/scripts/pulse-wrapper.sh index cd658a77b..9e25361c9 100755 --- a/.agents/scripts/pulse-wrapper.sh +++ b/.agents/scripts/pulse-wrapper.sh @@ -3259,19 +3259,18 @@ prefetch_contribution_watch() { ####################################### # Main # -# Execution order (GH#2958): +# Execution order (t1429): # 1. Gate checks (consent, dedup) -# 2. Cleanup (orphans, worktrees) -# 3. Pre-pulse housekeeping (quality sweep, health issues) — these are -# shell-level operations that run quickly and don't need the LLM. -# Running them BEFORE the pulse ensures the LLM session gets maximum -# time for its actual job (triage, dispatch, PR review). -# 4. Prefetch state (parallel gh API calls) -# 5. 
Run pulse (LLM session — the main event) -# -# Previously, quality sweep and health issues ran AFTER the pulse. This -# meant the pulse's 30-min timeout was shared with these operations, -# and the LLM session was killed before completing its work. +# 2. Cleanup (orphans, worktrees, stashes) +# 3. Prefetch state (parallel gh API calls) +# 4. Run pulse (LLM session — dispatch workers, merge PRs) +# +# Statistics (quality sweep, health issues, person-stats) run in a +# SEPARATE process — stats-wrapper.sh — on its own cron schedule. +# They must never share a process with the pulse because they depend +# on GitHub Search API (30 req/min limit). When budget is exhausted, +# contributor-activity-helper.sh bails out with partial results, but +# even the API calls themselves add latency that delays dispatch. ####################################### main() { if ! check_session_gate; then @@ -3293,11 +3292,6 @@ main() { calculate_priority_allocations check_session_count >/dev/null - # Run housekeeping BEFORE the pulse — these are shell-level operations - # that don't need the LLM and shouldn't eat into pulse time (GH#2958). - run_daily_quality_sweep - update_health_issues - # Contribution watch: lightweight scan of external issues/PRs (t1419). # Deterministic — only checks timestamps/authorship, never processes # comment bodies. Output appended to STATE_FILE for the pulse agent. diff --git a/.agents/scripts/stats-wrapper.sh b/.agents/scripts/stats-wrapper.sh new file mode 100755 index 000000000..934378dd7 --- /dev/null +++ b/.agents/scripts/stats-wrapper.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +# stats-wrapper.sh - Separate process for statistics and health updates +# +# Runs quality sweep, health issue updates, and person-stats independently +# of the supervisor pulse. These operations depend on GitHub Search API +# (30 req/min limit) and can block for extended periods when rate-limited. 
#######################################
# PID-based dedup — same pattern as pulse-wrapper check_dedup()
#
# Decides whether this invocation may proceed, based on the PID file left
# behind by a previous run. PID file format: "PID EPOCH" (PID + start
# timestamp, both positive decimal integers).
#
# Globals:
#   STATS_PIDFILE  read; removed when it is invalid, dead, or stale
#   STATS_TIMEOUT  read; max age (seconds) before a live run is killed
#   STATS_LOGFILE  appended to
# Returns:
#   0  no other run is active (or a stale one was killed) — caller proceeds
#   1  a run younger than STATS_TIMEOUT is still alive — caller should skip
#######################################
check_stats_dedup() {
	[[ -f "$STATS_PIDFILE" ]] || return 0

	local old_pid="" old_epoch=""
	# `read` returns non-zero at EOF when the file lacks a trailing newline
	# even though the fields were populated, so its status is ignored and the
	# fields are validated instead. The brace group makes 2>/dev/null cover a
	# failed open of the file as well. `_` swallows any trailing junk.
	{ read -r old_pid old_epoch _ <"$STATS_PIDFILE"; } 2>/dev/null || true

	# Both fields must be positive decimal integers without leading zeros.
	# Anything else (empty file, legacy "PID only" file, corruption) gives us
	# no trustworthy age, so the file is discarded WITHOUT killing anything:
	# defaulting the epoch to 0 would make every live PID look decades old.
	# The check also keeps raw file content out of $(( )), where a bare word
	# is dereferenced as a variable and a leading 0 is parsed as octal.
	if [[ ! "$old_pid" =~ ^[1-9][0-9]*$ || ! "$old_epoch" =~ ^[1-9][0-9]*$ ]]; then
		echo "[stats-wrapper] Ignoring invalid PID file (pid='${old_pid}', epoch='${old_epoch}')" >>"$STATS_LOGFILE"
		rm -f "$STATS_PIDFILE"
		return 0
	fi

	# Liveness: builtin kill -0 first (no fork, no dependency on ps); fall
	# back to ps -p because kill -0 also fails for a live process owned by
	# another user.
	if ! kill -0 "$old_pid" 2>/dev/null && ! ps -p "$old_pid" >/dev/null 2>&1; then
		rm -f "$STATS_PIDFILE"
		return 0
	fi

	# Check age using stored epoch (portable — no date -d / _get_process_age)
	# NOTE(review): a live PID is assumed to belong to a previous
	# stats-wrapper run; PID reuse after an unclean exit is not detected here.
	local now
	now=$(date +%s)
	local elapsed=$((now - old_epoch))

	if [[ "$elapsed" -gt "$STATS_TIMEOUT" ]]; then
		echo "[stats-wrapper] Killing stale stats process $old_pid (${elapsed}s)" >>"$STATS_LOGFILE"
		_kill_tree "$old_pid" || true
		# Give the tree a moment to exit before escalating.
		sleep 2
		if kill -0 "$old_pid" 2>/dev/null; then
			_force_kill_tree "$old_pid" || true
		fi
		rm -f "$STATS_PIDFILE"
		return 0
	fi

	echo "[stats-wrapper] Stats already running (PID $old_pid, ${elapsed}s). Skipping." >>"$STATS_LOGFILE"
	return 1
}
_stats_is_sourced; then + main "$@" +fi diff --git a/setup.sh b/setup.sh index 4fda96838..2300ef238 100755 --- a/setup.sh +++ b/setup.sh @@ -990,6 +990,91 @@ PLIST fi fi + # Enable stats-wrapper — runs quality sweep and health issue updates + # separately from the pulse (t1429). Only installed when the supervisor + # pulse is enabled (stats are useless without it). + local stats_script="$HOME/.aidevops/agents/scripts/stats-wrapper.sh" + local stats_label="com.aidevops.aidevops-stats-wrapper" + if [[ -x "$stats_script" ]] && [[ "$_pulse_lower" == "true" ]]; then + local _stats_installed=false + if _launchd_has_agent "$stats_label"; then + _stats_installed=true + elif crontab -l 2>/dev/null | grep -qF "aidevops: stats-wrapper"; then + _stats_installed=true + fi + if [[ "$_stats_installed" == "false" ]]; then + if [[ "$(uname -s)" == "Darwin" ]]; then + local stats_plist="$HOME/Library/LaunchAgents/${stats_label}.plist" + local _xml_stats_script _xml_stats_home _xml_stats_path + _xml_stats_script=$(_xml_escape "$stats_script") + _xml_stats_home=$(_xml_escape "$HOME") + _xml_stats_path=$(_xml_escape "$PATH") + cat >"$stats_plist" < + + + + Label + ${stats_label} + ProgramArguments + + /bin/bash + ${_xml_stats_script} + + StartInterval + 900 + StandardOutPath + ${_xml_stats_home}/.aidevops/logs/stats.log + StandardErrorPath + ${_xml_stats_home}/.aidevops/logs/stats.log + EnvironmentVariables + + PATH + ${_xml_stats_path} + HOME + ${_xml_stats_home} + + RunAtLoad + + KeepAlive + + + +PLIST + if launchctl load "$stats_plist"; then + print_info "Stats wrapper enabled (launchd, every 15 min)" + else + print_warning "Failed to load stats wrapper LaunchAgent" + fi + else + local _cron_stats_script + _cron_stats_script=$(_cron_escape "$stats_script") + ( + crontab -l 2>/dev/null | grep -v 'aidevops: stats-wrapper' + echo "*/15 * * * * PATH=\"/usr/local/bin:/usr/bin:/bin\" /bin/bash ${_cron_stats_script} >> \"\$HOME/.aidevops/logs/stats.log\" 2>&1 # aidevops: stats-wrapper" + ) 
| crontab - || true + if crontab -l 2>/dev/null | grep -qF "aidevops: stats-wrapper"; then + print_info "Stats wrapper enabled (cron, every 15 min)" + fi + fi + fi + elif [[ "$_pulse_lower" == "false" ]]; then + # Remove stats scheduler if pulse is disabled + if [[ "$(uname -s)" == "Darwin" ]]; then + local stats_plist="$HOME/Library/LaunchAgents/${stats_label}.plist" + if _launchd_has_agent "$stats_label"; then + launchctl unload "$stats_plist" || true + rm -f "$stats_plist" + print_info "Stats wrapper disabled (launchd agent removed — pulse is off)" + fi + else + if crontab -l 2>/dev/null | grep -qF "aidevops: stats-wrapper"; then + crontab -l 2>/dev/null | grep -v 'aidevops: stats-wrapper' | crontab - || true + print_info "Stats wrapper disabled (cron entry removed — pulse is off)" + fi + fi + fi + # Enable repo-sync scheduler if not already installed # Keeps local git repos up to date with daily ff-only pulls # Respects config: aidevops config set orchestration.repo_sync false diff --git a/todo/tasks/t1429-brief.md b/todo/tasks/t1429-brief.md new file mode 100644 index 000000000..0b35451c1 --- /dev/null +++ b/todo/tasks/t1429-brief.md @@ -0,0 +1,82 @@ +# t1429: Separate stats from pulse — prevent rate-limit blocking + +## Origin + +- **Created:** 2026-03-10 +- **Session:** opencode:ses_3271deae6ffeB626iF1sxHYePP +- **Created by:** alex-solovyev (human + ai-interactive) +- **Conversation context:** User observed pulse processes stuck in infinite rate-limit wait loops. Root cause analysis revealed `contributor-activity-helper.sh` blocks on GitHub Search API rate limits (30 req/min), and this runs inside `pulse-wrapper.sh main()` BEFORE `run_pulse()`, preventing any useful work (dispatch, merge) from ever executing. + +## What + +1. **Remove stats from pulse-wrapper.sh** — `run_daily_quality_sweep` and `update_health_issues` (which calls `_refresh_person_stats_cache` -> `contributor-activity-helper.sh`) must not run in the pulse process at all. +2. 
**Create `stats-wrapper.sh`** — a separate cron-schedulable script that runs quality sweep, health issues, and person-stats on its own schedule (e.g., every 15-30 min) with its own PID dedup and a hard timeout. +3. **Fix `contributor-activity-helper.sh`** — the rate-limit wait loop (line ~1035) must bail out instead of sleeping indefinitely. Return partial results when budget is exhausted. +4. **Wire cron** — `setup.sh` installs the stats cron entry alongside the pulse entry. + +## Why + +The pulse is the only mechanism for dispatching workers and merging PRs. When stats block it, zero useful work gets done. The pulse log showed hours of `Rate limit low (0 remaining), waiting 56s...` with the pulse never reaching `run_pulse()`. This is a production blocker — the entire autonomous orchestration system is dead when stats consume the Search API budget. + +## How (Approach) + +- `pulse-wrapper.sh:main()` — remove `run_daily_quality_sweep` and `update_health_issues` calls, update the execution order comment +- New `.agents/scripts/stats-wrapper.sh` — extract stats logic into standalone script with PID dedup, hard timeout (10 min), and its own log file (`~/.aidevops/logs/stats.log`) +- `contributor-activity-helper.sh:person_stats()` — replace infinite sleep loop with a max-retries (1) approach: if budget < 5, skip remaining users and return partial results +- `setup.sh` — add `stats-wrapper` cron entry (every 15 min) + +Key files: +- `.agents/scripts/pulse-wrapper.sh:3276` — main() function +- `.agents/scripts/contributor-activity-helper.sh:1031` — blocking for loop +- `setup.sh` — cron installation + +## Acceptance Criteria + +- [ ] `pulse-wrapper.sh main()` does not call `run_daily_quality_sweep` or `update_health_issues` + ```yaml + verify: + method: codebase + pattern: "run_daily_quality_sweep|update_health_issues" + path: ".agents/scripts/pulse-wrapper.sh" + expect: absent + ``` +- [ ] `stats-wrapper.sh` exists and is executable + ```yaml + verify: + method: bash + 
run: "test -x .agents/scripts/stats-wrapper.sh" + ``` +- [ ] `contributor-activity-helper.sh` does not sleep indefinitely on rate limit — bails out with partial results + ```yaml + verify: + method: codebase + pattern: "sleep.*wait_secs" + path: ".agents/scripts/contributor-activity-helper.sh" + expect: absent + ``` +- [ ] ShellCheck clean on modified scripts + ```yaml + verify: + method: bash + run: "shellcheck .agents/scripts/pulse-wrapper.sh .agents/scripts/stats-wrapper.sh .agents/scripts/contributor-activity-helper.sh" + ``` + +## Context & Decisions + +- Stats were originally placed before pulse (GH#2958) to "not eat into pulse time" — but the blocking rate-limit loop made this worse than the original problem +- Person-stats cache (t1426) tried to fix this by gating on Search API budget, but the blocking loop inside `contributor-activity-helper.sh` itself was never fixed +- The pulse progress watchdog doesn't catch this because the rate-limit loop writes to stderr -> log grows -> progress detection sees "activity" + +## Relevant Files + +- `.agents/scripts/pulse-wrapper.sh:3276` — main() to modify +- `.agents/scripts/pulse-wrapper.sh:2149` — `_refresh_person_stats_cache()` to extract +- `.agents/scripts/pulse-wrapper.sh:2245` — `update_health_issues()` to extract +- `.agents/scripts/contributor-activity-helper.sh:1031` — blocking for loop to fix +- `setup.sh` — cron installation + +## Dependencies + +- **Blocked by:** none +- **Blocks:** pulse functionality (currently dead) +- **External:** none