diff --git a/.circleci/config.yml b/.circleci/config.yml index 23f0554be93f4..162f27a903fbe 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1074,8 +1074,8 @@ jobs: path: ops/ai-eng/contracts-test-maintenance/log.json destination: log.json - run: - name: Prepare Slack notification - command: just prepare-slack-notification >> $BASH_ENV + name: Build Slack notification + command: just build-slack-notification >> $BASH_ENV working_directory: ops/ai-eng when: always - slack/notify: diff --git a/ops/ai-eng/contracts-test-maintenance/VERSION b/ops/ai-eng/contracts-test-maintenance/VERSION index f0bb29e763888..88c5fb891dcf1 100644 --- a/ops/ai-eng/contracts-test-maintenance/VERSION +++ b/ops/ai-eng/contracts-test-maintenance/VERSION @@ -1 +1 @@ -1.3.0 +1.4.0 diff --git a/ops/ai-eng/contracts-test-maintenance/components/devin-api/devin_client.py b/ops/ai-eng/contracts-test-maintenance/components/devin-api/devin_client.py index bad0ff0e3de75..cb80541ff09b1 100644 --- a/ops/ai-eng/contracts-test-maintenance/components/devin-api/devin_client.py +++ b/ops/ai-eng/contracts-test-maintenance/components/devin-api/devin_client.py @@ -4,7 +4,7 @@ then monitors the session until completion while logging the results. 
""" -from datetime import datetime +from datetime import datetime, timezone import glob import json import os @@ -45,6 +45,7 @@ def write_log(session_id, status, session_data): ranking_file = f"../tests_ranker/output/{run_id}_ranking.json" with open(ranking_file, "r") as f: + data = json.load(f) selected_files = { "test_path": data["entries"][0]["test_path"], @@ -73,8 +74,8 @@ def write_log(session_id, status, session_data): "status": status, } - # Only add PR link if status is finished - if status == "finished" and session_data: + # Add PR link if status is finished or no_changes_needed (both create PRs) + if status in ["finished", "no_changes_needed"] and session_data: pr_data = session_data.get("pull_request") or {} pr_url = pr_data.get("url") if pr_url: @@ -136,28 +137,38 @@ def create_session(prompt): headers = _create_headers(api_key, "application/json") data = json.dumps({"prompt": prompt}).encode("utf-8") - response_data = _make_request(f"{base_url}/sessions", headers, data, "POST") - session_id = response_data["session_id"] + retry_delay = 60 + while True: + response_data = _make_request(f"{base_url}/sessions", headers, data, "POST") + + if response_data is None: + print(f"Session creation timed out, retrying in {retry_delay}s...") + time.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 480) + continue - print(f"Created session: {session_id}") - return session_id + session_id = response_data["session_id"] + print(f"Created session: {session_id}") + return session_id def monitor_session(session_id): """Monitor session status until completion.""" api_key, base_url = _validate_environment() headers = _create_headers(api_key) - last_status = None + last_status_enum = None retry_delay = 60 # Start with 1 minute setup_printed = False timeout_count = 0 + blocked_start_time = None # Track when we first entered blocked state + blocked_timeout = 300 # 5 minutes timeout for blocked state without outcome while True: try: - status = 
_make_request(f"{base_url}/sessions/{session_id}", headers) + api_response = _make_request(f"{base_url}/sessions/{session_id}", headers) # Handle server timeout (no response) - retry with backoff - if status is None: + if api_response is None: timeout_count += 1 # Only print after 3rd consecutive timeout to reduce noise if timeout_count >= 3: @@ -171,10 +182,10 @@ def monitor_session(session_id): if timeout_count > 0: timeout_count = 0 - current_status = status.get("status_enum") + status_enum = api_response.get("status_enum") # Handle Devin setup phase (status_enum is None but we got a response) - if current_status is None: + if status_enum is None: if not setup_printed: print("Devin is setting up...") setup_printed = True @@ -182,49 +193,86 @@ def monitor_session(session_id): continue # Print setup completion message once - if setup_printed and current_status: + if setup_printed and status_enum: print("Devin finished setup") setup_printed = False # Only print when status changes and is meaningful - if current_status and current_status != last_status: - print(f"Status: {current_status}") - last_status = current_status + if status_enum and status_enum != last_status_enum: + print(f"Status: {status_enum}") + last_status_enum = status_enum # Stop monitoring for terminal statuses (only if we have valid status data) - if status and current_status in ["blocked", "expired", "suspend_requested", "suspend_requested_frontend"]: + if api_response and status_enum in ["blocked", "finished", "expired", "suspend_requested", "suspend_requested_frontend"]: # Handle user stopping the session - if current_status in ["suspend_requested", "suspend_requested_frontend"]: + if status_enum in ["suspend_requested", "suspend_requested_frontend"]: print("Session stopped by user") return - # Blocked = PR created or analysis completed without changes - if current_status == "blocked": - structured_output = status.get("structured_output") or {} - pr_data = status.get("pull_request") or {} - - # 
Check if analysis completed without changes - if structured_output.get("analysis_complete") and not structured_output.get("changes_needed"): - reason = structured_output.get("reason", "no changes needed") - print(f"Session completed - {reason}") - write_log(session_id, "finished_no_changes", status) + # Blocked or finished - check for outcome + if status_enum in ["blocked", "finished"]: + # Ensure we have valid status data before accessing nested fields + if api_response is None: + print("Warning: Terminal status reached but no status data available, retrying...") + time.sleep(retry_delay) + continue + + # Check structured output and PR (both should be populated when blocked) + # Note: Devin API nests structured_output twice: {structured_output: {structured_output: {...}}} + # The outer structured_output can be null, so we use `or {}` to handle that case + structured = (api_response.get("structured_output") or {}).get("structured_output") or {} + analysis_complete = structured.get("analysis_complete", False) + changes_needed = structured.get("changes_needed") + + pr_data = api_response.get("pull_request") or {} + pr_url = pr_data.get("url") + + # Case 1: Structured output indicates no changes needed + if analysis_complete and changes_needed is False: + reason = structured.get("reason", "Not provided") + print(f"Session completed - no changes needed") + print(f"Reason: {reason}") + if pr_url: + print(f"PR created for TOML tracking: {pr_url}") + + write_log(session_id, "no_changes_needed", api_response) return - # Check if PR was created - if pr_data.get("url"): - print("Session completed successfully - PR created") - write_log(session_id, "finished", status) + # Case 2: PR created with test improvements (only if no structured output yet) + # We need to wait for structured_output to determine if this is a no-changes case + if pr_url and analysis_complete: + # We have both PR and completed analysis, and changes_needed != False + # This means actual test improvements 
were made + print(f"Session completed successfully - PR created: {pr_url}") + write_log(session_id, "finished", api_response) return - # Blocked without completion signal - print(f"Session blocked without PR - check Devin web interface") - # Don't write log.json so artifact won't be stored for failed sessions - sys.exit(1) # Exit with error code to mark job as failed + # If blocked without complete data, keep waiting briefly + # Devin may still be populating the data + if status_enum == "blocked": + if blocked_start_time is None: + blocked_start_time = time.time() + print("Devin is blocked - waiting for complete outcome data...") + + elapsed = time.time() - blocked_start_time + if elapsed > blocked_timeout: + print(f"Timeout: Devin blocked for {int(elapsed)}s without outcome - check Devin web interface") + sys.exit(1) + + time.sleep(5) + continue + + # Reset blocked timer if we move out of blocked state + blocked_start_time = None + + # Finished without PR and no structured output = error + print(f"Session finished without PR or clear outcome - check Devin web interface") + sys.exit(1) # Expired = session timed out - if current_status == "expired": + if status_enum == "expired": print(f"Session expired") - write_log(session_id, "expired", status) + write_log(session_id, "expired", api_response) return time.sleep(5) diff --git a/ops/ai-eng/contracts-test-maintenance/components/prompt-renderer/render.py b/ops/ai-eng/contracts-test-maintenance/components/prompt-renderer/render.py index 62639cbca214c..bff635001248d 100644 --- a/ops/ai-eng/contracts-test-maintenance/components/prompt-renderer/render.py +++ b/ops/ai-eng/contracts-test-maintenance/components/prompt-renderer/render.py @@ -8,7 +8,7 @@ def load_ranking_data(): - """Load the ranking JSON file and return the first entry and run_id.""" + """Load the ranking JSON file and return the first entry, stale entries, and run_id.""" ranking_dir = Path(__file__).parent / "../tests_ranker" / "output" # Get the ranking 
file @@ -23,7 +23,31 @@ def load_ranking_data(): if not data.get("entries"): raise ValueError(f"No entries found in {ranking_file.name}") - return data["entries"][0], run_id + stale_toml_entries = data.get("stale_toml_entries", []) + + return data["entries"][0], stale_toml_entries, run_id + + +def format_stale_entries(stale_entries): + """Format stale TOML entries as markdown list. + + Args: + stale_entries: List of dicts with test_path, contract_path, old_hash, new_hash + + Returns: + Formatted markdown string with bullet list of stale entries, or "(none)" if empty + """ + if not stale_entries: + return "(none)" + + lines = [] + for entry in stale_entries: + test_path = entry.get("test_path", "unknown") + old_hash = entry.get("old_hash", "unknown")[:7] + new_hash = entry.get("new_hash", "unknown")[:7] + lines.append(f"- `{test_path}` (contract changed: {old_hash} → {new_hash})") + + return "\n".join(lines) def load_prompt_template(): @@ -34,10 +58,22 @@ def load_prompt_template(): return f.read() -def render_prompt(template, test_path, contract_path): - """Replace the placeholders in the template with actual paths.""" - return template.replace("{TEST_PATH}", test_path).replace( - "{CONTRACT_PATH}", contract_path +def render_prompt(template, test_path, contract_path, stale_entries_list): + """Replace the placeholders in the template with actual paths and stale entries. 
+ + Args: + template: The prompt template string + test_path: Path to the test file + contract_path: Path to the contract file + stale_entries_list: Formatted markdown list of stale entries + + Returns: + Rendered prompt with all placeholders replaced + """ + return ( + template.replace("{TEST_PATH}", test_path) + .replace("{CONTRACT_PATH}", contract_path) + .replace("{{STALE_ENTRIES_LIST}}", stale_entries_list) ) @@ -63,7 +99,7 @@ def main(): """Main function to render and save the prompt instance.""" try: # Load ranking data and get run_id - first_entry, run_id = load_ranking_data() + first_entry, stale_toml_entries, run_id = load_ranking_data() test_path = first_entry["test_path"] contract_path = first_entry["contract_path"] @@ -71,11 +107,16 @@ def main(): print(f" Test path: {test_path}") print(f" Contract path: {contract_path}") + # Format stale entries for injection + stale_entries_list = format_stale_entries(stale_toml_entries) + if stale_toml_entries: + print(f" Stale TOML entries: {len(stale_toml_entries)}") + # Load prompt template template = load_prompt_template() - # Render the prompt with actual paths - rendered_prompt = render_prompt(template, test_path, contract_path) + # Render the prompt with actual paths and stale entries + rendered_prompt = render_prompt(template, test_path, contract_path, stale_entries_list) # Save the rendered prompt output_file = save_prompt_instance(rendered_prompt, run_id) diff --git a/ops/ai-eng/contracts-test-maintenance/components/slack-notifier/build_notification.sh b/ops/ai-eng/contracts-test-maintenance/components/slack-notifier/build_notification.sh new file mode 100755 index 0000000000000..db7e03bb0867c --- /dev/null +++ b/ops/ai-eng/contracts-test-maintenance/components/slack-notifier/build_notification.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -eo pipefail + +LOG_FILE="$1" + +if [ -z "$LOG_FILE" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +STATUS=$(jq -r '.status // empty' "$LOG_FILE") +PR_URL=$(jq -r 
'.pull_request_url // empty' "$LOG_FILE") +TEST_FILE=$(jq -r '.selected_files.test_path | split("/") | .[-1]' "$LOG_FILE") + +if [ "$STATUS" = "no_changes_needed" ] && [ -n "$PR_URL" ]; then + # No changes needed but PR opened to add TOML tracking entry + MESSAGE=$' AI Contracts Test Maintenance System analyzed '"${TEST_FILE}"$' - no changes needed (test coverage already comprehensive)\n<'"${PR_URL}"$'|View PR to add no-changes tracking>' + SLACK_JSON=$(jq -n --arg msg "$MESSAGE" '{"text": $msg}') + echo "$SLACK_JSON" +elif [ -n "$PR_URL" ]; then + # Normal case: PR with test improvements + MESSAGE=$' AI Contracts Test Maintenance System created a PR for '"${TEST_FILE}"$'\n<'"${PR_URL}"$'|View PR> | ' + SLACK_JSON=$(jq -n --arg msg "$MESSAGE" '{"text": $msg}') + echo "$SLACK_JSON" +elif [ "$STATUS" = "no_changes_needed" ]; then + # Edge case: no changes and no PR (shouldn't happen with new workflow) + MESSAGE=$' AI Contracts Test Maintenance System analyzed '"${TEST_FILE}"$' - no changes needed (test coverage already comprehensive)' + SLACK_JSON=$(jq -n --arg msg "$MESSAGE" '{"text": $msg}') + echo "$SLACK_JSON" +else + echo "No notification needed (status: $STATUS)" >&2 + echo '{}' +fi diff --git a/ops/ai-eng/contracts-test-maintenance/components/slack-notifier/prepare_notification.sh b/ops/ai-eng/contracts-test-maintenance/components/slack-notifier/prepare_notification.sh deleted file mode 100755 index 1c366ad296201..0000000000000 --- a/ops/ai-eng/contracts-test-maintenance/components/slack-notifier/prepare_notification.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Prepare Slack notification for AI Contracts Test Maintenance System results -# Outputs a JSON message to be used with CircleCI Slack orb - -LOG_FILE="../../log.json" - -# Extract data from log file -PR_URL=$(jq -r '.pull_request_url // empty' "$LOG_FILE") -TEST_FILE=$(jq -r '.selected_files.test_path | split("/") | .[-1]' "$LOG_FILE") -STATUS=$(jq -r '.status // empty' 
"$LOG_FILE") - -# Prepare message based on outcome -if [ -n "$PR_URL" ]; then - # PR was created - notify team for review - MESSAGE=$' AI Contracts Test Maintenance System created a PR for '"${TEST_FILE}"$'\n<'"${PR_URL}"$'|View PR> | ' - SLACK_JSON=$(jq -n --arg msg "$MESSAGE" '{"text": $msg}') - echo "export AI_PR_SLACK_TEMPLATE='${SLACK_JSON}'" -elif [ "$STATUS" = "finished_no_changes" ]; then - # Analysis complete but no changes needed - informational only - MESSAGE=$'AI Contracts Test Maintenance System analyzed '"${TEST_FILE}"$' - no changes needed (test coverage is already comprehensive)' - SLACK_JSON=$(jq -n --arg msg "$MESSAGE" '{"text": $msg}') - echo "export AI_PR_SLACK_TEMPLATE='${SLACK_JSON}'" -else - # No notification needed - echo "No PR created, skipping notification" - echo "export AI_PR_SLACK_TEMPLATE=''" -fi diff --git a/ops/ai-eng/contracts-test-maintenance/components/tests_ranker/test_ranker.py b/ops/ai-eng/contracts-test-maintenance/components/tests_ranker/test_ranker.py index 49de537d94c13..d1881ea9bf277 100644 --- a/ops/ai-eng/contracts-test-maintenance/components/tests_ranker/test_ranker.py +++ b/ops/ai-eng/contracts-test-maintenance/components/tests_ranker/test_ranker.py @@ -20,6 +20,36 @@ # === Git Utilities === +def get_file_commit_hash(file_path: Path, repo_root: Path) -> Optional[str]: + """Get the commit hash of the last commit that modified a file. + + Args: + file_path: Path to the file. + repo_root: Path to the git repository root. + + Returns: + Full SHA-1 hash of the last commit, or None if unable to determine. 
+ """ + try: + relative_path = file_path.relative_to(repo_root) + + result = subprocess.run( + ["git", "log", "-1", "--format=%H", "--", str(relative_path)], + cwd=repo_root, + capture_output=True, + text=True, + check=True, + ) + + if result.stdout.strip(): + return result.stdout.strip() + + except (subprocess.CalledProcessError, ValueError, OSError): + pass + + return None + + def get_file_commit_timestamp(file_path: Path, repo_root: Path) -> Optional[int]: """Get the timestamp of the last commit that modified a file. @@ -306,14 +336,97 @@ def fetch_last_processed_from_circleci() -> list[Path]: return [] -def load_exclusions(contracts_bedrock: Path) -> tuple[list[Path], set[Path]]: +def fetch_no_changes_needed_exclusions(contracts_bedrock: Path, repo_root: Path) -> tuple[set[Path], list[dict]]: + """Load 'no changes needed' tests from TOML file, detecting stale entries. + + Reads tracking file from no-need-changes.toml, checks each entry's contract hash. + Returns both tests to exclude (unchanged contracts) and stale entries (changed contracts). + + Args: + contracts_bedrock: Path to the contracts-bedrock directory. + repo_root: Path to the git repository root. 
+ + Returns: + Tuple of (excluded_tests, stale_entries): + - excluded_tests: Set of test paths to exclude (contracts haven't changed) + - stale_entries: List of dicts with stale entry info for Devin to clean up + """ + try: + print("Checking no-need-changes.toml for tracked tests...") + + toml_file = Path(__file__).parent.parent.parent / "no-need-changes.toml" + + if not toml_file.exists(): + print("No tracking file found") + return set(), [] + + with toml_file.open("rb") as f: + tracking_data = tomllib.load(f) + + tests = tracking_data.get("tests", []) + if not tests: + print("No tracked tests found") + return set(), [] + + print(f"Loaded {len(tests)} tracked test(s)") + + excluded_tests = set() + stale_entries = [] + + for entry in tests: + test_path = entry.get("test_path") + contract_path = entry.get("contract_path") + recorded_hash = entry.get("contract_hash") + + if not test_path or not contract_path or not recorded_hash: + continue + + # Get current contract hash + full_contract_path = contracts_bedrock / contract_path + current_hash = get_file_commit_hash(full_contract_path, repo_root) + + if not current_hash: + # Can't get hash - skip this entry + continue + + if current_hash == recorded_hash: + # Contract unchanged - exclude from ranking + excluded_tests.add(Path(test_path)) + print(f" Excluding: {test_path} (contract unchanged)") + else: + # Contract changed - entry is stale (Devin should remove it) + print(f" Stale entry: {test_path} (contract changed: {recorded_hash[:7]} → {current_hash[:7]})") + stale_entries.append({ + "test_path": test_path, + "contract_path": contract_path, + "old_hash": recorded_hash, + "new_hash": current_hash + }) + + if excluded_tests: + print(f"✓ Excluding {len(excluded_tests)} test(s) with unchanged contracts") + + if stale_entries: + print(f"⚠ Found {len(stale_entries)} stale entry(ies) - Devin will clean these up") + + return excluded_tests, stale_entries + + except Exception as e: + print(f"Could not fetch tracking file: 
{e}") + return set(), [] + + +def load_exclusions(contracts_bedrock: Path) -> tuple[list[Path], set[Path], list[dict]]: """Load and normalize exclusion paths from TOML configuration. Args: contracts_bedrock: Path to the contracts-bedrock directory. Returns: - Tuple of (excluded_dirs, excluded_files) as normalized Path objects. + Tuple of (excluded_dirs, excluded_files, stale_toml_entries): + - excluded_dirs: List of excluded directory paths + - excluded_files: Set of excluded file paths + - stale_toml_entries: List of stale entries from no-need-changes.toml Raises: FileNotFoundError: If exclusions.toml file is not found. @@ -347,7 +460,12 @@ def load_exclusions(contracts_bedrock: Path) -> tuple[list[Path], set[Path]]: for test_file in last_processed_files: excluded_files.add(test_file) - return excluded_dirs, excluded_files + # Add TOML-based "no changes needed" exclusions (permanent until contract changes) + repo_root = get_base_paths()[0] + no_changes_exclusions, stale_toml_entries = fetch_no_changes_needed_exclusions(contracts_bedrock, repo_root) + excluded_files.update(no_changes_exclusions) + + return excluded_dirs, excluded_files, stale_toml_entries def is_path_excluded( @@ -412,7 +530,10 @@ def filter_excluded_files( def generate_ranking_json( - entries: list[dict[str, str | int | float | None]], output_dir: Path, run_id: str + entries: list[dict[str, str | int | float | None]], + output_dir: Path, + run_id: str, + stale_toml_entries: list[dict] = None ) -> Path: """Generate the ranking JSON file. @@ -420,10 +541,13 @@ def generate_ranking_json( entries: List of test-to-contract mappings with scores. output_dir: Directory to write the output file. run_id: Timestamp-based run identifier. + stale_toml_entries: List of stale entries from no-need-changes.toml (optional). Returns: Path to the generated JSON file. 
""" + if stale_toml_entries is None: + stale_toml_entries = [] # Ensure output directory exists output_dir.mkdir(parents=True, exist_ok=True) @@ -441,6 +565,7 @@ def generate_ranking_json( "run_id": run_id, "generated_at": datetime.now(timezone.utc).isoformat(), "entries": sorted_entries, + "stale_toml_entries": stale_toml_entries, } # Write to output file with run_id @@ -539,16 +664,16 @@ def main() -> None: # Get base paths repo_root, contracts_bedrock, output_dir = get_base_paths() - # Load exclusions - excluded_dirs, excluded_files = load_exclusions(contracts_bedrock) + # Load exclusions (now also returns stale TOML entries) + excluded_dirs, excluded_files, stale_toml_entries = load_exclusions(contracts_bedrock) # Collect test entries entries = collect_test_entries( contracts_bedrock, excluded_dirs, excluded_files, repo_root ) - # Generate ranking JSON with run_id - output_file = generate_ranking_json(entries, output_dir, run_id) + # Generate ranking JSON with run_id and stale entries + output_file = generate_ranking_json(entries, output_dir, run_id, stale_toml_entries) print(f"Generated {output_file} with {len(entries)} entries") print(f"Run ID: {run_id}") diff --git a/ops/ai-eng/contracts-test-maintenance/docs/runbook.md b/ops/ai-eng/contracts-test-maintenance/docs/runbook.md index 582d321c2f21c..5da50e030cd73 100644 --- a/ops/ai-eng/contracts-test-maintenance/docs/runbook.md +++ b/ops/ai-eng/contracts-test-maintenance/docs/runbook.md @@ -15,6 +15,8 @@ The system uses a **two-branch scoring algorithm**: - **Automated CI Integration**: Runs twice weekly on schedule (Monday/Thursday) or on-demand - **Smart Prioritization**: Focuses on tests that are most out of sync with their contracts - **Duplicate Prevention**: Automatically excludes recently processed files (last 2 weeks) +- **No-Changes Tracking**: Tests with comprehensive coverage tracked in TOML file to avoid redundant work +- **Stale Entry Detection**: Automatically identifies when tracked tests need 
re-analysis due to contract changes - **Resilient Monitoring**: Handles long-running Devin sessions with retry logic - **Full Audit Trail**: All runs logged with complete traceability @@ -26,7 +28,8 @@ The system uses a **two-branch scoring algorithm**: contracts-test-maintenance/ ├── VERSION # System version ├── exclusion.toml # Static exclusions configuration -├── log.json # Latest execution log +├── no-need-changes.toml # Tests with comprehensive coverage (auto-managed) +├── log.jsonl # Execution history and results ├── prompt/ │ └── prompt.md # AI instruction template (~2000 lines) ├── components/ @@ -36,10 +39,8 @@ contracts-test-maintenance/ │ ├── prompt-renderer/ # Stage 2: Prompt generation │ │ ├── render.py │ │ └── output/{run_id}_prompt.md -│ ├── devin-api/ # Stage 3: AI execution -│ │ └── devin_client.py -│ └── slack-notifier/ # Slack notification preparation -│ └── prepare_notification.sh +│ └── devin-api/ # Stage 3: AI execution +│ └── devin_client.py └── docs/ └── runbook.md # This document ``` @@ -83,7 +84,9 @@ ai-contracts-test: **Slack Notifications**: - Automatic notification posted to #evm-safety Slack channel when PR is created -- Includes PR URL, test file information, and link to reviewer guide +- Two notification types: + - **Test improvements**: Includes PR URL, test file info, and reviewer guide link + - **No changes needed**: Notifies team that test has comprehensive coverage with PR for TOML tracking update - Helps expedite review process by alerting reviewers immediately **In CircleCI**: @@ -166,6 +169,14 @@ The `just rank` command generates `components/tests_ranker/output/{run_id}_ranki "staleness_days": -98.21, "score": 135.84 } + ], + "stale_toml_entries": [ + { + "test_path": "test/L1/SystemConfig.t.sol", + "contract_path": "src/L1/SystemConfig.sol", + "old_hash": "abc1234", + "new_hash": "def5678" + } ] } ``` @@ -180,10 +191,14 @@ The `just rank` command generates `components/tests_ranker/output/{run_id}_ranki - `contract_commit_ts` 
- Unix timestamp of contract file's last commit - `staleness_days` - Calculated staleness (positive = contract newer) - `score` - Priority score (higher = more urgent) +- `stale_toml_entries` - Array of no-need-changes.toml entries that need removal (contract hash changed) ### Prompt Renderer Output -The `just render` command generates a markdown file in `components/prompt-renderer/output/` with the name format `{run_id}_prompt.md`. This file contains the AI prompt template with the highest-priority test and contract paths filled in, ready to be used for test maintenance analysis. +The `just render` command generates a markdown file in `components/prompt-renderer/output/` with the name format `{run_id}_prompt.md`. This file contains the AI prompt template with: +- The highest-priority test and contract paths filled in +- List of stale no-need-changes.toml entries (if any) that Devin should remove before starting analysis +- Instructions for adding new entries when no changes are needed For example, a run with ID `20250922_143052` will generate `20250922_143052_prompt.md`. The system automatically links prompts to their corresponding ranking runs through the shared run ID. 
@@ -239,17 +254,26 @@ All Devin sessions are automatically logged to `log.json` with: - `run_time` - Human-readable timestamp of the run - `devin_session_id` - Unique Devin session identifier - `selected_files` - The test-contract pair that was worked on -- `status` - Final session status ("finished", "finished_no_changes", "blocked", "expired") -- `pull_request_url` - GitHub PR URL (only present if status is "finished") +- `status` - Final session status ("finished", "no_changes_needed", "blocked", "expired", "failed") +- `pull_request_url` - GitHub PR URL (present for both "finished" and "no_changes_needed" statuses) #### Duplicate Prevention -The ranking system automatically excludes files processed in the **last 2 weeks** to prevent duplicate work: +The ranking system uses two complementary strategies to prevent duplicate work: + +**1. No-Changes Tracking (with automatic reintegration)**: +- Tests with comprehensive coverage tracked in `no-need-changes.toml` +- Each entry includes contract git hash for validation +- Automatically excluded from ranking while contract remains unchanged +- **Automatically reintegrated** when contract changes (hash mismatch detected) +- Stale entries flagged in ranking output for Devin to remove + +**2. Recent Processing Exclusion (2-week cooldown)**: - Queries CircleCI API for recent successful runs - Extracts test paths from stored `log.json` artifacts - Temporarily excludes these files from ranking - Files become available again after 2 weeks -- This prevents immediate re-ranking of files still under review +- Prevents immediate re-ranking of files still under review ## Configuration @@ -320,8 +344,12 @@ else: 3. Get git commit timestamps using `git log -1 --format=%ct` 4. Calculate staleness: `contract_commit_ts - test_commit_ts` 5. Calculate priority score using two-branch algorithm -6. Apply exclusions (static from `exclusion.toml` + dynamic from CircleCI) -7. Sort by score (descending) and output to JSON +6. 
Apply exclusions: + - Static exclusions from `exclusion.toml` + - Dynamic exclusions from CircleCI artifacts (2-week window) + - **No-changes tracking from `no-need-changes.toml`** (excluded while contract hash matches) +7. Detect stale TOML entries (contract hash changed since entry was added) +8. Sort by score (descending) and output to JSON with stale entries **Output Fields**: - `test_path`: Relative path from `contracts-bedrock/` @@ -330,20 +358,25 @@ else: - `contract_commit_ts`: Unix timestamp (seconds since epoch) - `staleness_days`: Float, positive = contract is newer - `score`: Float, higher = more urgent attention needed +- `stale_toml_entries`: Array of entries that need removal (contract changed) ### Stage 2: Prompt Rendering **Process**: 1. Load ranking JSON from Stage 1 output -2. Extract first entry (highest priority test) -3. Load prompt template from `prompt/prompt.md` -4. Replace placeholders: +2. Extract first entry (highest priority test) and stale TOML entries +3. Format stale entries as markdown bullet list +4. Load prompt template from `prompt/prompt.md` +5. Replace placeholders: - `{TEST_PATH}` → test file path - `{CONTRACT_PATH}` → contract file path -5. Save rendered prompt to `output/` with same `run_id` + - `{{STALE_ENTRIES_LIST}}` → formatted list of stale entries (or "(none)") +6. Save rendered prompt to `output/` with same `run_id` **The Prompt Template** contains: - Role definition and task instructions +- **Stale entries cleanup instructions** (if any entries flagged) +- **No-changes tracking instructions** (how to add entries when no improvements needed) - Comprehensive testing methodology (4 phases) - Naming conventions for test contracts and functions - Fuzz testing decision trees @@ -356,24 +389,31 @@ else: **Process**: 1. Find latest prompt file from Stage 2 -2. Create Devin session via POST to `/sessions` endpoint +2. Create Devin session via POST to `/sessions` endpoint with session creation retry logic 3. 
Monitor session with polling: - Poll every 30 seconds for status updates + - Check for blocked state with 5-minute timeout + - Parse `structured_output` for completion signal - Implement exponential backoff for server errors (502/504) - Continue monitoring until terminal state reached -4. Log results to `log.json` with full session details +4. Determine final status based on Devin state and structured output +5. Log results to `log.json` with full session details **Devin API Terminal States**: -- `blocked`: Devin reached a blocking state (e.g., needs approval, PR created) +- `blocked`: Devin reached a blocking state (e.g., needs approval, PR created, or waiting) - `expired`: Session timeout reached - `suspend_requested` / `suspend_requested_frontend`: User manually stopped session -**Logged Status Values** (what gets written to `log.json`): -- `finished`: Devin status was "blocked" AND PR was successfully created -- `finished_no_changes`: Devin completed analysis and determined no changes needed (uses structured output) -- `blocked`: Devin status was "blocked" without PR URL or completion signal -- `expired`: Session timed out -- Note: User-stopped sessions are not logged +**Status Detection Logic**: +The client distinguishes between different outcomes by examining both Devin API status and structured output: + +1. **`finished`**: Devin blocked + PR created for test improvements +2. **`no_changes_needed`**: Devin blocked + structured_output indicates comprehensive coverage + PR created for TOML entry +3. **`blocked`**: Devin blocked but no PR or unclear state +4. **`expired`**: Session timeout +5. User-stopped sessions are not logged + +**Note**: The `status` field in `log.json` represents our interpretation of what happened, not the raw Devin API status. 
**Error Handling**: - 30-second timeout per HTTP request diff --git a/ops/ai-eng/contracts-test-maintenance/exclusion.toml b/ops/ai-eng/contracts-test-maintenance/exclusion.toml index 76a04639b1bed..d049863db28c9 100644 --- a/ops/ai-eng/contracts-test-maintenance/exclusion.toml +++ b/ops/ai-eng/contracts-test-maintenance/exclusion.toml @@ -22,6 +22,5 @@ files = [ "test/universal/BenchmarkTest.t.sol", "test/universal/ExtendedPause.t.sol", "test/vendor/Initializable.t.sol", - "test/vendor/InitializableOZv5.t.sol", - "test/universal/ReinitializableBase.t.sol" + "test/vendor/InitializableOZv5.t.sol" ] diff --git a/ops/ai-eng/contracts-test-maintenance/no-need-changes.toml b/ops/ai-eng/contracts-test-maintenance/no-need-changes.toml new file mode 100644 index 0000000000000..9b1330b9c16b5 --- /dev/null +++ b/ops/ai-eng/contracts-test-maintenance/no-need-changes.toml @@ -0,0 +1,32 @@ +# Files that don't need changes +# When a test file is analyzed and determined to not need changes, +# add an entry here with the test path, contract path, and contract hash. +# Entries are automatically removed when the contract hash changes. + +# Example entry: +# [[tests]] +# test_path = "test/L1/SystemConfig.t.sol" +# contract_path = "src/L1/SystemConfig.sol" +# contract_hash = "abc123..." +# recorded_at = "2025-12-11T20:00:00Z" +# devin_session_id = "devin-abc123" +# run_id = "20251211_200000" +# reason = "Test coverage is comprehensive" + +[[tests]] +test_path = "test/universal/ReinitializableBase.t.sol" +contract_path = "src/universal/ReinitializableBase.sol" +contract_hash = "cbe992160b4978a49dd92949106531f7bd6e2029" +recorded_at = "2025-12-12T17:18:04Z" +devin_session_id = "677a871d381f44848ff297549125765c" +run_id = "20251212_171800" +reason = "Test coverage is already comprehensive with all functions and code paths tested. 
The constructor has both focused (zero version revert) and fuzz tests (valid versions 1-255), and the initVersion() getter is verified within the constructor test." + +[[tests]] +test_path = "test/universal/SafeSend.t.sol" +contract_path = "src/universal/SafeSend.sol" +contract_hash = "35f7553422a7b81ad998d04ed1f67e1fa56d6b8d" +recorded_at = "2025-12-12T17:32:49Z" +devin_session_id = "50e7ca21d9264fe6a62c7ec944da7e43" +run_id = "20251212_173200" +reason = "Test coverage is already comprehensive with all functions and code paths tested. The constructor has three fuzz tests covering EOA, contract, and zero address recipients with thorough assertions." diff --git a/ops/ai-eng/contracts-test-maintenance/prompt/prompt.md b/ops/ai-eng/contracts-test-maintenance/prompt/prompt.md index 38e350fe4c56b..7ec2c73501f3e 100644 --- a/ops/ai-eng/contracts-test-maintenance/prompt/prompt.md +++ b/ops/ai-eng/contracts-test-maintenance/prompt/prompt.md @@ -28,10 +28,44 @@ Only make changes you're confident about - analyze code behavior before testing. Don't guess or assume - if unsure, examine the source contract carefully. + +1. NO creating NEW tests for inherited functions - only test functions declared in target contract +2. NO creating test contracts for constructor parameters - use Constructor_Test instead +3. NO failing tests kept - all must pass or task fails +4. 
NO removing ANY existing tests - even if they test inherited functions (enhance/modify instead) + + + +- Enhancement First: Always improve existing tests before adding new ones +- Function-First Organization: Every function gets its own test contract; Uncategorized_Test is reserved for true multi-function integration scenarios +- Preserve Behavior: Modify tests only to improve coverage/naming while keeping original functionality +- Contract Reality: Test what the contract DOES, not what you think it SHOULD do +- Target Contract Only: Never test inherited functions - only test functions declared in the contract under test +- Test Valid Scenarios: Focus on legitimate use cases and edge cases, not artificial failure modes from broken setup +- Test Intent vs Side Effects: Verify tests fail for their intended reason, not technical side effects +- Test Uniqueness: Each test must verify distinct logic - different values alone don't justify separate tests + + + +{TEST_PATH} +{CONTRACT_PATH} + + + +Enhance the provided Solidity test file following these objectives: +1. Convert regular tests to fuzz tests where appropriate +2. Add tests for uncovered code paths (if statements, branches, reverts) +3. Ensure every public/external function has at least one test + +Focus on mechanical improvements that increase coverage and quality. + + -You MUST maintain a `structured_output` field to communicate task completion status. Please update the structured output immediately after completing your analysis (Phases 1-2) to indicate whether changes are needed. +**IMPORTANT: You MUST maintain a `structured_output` field to communicate task status.** + +Please update the structured output **immediately after completing Phases 1-2 (Enhancement Analysis & Coverage Gap Analysis)** to indicate whether changes are needed. 
-**Required Format:** +**Required JSON Structure:** ```json { "analysis_complete": boolean, @@ -40,12 +74,13 @@ You MUST maintain a `structured_output` field to communicate task completion sta } ``` +**Field Definitions:** +- `analysis_complete`: Set to `true` as soon as you determine the outcome after Phases 1-2 +- `changes_needed`: Set to `true` if you will create or modify tests; `false` if no changes needed +- `reason`: Brief explanation (1-2 sentences) of why changes are/aren't needed + **When to Update:** -- Update immediately after completing Phases 1-2 (Enhancement Analysis & Coverage Gap Analysis) -- Set `analysis_complete: true` as soon as you determine the outcome -- Set `changes_needed: true` if you will create or modify tests -- Set `changes_needed: false` if no changes are needed after thorough analysis -- Set `reason` with a brief explanation of the outcome +Update the structured output immediately after completing your analysis (Phases 1-2), before starting implementation work. This allows the system to detect when no changes are required. **Examples:** @@ -54,7 +89,7 @@ No changes needed: { "analysis_complete": true, "changes_needed": false, - "reason": "Test coverage is already comprehensive with all functions and code paths tested" + "reason": "Test coverage is already comprehensive with all functions and code paths tested. The constructor has both focused and fuzz tests covering all valid inputs." } ``` @@ -63,42 +98,63 @@ Changes needed: { "analysis_complete": true, "changes_needed": true, - "reason": "Converting 3 tests to fuzz tests and adding coverage for 2 untested functions" + "reason": "Converting 3 tests to fuzz tests and adding coverage for 2 untested error conditions in the validate() function." } ``` + +**Critical:** Set this output BEFORE proceeding to Phase 3. If `changes_needed: false`, you can stop after Phase 2. - -1. NO creating NEW tests for inherited functions - only test functions declared in target contract -2. 
NO creating test contracts for constructor parameters - use Constructor_Test instead -3. NO failing tests kept - all must pass or task fails -4. NO removing ANY existing tests - even if they test inherited functions (enhance/modify instead) - + +**IMPORTANT: When no changes are needed (`changes_needed: false`), you MUST update the tracking file.** - -- Enhancement First: Always improve existing tests before adding new ones -- Function-First Organization: Every function gets its own test contract; Uncategorized_Test is reserved for true multi-function integration scenarios -- Preserve Behavior: Modify tests only to improve coverage/naming while keeping original functionality -- Contract Reality: Test what the contract DOES, not what you think it SHOULD do -- Target Contract Only: Never test inherited functions - only test functions declared in the contract under test -- Test Valid Scenarios: Focus on legitimate use cases and edge cases, not artificial failure modes from broken setup -- Test Intent vs Side Effects: Verify tests fail for their intended reason, not technical side effects -- Test Uniqueness: Each test must verify distinct logic - different values alone don't justify separate tests - +If your analysis determines that no changes are needed, add an entry to `ops/ai-eng/contracts-test-maintenance/no-need-changes.toml` with the following information: - -{TEST_PATH} -{CONTRACT_PATH} - +```toml +[[tests]] +test_path = "{TEST_PATH}" +contract_path = "{CONTRACT_PATH}" +contract_hash = "<git commit hash of the contract file>" +recorded_at = "<current UTC timestamp, ISO 8601>" +devin_session_id = "<your Devin session ID>" +run_id = "<run ID from the prompt filename>" +reason = "<brief explanation of why no changes are needed>" +``` - -Enhance the provided Solidity test file following these objectives: -1. Convert regular tests to fuzz tests where appropriate -2. Add tests for uncovered code paths (if statements, branches, reverts) -3. 
Ensure every public/external function has at least one test +**How to get the contract hash:** +Run this command: `git log -1 --format=%H -- packages/contracts-bedrock/{CONTRACT_PATH}` + +**Example entry:** +```toml +[[tests]] +test_path = "test/L1/SystemConfig.t.sol" +contract_path = "src/L1/SystemConfig.sol" +contract_hash = "abc123def456..." +recorded_at = "2025-12-11T20:30:00Z" +devin_session_id = "devin-abc123" +run_id = "20251211_203000" +reason = "Test coverage is already comprehensive with all functions and code paths tested." +``` -Focus on mechanical improvements that increase coverage and quality. - +**Critical Steps:** +1. Get the contract hash using the git command above +2. Add the entry to the TOML file +3. Commit this change with message: `chore(ai-test): skip {TEST_PATH} - already has comprehensive coverage` +4. **Create a pull request** with the default template to record this decision + +This tracking allows the system to automatically exclude well-tested files from future runs until the contract changes. 
+ + + +**IMPORTANT: Clean up stale TOML entries before starting your analysis.** + +The following entries in `ops/ai-eng/contracts-test-maintenance/no-need-changes.toml` have outdated contract hashes and must be removed: + +{{STALE_ENTRIES_LIST}} + +Remove these entries from the TOML file and commit with message: +`chore(ai-test): remove stale entries from no-need-changes.toml` + **Structured Enhancement Methodology** diff --git a/ops/ai-eng/justfile b/ops/ai-eng/justfile index 33cdf858fbbec..4af89c07777cb 100644 --- a/ops/ai-eng/justfile +++ b/ops/ai-eng/justfile @@ -30,6 +30,8 @@ ai-contracts-test: # Step 3: Send to Devin just devin -# Prepare Slack notification based on log.json results -prepare-slack-notification: - cd contracts-test-maintenance/components/slack-notifier && ./prepare_notification.sh +# Build Slack notification JSON from log.json results +build-slack-notification: + #!/usr/bin/env bash + SLACK_JSON=$(bash contracts-test-maintenance/components/slack-notifier/build_notification.sh contracts-test-maintenance/log.json) + echo "export AI_PR_SLACK_TEMPLATE='${SLACK_JSON}'"