Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1074,8 +1074,8 @@ jobs:
path: ops/ai-eng/contracts-test-maintenance/log.json
destination: log.json
- run:
name: Prepare Slack notification
command: just prepare-slack-notification >> $BASH_ENV
name: Build Slack notification
command: just build-slack-notification >> $BASH_ENV
working_directory: ops/ai-eng
when: always
- slack/notify:
Expand Down
2 changes: 1 addition & 1 deletion ops/ai-eng/contracts-test-maintenance/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.3.0
1.4.0
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
then monitors the session until completion while logging the results.
"""

from datetime import datetime
from datetime import datetime, timezone
import glob
import json
import os
Expand Down Expand Up @@ -45,6 +45,7 @@ def write_log(session_id, status, session_data):

ranking_file = f"../tests_ranker/output/{run_id}_ranking.json"
with open(ranking_file, "r") as f:

data = json.load(f)
selected_files = {
"test_path": data["entries"][0]["test_path"],
Expand Down Expand Up @@ -73,8 +74,8 @@ def write_log(session_id, status, session_data):
"status": status,
}

# Only add PR link if status is finished
if status == "finished" and session_data:
# Add PR link if status is finished or no_changes_needed (both create PRs)
if status in ["finished", "no_changes_needed"] and session_data:
pr_data = session_data.get("pull_request") or {}
pr_url = pr_data.get("url")
if pr_url:
Expand Down Expand Up @@ -136,28 +137,38 @@ def create_session(prompt):
headers = _create_headers(api_key, "application/json")
data = json.dumps({"prompt": prompt}).encode("utf-8")

response_data = _make_request(f"{base_url}/sessions", headers, data, "POST")
session_id = response_data["session_id"]
retry_delay = 60
while True:
response_data = _make_request(f"{base_url}/sessions", headers, data, "POST")

if response_data is None:
print(f"Session creation timed out, retrying in {retry_delay}s...")
time.sleep(retry_delay)
retry_delay = min(retry_delay * 2, 480)
continue

print(f"Created session: {session_id}")
return session_id
session_id = response_data["session_id"]
print(f"Created session: {session_id}")
return session_id


def monitor_session(session_id):
"""Monitor session status until completion."""
api_key, base_url = _validate_environment()
headers = _create_headers(api_key)
last_status = None
last_status_enum = None
retry_delay = 60 # Start with 1 minute
setup_printed = False
timeout_count = 0
blocked_start_time = None # Track when we first entered blocked state
blocked_timeout = 300 # 5 minutes timeout for blocked state without outcome

while True:
try:
status = _make_request(f"{base_url}/sessions/{session_id}", headers)
api_response = _make_request(f"{base_url}/sessions/{session_id}", headers)

# Handle server timeout (no response) - retry with backoff
if status is None:
if api_response is None:
timeout_count += 1
# Only print after 3rd consecutive timeout to reduce noise
if timeout_count >= 3:
Expand All @@ -171,60 +182,97 @@ def monitor_session(session_id):
if timeout_count > 0:
timeout_count = 0

current_status = status.get("status_enum")
status_enum = api_response.get("status_enum")

# Handle Devin setup phase (status_enum is None but we got a response)
if current_status is None:
if status_enum is None:
if not setup_printed:
print("Devin is setting up...")
setup_printed = True
time.sleep(5)
continue

# Print setup completion message once
if setup_printed and current_status:
if setup_printed and status_enum:
print("Devin finished setup")
setup_printed = False

# Only print when status changes and is meaningful
if current_status and current_status != last_status:
print(f"Status: {current_status}")
last_status = current_status
if status_enum and status_enum != last_status_enum:
print(f"Status: {status_enum}")
last_status_enum = status_enum

# Stop monitoring for terminal statuses (only if we have valid status data)
if status and current_status in ["blocked", "expired", "suspend_requested", "suspend_requested_frontend"]:
if api_response and status_enum in ["blocked", "finished", "expired", "suspend_requested", "suspend_requested_frontend"]:
# Handle user stopping the session
if current_status in ["suspend_requested", "suspend_requested_frontend"]:
if status_enum in ["suspend_requested", "suspend_requested_frontend"]:
print("Session stopped by user")
return

# Blocked = PR created or analysis completed without changes
if current_status == "blocked":
structured_output = status.get("structured_output") or {}
pr_data = status.get("pull_request") or {}

# Check if analysis completed without changes
if structured_output.get("analysis_complete") and not structured_output.get("changes_needed"):
reason = structured_output.get("reason", "no changes needed")
print(f"Session completed - {reason}")
write_log(session_id, "finished_no_changes", status)
# Blocked or finished - check for outcome
if status_enum in ["blocked", "finished"]:
# Ensure we have valid status data before accessing nested fields
if api_response is None:
print("Warning: Terminal status reached but no status data available, retrying...")
time.sleep(retry_delay)
continue

# Check structured output and PR (both should be populated when blocked)
# Note: Devin API nests structured_output twice: {structured_output: {structured_output: {...}}}
# The outer structured_output can be null, so we use `or {}` to handle that case
structured = (api_response.get("structured_output") or {}).get("structured_output") or {}
analysis_complete = structured.get("analysis_complete", False)
changes_needed = structured.get("changes_needed")

pr_data = api_response.get("pull_request") or {}
pr_url = pr_data.get("url")

# Case 1: Structured output indicates no changes needed
if analysis_complete and changes_needed is False:
reason = structured.get("reason", "Not provided")
print(f"Session completed - no changes needed")
print(f"Reason: {reason}")
if pr_url:
print(f"PR created for TOML tracking: {pr_url}")

write_log(session_id, "no_changes_needed", api_response)
return

# Check if PR was created
if pr_data.get("url"):
print("Session completed successfully - PR created")
write_log(session_id, "finished", status)
# Case 2: PR created with test improvements (only if no structured output yet)
# We need to wait for structured_output to determine if this is a no-changes case
if pr_url and analysis_complete:
# We have both PR and completed analysis, and changes_needed != False
# This means actual test improvements were made
print(f"Session completed successfully - PR created: {pr_url}")
write_log(session_id, "finished", api_response)
return

# Blocked without completion signal
print(f"Session blocked without PR - check Devin web interface")
# Don't write log.json so artifact won't be stored for failed sessions
sys.exit(1) # Exit with error code to mark job as failed
# If blocked without complete data, keep waiting briefly
# Devin may still be populating the data
if status_enum == "blocked":
if blocked_start_time is None:
blocked_start_time = time.time()
print("Devin is blocked - waiting for complete outcome data...")

elapsed = time.time() - blocked_start_time
if elapsed > blocked_timeout:
print(f"Timeout: Devin blocked for {int(elapsed)}s without outcome - check Devin web interface")
sys.exit(1)

time.sleep(5)
continue

# Reset blocked timer if we move out of blocked state
blocked_start_time = None

# Finished without PR and no structured output = error
print(f"Session finished without PR or clear outcome - check Devin web interface")
sys.exit(1)

# Expired = session timed out
if current_status == "expired":
if status_enum == "expired":
print(f"Session expired")
write_log(session_id, "expired", status)
write_log(session_id, "expired", api_response)
return

time.sleep(5)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def load_ranking_data():
"""Load the ranking JSON file and return the first entry and run_id."""
"""Load the ranking JSON file and return the first entry, stale entries, and run_id."""
ranking_dir = Path(__file__).parent / "../tests_ranker" / "output"

# Get the ranking file
Expand All @@ -23,7 +23,31 @@ def load_ranking_data():
if not data.get("entries"):
raise ValueError(f"No entries found in {ranking_file.name}")

return data["entries"][0], run_id
stale_toml_entries = data.get("stale_toml_entries", [])

return data["entries"][0], stale_toml_entries, run_id


def format_stale_entries(stale_entries):
    """Format stale TOML entries as a markdown bullet list.

    Args:
        stale_entries: List of dicts with test_path, contract_path, old_hash,
            new_hash. May be empty or None.

    Returns:
        One markdown bullet per entry joined by newlines, or "(none)" when
        there are no entries. Hashes are shortened to their first 7 characters;
        a missing or null field is rendered as "unknown".
    """
    if not stale_entries:
        return "(none)"

    def _short_hash(value):
        # A key that is present but null (JSON `null`) would make a plain
        # `.get(key, "unknown")[:7]` raise TypeError, so treat missing and
        # null alike and coerce to str before slicing.
        return str(value)[:7] if value else "unknown"

    lines = []
    for entry in stale_entries:
        test_path = entry.get("test_path") or "unknown"
        old_hash = _short_hash(entry.get("old_hash"))
        new_hash = _short_hash(entry.get("new_hash"))
        lines.append(f"- `{test_path}` (contract changed: {old_hash} → {new_hash})")

    return "\n".join(lines)


def load_prompt_template():
Expand All @@ -34,10 +58,22 @@ def load_prompt_template():
return f.read()


def render_prompt(template, test_path, contract_path):
"""Replace the placeholders in the template with actual paths."""
return template.replace("{TEST_PATH}", test_path).replace(
"{CONTRACT_PATH}", contract_path
def render_prompt(template, test_path, contract_path, stale_entries_list):
    """Fill the prompt template's placeholders with concrete values.

    Args:
        template: The prompt template string
        test_path: Path to the test file
        contract_path: Path to the contract file
        stale_entries_list: Formatted markdown list of stale entries

    Returns:
        The template with every placeholder occurrence substituted
    """
    # Applied in order. Note the stale-entries placeholder is double-braced,
    # unlike the two path placeholders.
    substitutions = (
        ("{TEST_PATH}", test_path),
        ("{CONTRACT_PATH}", contract_path),
        ("{{STALE_ENTRIES_LIST}}", stale_entries_list),
    )

    rendered = template
    for placeholder, value in substitutions:
        rendered = rendered.replace(placeholder, value)
    return rendered


Expand All @@ -63,19 +99,24 @@ def main():
"""Main function to render and save the prompt instance."""
try:
# Load ranking data and get run_id
first_entry, run_id = load_ranking_data()
first_entry, stale_toml_entries, run_id = load_ranking_data()
test_path = first_entry["test_path"]
contract_path = first_entry["contract_path"]

print(f"Using ranking from run {run_id}:")
print(f" Test path: {test_path}")
print(f" Contract path: {contract_path}")

# Format stale entries for injection
stale_entries_list = format_stale_entries(stale_toml_entries)
if stale_toml_entries:
print(f" Stale TOML entries: {len(stale_toml_entries)}")

# Load prompt template
template = load_prompt_template()

# Render the prompt with actual paths
rendered_prompt = render_prompt(template, test_path, contract_path)
# Render the prompt with actual paths and stale entries
rendered_prompt = render_prompt(template, test_path, contract_path, stale_entries_list)

# Save the rendered prompt
output_file = save_prompt_instance(rendered_prompt, run_id)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

# Build a Slack notification payload from a session log.json.
#
# Usage: <script> <log.json path>
#
# Reads .status, .pull_request_url and .selected_files.test_path from the log
# and prints a JSON object to stdout: {"text": "..."} when a notification is
# warranted, or {} (with a note on stderr) when it is not.

set -eo pipefail

LOG_FILE="${1:-}"

if [ -z "$LOG_FILE" ]; then
  echo "Usage: $0 <log.json path>" >&2
  exit 1
fi

# Fail with a clear message instead of a raw jq error when the log is absent.
if [ ! -r "$LOG_FILE" ]; then
  echo "Error: cannot read log file: $LOG_FILE" >&2
  exit 1
fi

STATUS=$(jq -r '.status // empty' "$LOG_FILE")
PR_URL=$(jq -r '.pull_request_url // empty' "$LOG_FILE")
# Fall back to "unknown" so a missing/null test_path does not make split()
# error out and abort the script under `set -e`.
TEST_FILE=$(jq -r '(.selected_files.test_path // "unknown") | split("/") | .[-1]' "$LOG_FILE")

MESSAGE=""

if [ "$STATUS" = "no_changes_needed" ] && [ -n "$PR_URL" ]; then
  # No changes needed but PR opened to add TOML tracking entry
  MESSAGE=$'<!subteam^S07K486JEH4> AI Contracts Test Maintenance System analyzed '"${TEST_FILE}"$' - no changes needed (test coverage already comprehensive)\n<'"${PR_URL}"$'|View PR to add no-changes tracking>'
elif [ -n "$PR_URL" ]; then
  # Normal case: PR with test improvements
  MESSAGE=$'<!subteam^S07K486JEH4> AI Contracts Test Maintenance System created a PR for '"${TEST_FILE}"$'\n<'"${PR_URL}"$'|View PR> | <https://www.notion.so/oplabs/AI-Contract-Test-Maintenance-System-PR-Reviewer-Guide-288f153ee16280478c0ed1adc5edd9f9|Reviewer Guide>'
elif [ "$STATUS" = "no_changes_needed" ]; then
  # Edge case: no changes and no PR (shouldn't happen with new workflow)
  MESSAGE=$'<!subteam^S07K486JEH4> AI Contracts Test Maintenance System analyzed '"${TEST_FILE}"$' - no changes needed (test coverage already comprehensive)'
fi

if [ -n "$MESSAGE" ]; then
  # jq handles JSON escaping of the message (newlines, quotes, etc.).
  SLACK_JSON=$(jq -n --arg msg "$MESSAGE" '{"text": $msg}')
  echo "$SLACK_JSON"
else
  echo "No notification needed (status: $STATUS)" >&2
  echo '{}'
fi

This file was deleted.

Loading