diff --git a/.claude/skills/backport/SKILL.md b/.claude/skills/backport/SKILL.md new file mode 100644 index 000000000000..80d0666d12db --- /dev/null +++ b/.claude/skills/backport/SKILL.md @@ -0,0 +1,205 @@ +--- +name: backport +description: Backport a merged PR to a release branch, resolving conflicts if needed +argument-hint: <pr-number> <target-branch> +--- + +# Backport PR + +Backport a merged PR to a release branch staging area. Uses the existing +`scripts/backport_to_staging.sh` for the happy path, then resolves conflicts +manually if the diff does not apply cleanly. + +## Usage + +``` +/backport 12345 v4 # release branch +/backport 12345 v4-devnet-2 # devnet branch +``` + +## Workflow + +### Step 1: Validate Arguments + +Confirm exactly two arguments are provided: a PR number and a target branch. + +Supported target branches: +- Release branches: `v2`, `v3`, `v4` +- Devnet branches: `v4-devnet-1`, `v4-devnet-2`, etc. + +**Abort if:** +- Missing arguments -> "Usage: /backport <pr-number> <target-branch>" + +### Step 2: Validate PR State + +```bash +gh pr view <PR_NUMBER> --repo AztecProtocol/aztec-packages --json state,title +``` + +**Abort if:** +- `state` is not `MERGED` -> "PR #<PR_NUMBER> is <state>, only merged PRs can be backported." + +### Step 3: Check if Already Backported + +Check whether this PR has already been backported to the staging branch by +looking for its PR number in the commit log: + +```bash +STAGING_BRANCH="backport-to-${TARGET_BRANCH}-staging" +git fetch origin "$STAGING_BRANCH" 2>/dev/null +if git log "origin/$STAGING_BRANCH" --oneline --grep="(#<PR_NUMBER>)" | grep -q .; then + echo "PR #<PR_NUMBER> has already been backported to $STAGING_BRANCH." +fi +``` + +**Abort if** the PR number appears in the staging branch commit log. Show the +matching commit(s) and tell the user the backport already exists. + +### Step 4: Create Isolated Worktree + +Create a temporary worktree so the backport does not disturb the user's current +branch or working tree. Save the original directory to return to later. 
+ +```bash +ORIGINAL_DIR=$(pwd) +REPO_ROOT=$(git rev-parse --show-toplevel) +WORKTREE_DIR=$(mktemp -d) +git worktree add "$WORKTREE_DIR" HEAD +cd "$WORKTREE_DIR" +``` + +All subsequent steps run inside the worktree. On completion or failure, always +clean up (see Step 10). + +### Step 5: Attempt Automated Backport + +Run the backport script from the worktree: + +```bash +./scripts/backport_to_staging.sh <PR_NUMBER> <TARGET_BRANCH> +``` + +**If the script succeeds:** Skip to Step 10 (cleanup and report). + +**If the script fails:** Continue to Step 6 (conflict resolution). + +### Step 6: Assess Conflicts + +The script will have left the worktree on the `backport-to-<TARGET_BRANCH>-staging` +branch with partially applied changes and `.rej` files for hunks that failed. + +1. **Verify current branch** is `backport-to-<TARGET_BRANCH>-staging` + +2. **Identify the state of the working tree:** + ```bash + git status + ``` + +3. **Find all reject files:** + ```bash + find . -name '*.rej' -not -path './node_modules/*' -not -path './.git/*' + ``` + +4. **Get the full PR diff for reference:** + ```bash + gh pr diff <PR_NUMBER> + ``` + +### Step 7: Resolve Conflicts + +For each `.rej` file: + +1. **Read the reject file** to understand what hunk failed to apply +2. **Read the current version** of the corresponding source file on the staging branch +3. **Understand the intent** of the change from the PR diff context +4. **Apply the change manually** by editing the source file, adapting the change to + the current state of the code on the release branch +5. **Delete the `.rej` file** after resolving + +Also check for files that may need to be created or deleted based on the PR diff +but were not handled by the partial apply. + +**Important considerations:** +- The release branch may have diverged significantly from `next`. Do not assume + the surrounding code is the same as in the original PR. +- When adapting changes, preserve the semantic intent of the PR, not the exact + line-by-line diff. 
+- If a file referenced in the diff does not exist at all on the release branch, + evaluate whether it should be created or if the change is irrelevant. If + irrelevant, skip it and note this in the final report. + +### Step 8: Verify Build + +Check if changes exist in `yarn-project`: +```bash +git diff --name-only | grep '^yarn-project/' || true +``` + +If yarn-project changes exist, run from `yarn-project`: +```bash +yarn build +``` + +Check if changes exist outside `yarn-project`: +```bash +git diff --name-only | grep -v '^yarn-project/' || true +``` + +If changes exist outside yarn-project, run bootstrap from the repo root: +```bash +BOOTSTRAP_TO=yarn-project ./bootstrap.sh +``` + +Fix any build errors that arise from the backport adaptation. + +### Step 9: Finish with Script + +Clean up and let the script handle commit, push, and PR management: + +```bash +find . -name '*.rej' -delete +git add -A +./scripts/backport_to_staging.sh --continue +``` + +### Step 10: Cleanup and Report + +Return to the original directory and remove the temporary worktree: + +```bash +cd "$ORIGINAL_DIR" +git worktree remove "$WORKTREE_DIR" +``` + +**Always clean up the worktree**, even if earlier steps failed. If `git worktree +remove` fails (e.g., uncommitted changes), use `git worktree remove --force`. + +Print a summary: +- PR number and title that was backported +- Target branch and staging branch name +- Whether conflicts were encountered and resolved +- Link to the staging PR (if one was created or already exists) + +## Key Points + +- **Always use a worktree**: All backport work happens in a temporary git worktree + so the user's current branch and working tree are never disturbed. Always clean + up the worktree when done, even on failure. +- **Script first, manual second**: Always try the automated script first. It handles + branch setup, authorship, push, and PR management. Only do manual conflict + resolution if it fails. 
+- **Use `--continue` after resolving**: The script's `--continue` mode picks up where + the initial run left off (commit, push, PR creation, body update). +- **Preserve author attribution**: The script uses `--author` to set the original PR + author on the commit. The committer stays as whoever runs the command (GPG signing + works). +- **Verify builds but skip tests**: Run `yarn build` or bootstrap to confirm the + backport compiles. Do not run the full test suite -- that is CI's job. +- **Semantic, not mechanical**: When resolving conflicts, adapt the change to the + release branch's code state. The goal is the same behavioral change, not an exact + diff match. +- **Clean up `.rej` files**: Always delete `.rej` files before committing. +- **Staging branch convention**: The staging branch is always + `backport-to-{TARGET_BRANCH}-staging` (e.g., `backport-to-v4-staging`, + `backport-to-v4-devnet-2-staging`). Multiple backports accumulate on the same + staging branch and get merged together. 
diff --git a/ci3/ci-metrics/app.py b/ci3/ci-metrics/app.py index c62875e7d19a..e2925d9ae3b0 100644 --- a/ci3/ci-metrics/app.py +++ b/ci3/ci-metrics/app.py @@ -6,6 +6,7 @@ import os import re import redis +import time import threading from pathlib import Path @@ -37,14 +38,30 @@ def verify_password(username, password): def _init(): - """Initialize SQLite and start background threads.""" + """Initialize SQLite, warm caches, and start background threads.""" try: db.get_db() metrics.start_test_listener(r) + metrics.start_phase_listener(r) metrics.start_ci_run_sync(r) + github_data.start_merge_queue_poller() + github_data.start_pr_dirs_worker() print("[ci-metrics] Background threads started") except Exception as e: print(f"[ci-metrics] Warning: startup failed: {e}") + # Warm billing caches so first request isn't slow + try: + from billing.gcp import _ensure_cached as _warm_gcp + _warm_gcp() + print("[ci-metrics] GCP billing cache warmed") + except Exception as e: + print(f"[ci-metrics] GCP billing warmup failed: {e}") + try: + from billing.aws import _ensure_cached as _warm_aws + _warm_aws() + print("[ci-metrics] AWS costs cache warmed") + except Exception as e: + print(f"[ci-metrics] AWS costs warmup failed: {e}") threading.Thread(target=_init, daemon=True, name='metrics-init').start() @@ -101,6 +118,74 @@ def _json(data): return Response(json.dumps(data), mimetype='application/json') +_TEN_DAYS = 10 * 24 * 3600 + + +def _cache_ttl(date_to: str) -> int: + """Return 10-day TTL for historical ranges (date_to < today), else 5 min.""" + try: + if datetime.strptime(date_to, '%Y-%m-%d').date() < datetime.now().date(): + return _TEN_DAYS + except ValueError: + pass + return 300 + + +# ---- Author mapping: git display name → GitHub username ---- + +_author_map = {} +_author_map_ts = 0 + + +def _get_author_map() -> dict: + """Build git display name → GitHub username mapping from ci_runs + pr_authors.""" + global _author_map, _author_map_ts + now = time.time() + if now - 
_author_map_ts < 3600 and _author_map: + return _author_map + rows = db.query(''' + SELECT cr.author as git_name, pa.author as github_user, COUNT(*) as c + FROM ci_runs cr + JOIN pr_authors pa ON cr.pr_number = pa.pr_number + WHERE cr.author IS NOT NULL AND cr.author != '' + AND pa.author IS NOT NULL AND pa.author != '' + GROUP BY cr.author, pa.author + ''') + name_to_gh = {} + for row in rows: + gn = row['git_name'] + gh = row['github_user'] + if gn not in name_to_gh: + name_to_gh[gn] = {} + name_to_gh[gn][gh] = name_to_gh[gn].get(gh, 0) + row['c'] + result = {} + for gn, gh_counts in name_to_gh.items(): + best = max(gh_counts, key=gh_counts.get) + result[gn] = best + result[best] = best # identity mapping for usernames used as commit_author + _author_map = result + _author_map_ts = now + return result + + +def _normalize_authors(authors_str: str) -> str: + """Normalize comma-separated git names to deduplicated GitHub usernames.""" + if not authors_str: + return '' + amap = _get_author_map() + seen = set() + result = [] + for name in authors_str.split(','): + name = name.strip() + if not name: + continue + gh = amap.get(name, name) + if gh not in seen: + seen.add(gh) + result.append(gh) + return ','.join(result) + + # ---- Namespace billing ---- @app.route('/namespace-billing') @@ -166,7 +251,7 @@ def api_ci_runs(): ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) if date_from else None ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) if date_to else None - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) if status_filter: runs = [run for run in runs if run.get('status') == status_filter] @@ -185,7 +270,7 @@ def api_ci_runs(): @auth.login_required def api_ci_stats(): ts_from = int((datetime.now() - timedelta(days=7)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from) + runs = metrics.get_ci_runs(ts_from) total = len(runs) passed = sum(1 for run 
in runs if run.get('status') == 'PASSED') @@ -233,6 +318,7 @@ def api_costs_overview(): buckets[key]['aws_total'] += entry.get('aws_total', 0) buckets[key]['gcp_total'] += entry.get('gcp_total', 0) result['by_date'] = sorted(buckets.values(), key=lambda x: x['date']) + result['period'] = {'from': date_from, 'to': date_to} return _json(result) @@ -287,7 +373,7 @@ def api_costs_attribution(): ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) runs_with_cost = [run for run in runs if run.get('cost_usd') is not None] # Enrich merge queue runs with PR author from GitHub @@ -311,6 +397,9 @@ def api_costs_attribution(): prn = info['pr_number'] if prn and int(prn) in pr_authors: author = pr_authors[int(prn)]['author'] + # Attribute nightly / release runs to a special 'release' actor + if info['type'] in ('nightly', 'releases'): + author = 'release' inst_type = run.get('instance_type', 'unknown') vcpus = run.get('instance_vcpus') @@ -383,14 +472,17 @@ def api_costs_attribution(): instances.sort(key=lambda x: -(x['cost_usd'] or 0)) all_types = sorted(by_type.keys()) + # Pre-compute runs-per-date to avoid O(dates × instances) + runs_per_date = {} + for inst in instances: + runs_per_date[inst['date']] = runs_per_date.get(inst['date'], 0) + 1 by_date_list = [] for date in sorted(by_date_type): - entry = {'date': date, 'total': 0, 'runs': 0} + entry = {'date': date, 'total': 0, 'runs': runs_per_date.get(date, 0)} for rt in all_types: entry[rt] = round(by_date_type[date].get(rt, 0), 2) entry['total'] += by_date_type[date].get(rt, 0) entry['total'] = round(entry['total'], 2) - entry['runs'] = sum(1 for inst in instances if inst['date'] == date) by_date_list.append(entry) by_date_list = _aggregate_dates(by_date_list, granularity, @@ -405,6 +497,7 @@ def 
api_costs_attribution(): 'by_date': by_date_list, 'run_types': all_types, 'instances': instances[:500], + 'period': {'from': date_from, 'to': date_to}, 'totals': {'aws': round(total_aws, 2), 'gcp': round(gcp_total, 2), 'gcp_unattributed': round(gcp_total, 2), 'combined': round(total_aws + gcp_total, 2)}, @@ -421,7 +514,7 @@ def api_costs_runners(): ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) runs_with_cost = [run for run in runs if run.get('cost_usd') is not None] if dashboard: runs_with_cost = [run for run in runs_with_cost if run.get('dashboard') == dashboard] @@ -475,6 +568,7 @@ def api_costs_runners(): 'by_date': by_date, 'by_instance_type': by_instance, 'by_dashboard': by_dashboard, + 'period': {'from': date_from, 'to': date_to}, 'summary': { 'total_cost': round(total_cost, 2), 'spot_pct': round(100.0 * spot_cost / max(total_cost, 0.01), 1), @@ -493,13 +587,18 @@ def api_ci_performance(): date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) dashboard = request.args.get('dashboard', '') granularity = request.args.get('granularity', 'daily') + _ck = f'perf:{date_from}:{date_to}:{dashboard}:{granularity}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) runs = [run for run in runs if run.get('status') in ('PASSED', 'FAILED')] if dashboard: runs = [run for run in runs if run.get('dashboard') == dashboard] + _t1 = time.perf_counter() by_date_map = {} for run in runs: @@ -519,6 +618,7 @@ def api_ci_performance(): by_date = [] for date in 
sorted(by_date_map): d = by_date_map[date] + durs = sorted(d['durations']) by_date.append({ 'date': date, 'total': d['total'], @@ -526,75 +626,107 @@ def api_ci_performance(): 'failed': d['failed'], 'pass_rate': round(100.0 * d['passed'] / max(d['total'], 1), 1), 'failure_rate': round(100.0 * d['failed'] / max(d['total'], 1), 1), - 'avg_duration_mins': round(sum(d['durations']) / len(d['durations']), 1) if d['durations'] else None, + 'avg_duration_mins': round(sum(durs) / len(durs), 1) if durs else None, + 'p50_duration_mins': round(durs[len(durs) // 2], 1) if durs else None, + 'p95_duration_mins': round(durs[int(len(durs) * 0.95)], 1) if durs else None, + 'max_duration_mins': round(max(durs), 1) if durs else None, }) + _t2 = time.perf_counter() + # Merge test outcome counts from test_daily_stats before aggregation + ds_conditions = ['date >= ?', 'date <= ?'] + ds_params = [date_from, date_to] + if dashboard: + ds_conditions.append('dashboard = ?') + ds_params.append(dashboard) + ds_where = 'WHERE ' + ' AND '.join(ds_conditions) + + daily_test_counts = db.query(f''' + SELECT date, SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked + FROM test_daily_stats {ds_where} + GROUP BY date + ''', ds_params) + daily_test_map = {r['date']: r for r in daily_test_counts} + for d in by_date: + tc = daily_test_map.get(d['date'], {}) + d['flake_count'] = tc.get('flaked', 0) or 0 + d['test_failure_count'] = tc.get('failed', 0) or 0 + d['test_success_count'] = tc.get('passed', 0) or 0 + by_date = _aggregate_dates(by_date, granularity, - sum_fields=['total', 'passed', 'failed'], - avg_fields=['avg_duration_mins']) + sum_fields=['total', 'passed', 'failed', + 'flake_count', 'test_failure_count', 'test_success_count'], + avg_fields=['avg_duration_mins', 'p50_duration_mins', + 'p95_duration_mins', 'max_duration_mins']) for d in by_date: d['pass_rate'] = round(100.0 * d['passed'] / max(d['total'], 1), 1) d['failure_rate'] = round(100.0 * d['failed'] / max(d['total'], 1), 
1) - # Daily flake/failure counts from test_events - if dashboard: - flake_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'flaked' AND dashboard = ? - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (dashboard, date_from, date_to + 'T23:59:59')) - fail_test_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'failed' AND dashboard = ? - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (dashboard, date_from, date_to + 'T23:59:59')) - else: - flake_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'flaked' - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (date_from, date_to + 'T23:59:59')) - fail_test_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'failed' - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (date_from, date_to + 'T23:59:59')) - flake_daily_map = {r['date']: r['count'] for r in flake_daily} - fail_test_daily_map = {r['date']: r['count'] for r in fail_test_daily} - for d in by_date: - d['flake_count'] = flake_daily_map.get(d['date'], 0) - d['test_failure_count'] = fail_test_daily_map.get(d['date'], 0) + # Duration by dashboard (pipeline) — from pre-aggregated ci_run_daily_stats + dbd_rows = db.query(''' + SELECT date, dashboard, run_count, passed, failed, + sum_duration, min_duration, max_duration, p50_duration, p95_duration + FROM ci_run_daily_stats + WHERE date >= ? AND date <= ? 
+ ORDER BY date + ''', (date_from, date_to)) + + dbd_map = {} # {dashboard: [{date, avg_duration_mins, ...}]} + for r in dbd_rows: + dbd_map.setdefault(r['dashboard'], []).append({ + 'date': r['date'], + 'avg_duration_mins': round(r['sum_duration'] / max(r['run_count'], 1), 1), + 'total_duration_mins': round(r['sum_duration'], 1), + 'p50_duration_mins': r['p50_duration'], + 'p95_duration_mins': r['p95_duration'], + 'count': r['run_count'], + }) + + duration_by_dashboard = {} + for db_name, entries in dbd_map.items(): + duration_by_dashboard[db_name] = _aggregate_dates( + entries, granularity, + sum_fields=['count', 'total_duration_mins'], + avg_fields=['avg_duration_mins', 'p50_duration_mins', 'p95_duration_mins']) - # Top flakes/failures + _t3 = time.perf_counter() + # Top flakes/failures (with affected authors — filter out empty/NULL) + _author_concat = "GROUP_CONCAT(DISTINCT CASE WHEN commit_author IS NOT NULL AND commit_author != '' THEN commit_author END)" if dashboard: - top_flakes = db.query(''' - SELECT test_cmd, COUNT(*) as count, ref_name + top_flakes = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='flaked' AND dashboard = ? AND timestamp >= ? AND timestamp <= ? - GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (dashboard, date_from, date_to + 'T23:59:59')) - top_failures = db.query(''' - SELECT test_cmd, COUNT(*) as count + top_failures = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='failed' AND dashboard = ? AND timestamp >= ? AND timestamp <= ? 
- GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (dashboard, date_from, date_to + 'T23:59:59')) else: - top_flakes = db.query(''' - SELECT test_cmd, COUNT(*) as count, ref_name + top_flakes = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='flaked' AND timestamp >= ? AND timestamp <= ? - GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (date_from, date_to + 'T23:59:59')) - top_failures = db.query(''' - SELECT test_cmd, COUNT(*) as count + top_failures = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='failed' AND timestamp >= ? AND timestamp <= ? - GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (date_from, date_to + 'T23:59:59')) + # Normalize git display names → GitHub usernames + for row in top_flakes: + row['authors'] = _normalize_authors(row.get('authors', '')) + for row in top_failures: + row['authors'] = _normalize_authors(row.get('authors', '')) + # Summary total = len(runs) passed = sum(1 for run in runs if run.get('status') == 'PASSED') @@ -606,38 +738,24 @@ def api_ci_performance(): if complete and ts: durations.append((complete - ts) / 60000.0) - if dashboard: - flake_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='flaked' AND dashboard = ? - AND timestamp >= ? AND timestamp <= ? - ''', (dashboard, date_from, date_to + 'T23:59:59')) - total_tests = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status IN ('failed','flaked') AND dashboard = ? - AND timestamp >= ? AND timestamp <= ? - ''', (dashboard, date_from, date_to + 'T23:59:59')) - total_failures_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='failed' AND dashboard = ? - AND timestamp >= ? AND timestamp <= ? 
- ''', (dashboard, date_from, date_to + 'T23:59:59')) - else: - flake_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='flaked' AND timestamp >= ? AND timestamp <= ? - ''', (date_from, date_to + 'T23:59:59')) - total_tests = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status IN ('failed','flaked') AND timestamp >= ? AND timestamp <= ? - ''', (date_from, date_to + 'T23:59:59')) - total_failures_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='failed' AND timestamp >= ? AND timestamp <= ? - ''', (date_from, date_to + 'T23:59:59')) - - fc = flake_count[0]['c'] if flake_count else 0 - tc = total_tests[0]['c'] if total_tests else 0 - tfc = total_failures_count[0]['c'] if total_failures_count else 0 - - return _json({ + # Test outcome summary from test_daily_stats + ds_summary = db.query(f''' + SELECT SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked + FROM test_daily_stats {ds_where} + ''', ds_params) + ds_s = ds_summary[0] if ds_summary else {} + fc = ds_s.get('flaked', 0) or 0 + tfc = ds_s.get('failed', 0) or 0 + tpc = ds_s.get('passed', 0) or 0 + tc = fc + tfc + tpc + + _t4 = time.perf_counter() + _result = { 'by_date': by_date, + 'duration_by_dashboard': duration_by_dashboard, 'top_flakes': top_flakes, 'top_failures': top_failures, + 'period': {'from': date_from, 'to': date_to}, 'summary': { 'total_runs': total, 'pass_rate': round(100.0 * passed / max(total, 1), 1), @@ -646,8 +764,12 @@ def api_ci_performance(): 'flake_rate': round(100.0 * fc / max(tc, 1), 1) if tc else 0, 'total_flakes': fc, 'total_test_failures': tfc, + 'total_test_successes': tpc, }, - }) + } + print(f"[perf] ci_performance {date_from}..{date_to} | get_ci_runs={_t1-_t0:.3f}s db_queries={_t2-_t1:.3f}s agg={_t3-_t2:.3f}s top_flakes={_t4-_t3:.3f}s total={_t4-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) # ---- GitHub integration ---- @@ -682,10 +804,19 @@ def 
api_pr_metrics(): date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) author = request.args.get('author', '') + _ck = f'pr_metrics:{date_from}:{date_to}:{author}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - ci_runs = metrics.get_ci_runs(r, ts_from, ts_to) - return _json(github_data.get_pr_metrics(date_from, date_to, author, ci_runs)) + ci_runs = metrics.get_ci_runs(ts_from, ts_to) + _t1 = time.perf_counter() + _result = github_data.get_pr_metrics(date_from, date_to, author, ci_runs) + _t2 = time.perf_counter() + print(f"[perf] pr_metrics {date_from}..{date_to} | get_ci_runs={_t1-_t0:.3f}s get_pr_metrics={_t2-_t1:.3f}s total={_t2-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) @app.route('/api/merge-queue/stats') @@ -693,7 +824,32 @@ def api_pr_metrics(): def api_merge_queue_stats(): date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) - return _json(github_data.get_merge_queue_stats(date_from, date_to)) + _ck = f'mq_stats:{date_from}:{date_to}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() + _result = github_data.get_merge_queue_stats(date_from, date_to) + _t1 = time.perf_counter() + print(f"[perf] merge_queue_stats {date_from}..{date_to} | get_merge_queue_stats={_t1-_t0:.3f}s total={_t1-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) + + +@app.route('/api/test-history/') +@auth.login_required +def api_test_history(test_hash): + """Test event history by hash — SQLite backing for Redis history_ 
lists.""" + branch = request.args.get('branch', '') + limit = min(int(request.args.get('limit', 1000)), 5000) + rows = metrics.get_test_history(test_hash, branch, limit) + return _json(rows) + + +@app.route('/api/ci/runs/pr/') +@auth.login_required +def api_ci_runs_for_pr(pr_number): + limit = min(int(request.args.get('limit', 100)), 500) + return _json(metrics.get_ci_runs_for_pr(pr_number, limit)) @app.route('/api/ci/flakes-by-command') @@ -702,8 +858,38 @@ def api_flakes_by_command(): date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) dashboard = request.args.get('dashboard', '') + _ck = f'flakes:{date_from}:{date_to}:{dashboard}' + _t0 = time.perf_counter() metrics.sync_failed_tests_to_sqlite(r) - return _json(metrics.get_flakes_by_command(date_from, date_to, dashboard)) + _t1 = time.perf_counter() + if cached := db.cache_get(_ck): + return _json(cached) + _result = metrics.get_flakes_by_command(date_from, date_to, dashboard) + _t2 = time.perf_counter() + print(f"[perf] flakes_by_command {date_from}..{date_to} | sync={_t1-_t0:.3f}s get_flakes={_t2-_t1:.3f}s total={_t2-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) + + +# ---- CI Phase timing ---- + +@app.route('/api/ci/phases') +@auth.login_required +def api_ci_phases(): + """CI phase timing breakdown: avg time per phase, by date, and per run.""" + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + dashboard = request.args.get('dashboard', '') + run_id = request.args.get('run_id', '') + _ck = f'phases:{date_from}:{date_to}:{dashboard}:{run_id}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() + _result = metrics.get_phases(date_from, date_to, dashboard, run_id) + _t1 = time.perf_counter() + 
print(f"[perf] ci_phases {date_from}..{date_to} | get_phases={_t1-_t0:.3f}s total={_t1-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) # ---- Test timings ---- @@ -717,96 +903,157 @@ def api_test_timings(): dashboard = request.args.get('dashboard', '') status = request.args.get('status', '') # filter to specific status test_cmd = request.args.get('test_cmd', '') # filter to specific test - - conditions = ['duration_secs IS NOT NULL', 'duration_secs > 0', - 'timestamp >= ?', "timestamp < ? || 'T23:59:59'"] - params = [date_from, date_to] - + _ck = f'timings:{date_from}:{date_to}:{dashboard}:{status}:{test_cmd}' + _ttl = _cache_ttl(date_to) + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() + + # Base WHERE for test_daily_stats + ds_conds = ['date >= ?', 'date <= ?'] + ds_params = [date_from, date_to] if dashboard: - conditions.append('dashboard = ?') - params.append(dashboard) - if status: - conditions.append('status = ?') - params.append(status) + ds_conds.append('dashboard = ?') + ds_params.append(dashboard) if test_cmd: - conditions.append('test_cmd = ?') - params.append(test_cmd) - - where = 'WHERE ' + ' AND '.join(conditions) - - # Per-test stats - by_test = db.query(f''' - SELECT test_cmd, - COUNT(*) as count, - ROUND(AVG(duration_secs), 1) as avg_secs, - ROUND(MIN(duration_secs), 1) as min_secs, - ROUND(MAX(duration_secs), 1) as max_secs, - SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked, - dashboard - FROM test_events {where} - GROUP BY test_cmd - ORDER BY count DESC - LIMIT 200 - ''', params) - - # Add pass rate - for row in by_test: - total = row['passed'] + row['failed'] + row['flaked'] - row['pass_rate'] = round(100.0 * row['passed'] / max(total, 1), 1) - row['total_time_secs'] = round(row['avg_secs'] * row['count'], 0) - - # Daily 
time series (aggregate across all tests or filtered test) - by_date = db.query(f''' - SELECT substr(timestamp, 1, 10) as date, - COUNT(*) as count, - ROUND(AVG(duration_secs), 1) as avg_secs, - ROUND(MAX(duration_secs), 1) as max_secs, - SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked - FROM test_events {where} - GROUP BY substr(timestamp, 1, 10) - ORDER BY date - ''', params) - - # Summary - summary_rows = db.query(f''' - SELECT COUNT(*) as count, - ROUND(AVG(duration_secs), 1) as avg_secs, - ROUND(MAX(duration_secs), 1) as max_secs, - SUM(duration_secs) as total_secs, - SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked - FROM test_events {where} - ''', params) - s = summary_rows[0] if summary_rows else {} - - # Slowest individual test runs + ds_conds.append('test_cmd = ?') + ds_params.append(test_cmd) + ds_where = 'WHERE ' + ' AND '.join(ds_conds) + + if not status: + # Fast path: push GROUP BY into SQL — returns N_tests + N_dates rows, not N_tests*N_dates rows + by_test_rows = db.query(f''' + SELECT test_cmd, MAX(dashboard) as dashboard, + SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked, + SUM(total_secs) as total_secs, SUM(count_timed) as count_timed, + MIN(min_secs) as min_secs, MAX(max_secs) as max_secs + FROM test_daily_stats {ds_where} + GROUP BY test_cmd + ORDER BY SUM(passed)+SUM(failed)+SUM(flaked) DESC LIMIT 500 + ''', ds_params) + _t1 = time.perf_counter() + + by_date_rows = db.query(f''' + SELECT date, + SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked, + SUM(total_secs) as total_secs, SUM(count_timed) as count_timed + FROM test_daily_stats {ds_where} + GROUP BY date ORDER BY date + ''', ds_params) + _t2 = time.perf_counter() + + by_test 
= [] + for t in by_test_rows: + count = (t['passed'] or 0) + (t['failed'] or 0) + (t['flaked'] or 0) + avg_secs = round(t['total_secs'] / t['count_timed'], 1) if t['count_timed'] else None + by_test.append({ + 'test_cmd': t['test_cmd'], 'dashboard': t['dashboard'], 'count': count, + 'passed': t['passed'] or 0, 'failed': t['failed'] or 0, 'flaked': t['flaked'] or 0, + 'pass_rate': round(100.0 * (t['passed'] or 0) / max(count, 1), 1), + 'avg_secs': avg_secs, 'min_secs': t['min_secs'], 'max_secs': t['max_secs'], + 'total_time_secs': round(t['total_secs'] or 0, 0), + }) + + by_date = [] + for d in by_date_rows: + count = (d['passed'] or 0) + (d['failed'] or 0) + (d['flaked'] or 0) + avg_secs = round(d['total_secs'] / d['count_timed'], 1) if d['count_timed'] else None + by_date.append({ + 'date': d['date'], 'passed': d['passed'] or 0, + 'failed': d['failed'] or 0, 'flaked': d['flaked'] or 0, + 'count': count, 'avg_secs': avg_secs, + }) + + total_passed = sum(d['passed'] for d in by_date) + total_failed = sum(d['failed'] for d in by_date) + total_flaked = sum(d['flaked'] for d in by_date) + total_secs_all = sum(d['total_secs'] or 0 for d in by_date_rows) + count_timed_all = sum(d['count_timed'] or 0 for d in by_date_rows) + else: + # Slow fallback: status filter requires scanning test_events + te_conds = ['duration_secs IS NOT NULL', 'duration_secs > 0', + 'timestamp >= ?', "timestamp < ? 
|| 'T23:59:59'"] + te_params = [date_from, date_to] + if dashboard: + te_conds.append('dashboard = ?') + te_params.append(dashboard) + te_conds.append('status = ?') + te_params.append(status) + if test_cmd: + te_conds.append('test_cmd = ?') + te_params.append(test_cmd) + te_where = 'WHERE ' + ' AND '.join(te_conds) + + raw = db.query(f''' + SELECT test_cmd, dashboard, + COUNT(*) as count, + ROUND(AVG(duration_secs),1) as avg_secs, + ROUND(MIN(duration_secs),1) as min_secs, + ROUND(MAX(duration_secs),1) as max_secs, + SUM(duration_secs) as total_secs, + substr(timestamp,1,10) as date + FROM test_events {te_where} + GROUP BY test_cmd + ORDER BY count DESC LIMIT 200 + ''', te_params) + _t1 = time.perf_counter() + by_test = [dict(r, pass_rate=0, passed=0, failed=r['count'] if status=='failed' else 0, + flaked=r['count'] if status=='flaked' else 0, + total_time_secs=round(r['total_secs'] or 0, 0)) for r in raw] + + by_date_raw = db.query(f''' + SELECT substr(timestamp,1,10) as date, COUNT(*) as count + FROM test_events {te_where} + GROUP BY substr(timestamp,1,10) ORDER BY date + ''', te_params) + by_date = [{'date': r['date'], 'count': r['count'], 'passed': 0, + 'failed': r['count'] if status=='failed' else 0, + 'flaked': r['count'] if status=='flaked' else 0} for r in by_date_raw] + + total_passed = 0 + total_failed = sum(r['count'] for r in by_date) if status == 'failed' else 0 + total_flaked = sum(r['count'] for r in by_date) if status == 'flaked' else 0 + total_secs_all = sum(r.get('total_secs') or 0 for r in raw) + count_timed_all = sum(r['count'] for r in raw) + _t2 = time.perf_counter() + + # Slowest individual runs — uses idx_test_events_duration index + sl_conds = ['duration_secs IS NOT NULL', 'duration_secs > 0', + 'timestamp >= ?', "timestamp <= ? 
|| 'T23:59:59'"] + sl_params = [date_from, date_to] + if dashboard: + sl_conds.append('dashboard = ?') + sl_params.append(dashboard) + if test_cmd: + sl_conds.append('test_cmd = ?') + sl_params.append(test_cmd) + sl_where = 'WHERE ' + ' AND '.join(sl_conds) slowest = db.query(f''' SELECT test_cmd, status, duration_secs, dashboard, - substr(timestamp, 1, 10) as date, commit_author, log_url - FROM test_events {where} - ORDER BY duration_secs DESC - LIMIT 50 - ''', params) - - return _json({ + substr(timestamp,1,10) as date, commit_author, log_url + FROM test_events {sl_where} + ORDER BY duration_secs DESC LIMIT 50 + ''', sl_params) + _t3 = time.perf_counter() + + print(f"[perf] test_timings {date_from}..{date_to} | by_test={_t1-_t0:.3f}s by_date={_t2-_t1:.3f}s slowest={_t3-_t2:.3f}s total={_t3-_t0:.3f}s", flush=True) + _result = { 'by_test': by_test, 'by_date': by_date, 'slowest': slowest, + 'period': {'from': date_from, 'to': date_to}, 'summary': { - 'total_runs': s.get('count', 0), - 'avg_duration_secs': s.get('avg_secs'), - 'max_duration_secs': s.get('max_secs'), - 'total_compute_secs': round(s.get('total_secs', 0) or 0, 0), - 'passed': s.get('passed', 0), - 'failed': s.get('failed', 0), - 'flaked': s.get('flaked', 0), + 'total_runs': total_passed + total_failed + total_flaked, + 'avg_duration_secs': round(total_secs_all / count_timed_all, 1) if count_timed_all > 0 else None, + 'max_duration_secs': slowest[0]['duration_secs'] if slowest else None, + 'total_compute_secs': round(total_secs_all, 0), + 'passed': total_passed, + 'failed': total_failed, + 'flaked': total_flaked, }, - }) + } + db.cache_set(_ck, _result, _ttl) + return _json(_result) # ---- Dashboard views ---- @@ -844,5 +1091,59 @@ def test_timings(): return "Dashboard not found", 404 +@app.route('/ci-health-report') +@auth.login_required +def ci_health_report(): + path = Path(__file__).parent / 'views' / 'ci-health-report.html' + if path.exists(): + return path.read_text() + return "Report not found", 
404 + + +@app.route('/commits') +@auth.login_required +def commits_page(): + path = Path(__file__).parent / 'views' / 'commits.html' + return path.read_text() + + +@app.route('/api/commits') +@auth.login_required +def api_commits(): + branch = request.args.get('branch', 'next') + page = max(1, int(request.args.get('page', 1))) + per_page = min(int(request.args.get('per_page', 50)), 100) + return _json(github_data.get_recent_commits(branch, page, per_page)) + + +@app.route('/flake-prs') +@auth.login_required +def flake_prs(): + path = Path(__file__).parent / 'views' / 'flake-prs.html' + if path.exists(): + return path.read_text() + return "Page not found", 404 + + +@app.route('/api/flake-prs') +@auth.login_required +def api_flake_prs(): + rows = db.query(''' + SELECT pa.pr_number, pa.author, pa.title, pa.branch, + pa.additions, pa.deletions, pa.fetched_at, + MIN(cr.timestamp_ms) as first_seen_ms + FROM pr_authors pa + LEFT JOIN ci_runs cr ON cr.pr_number = pa.pr_number + WHERE ( + pa.title LIKE '%flake%' OR pa.title LIKE '%deflake%' + OR pa.branch LIKE '%flake%' OR pa.branch LIKE '%deflake%' + ) + GROUP BY pa.pr_number + ORDER BY pa.pr_number DESC + LIMIT 200 + ''') + return _json([dict(r) for r in rows]) + + if __name__ == '__main__': app.run(host='0.0.0.0', port=8081) diff --git a/ci3/ci-metrics/billing/aws.py b/ci3/ci-metrics/billing/aws.py index 481393d74ec3..4dc9061b37df 100644 --- a/ci3/ci-metrics/billing/aws.py +++ b/ci3/ci-metrics/billing/aws.py @@ -54,6 +54,8 @@ # Messaging 'Amazon Simple Notification Service': 'sns', 'Amazon Simple Queue Service': 'sqs', + # Savings Plans / Reserved Instances + 'Savings Plans for AWS Compute usage': 'savings_plans', # Other 'Tax': 'tax', 'AWS Support (Business)': 'support', @@ -63,6 +65,16 @@ import re +# One-time contract payments: annual Savings Plan upfronts and monthly Reserved Instance charges. +# These appear as large single-day spikes but are not operational spend. 
+_ONE_TIME_CATEGORIES = frozenset({ + 'savings_plan_1yr_annual', + 'savings_plan_3yr_annual', + 'savings_plan_1yr_annual_partial', + 'savings_plan_3yr_annual_partial', + 'reserved_instance_monthly', +}) + _cache = {'rows': [], 'ts': 0} _cache_lock = threading.Lock() _detail_cache = {'rows': [], 'ts': 0} @@ -152,7 +164,10 @@ def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]: TimePeriod={'Start': date_from, 'End': date_to}, Granularity='DAILY', Metrics=['UnblendedCost'], - GroupBy=[{'Type': 'DIMENSION', 'Key': 'SERVICE'}], + GroupBy=[ + {'Type': 'DIMENSION', 'Key': 'SERVICE'}, + {'Type': 'DIMENSION', 'Key': 'USAGE_TYPE'}, + ], ) if next_token: kwargs['NextPageToken'] = next_token @@ -163,12 +178,26 @@ def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]: date = result['TimePeriod']['Start'] for group in result['Groups']: service = group['Keys'][0] + usage_type = group['Keys'][1] if len(group['Keys']) > 1 else '' amount = float(group['Metrics']['UnblendedCost']['Amount']) if amount == 0: continue category = SERVICE_CATEGORY_MAP.get(service, 'other') + # Savings plans: ComputeSP:1yrAllUpfront, ComputeSP:3yrNoUpfront, etc. 
+ if category == 'savings_plans': + m = re.match(r'ComputeSP:(\d+yr)(\w+)', usage_type) + if m: + term = m.group(1) + payment = m.group(2) + if payment == 'NoUpfront': + category = f'savings_plan_{term}_monthly' + elif 'Upfront' in payment: + category = f'savings_plan_{term}_annual' + # EC2 reserved instances: HeavyUsage: billed monthly on 1st + elif category == 'ec2' and 'HeavyUsage:' in usage_type: + category = 'reserved_instance_monthly' if category == 'other': - print(f"[rk_aws_costs] unmapped service: {service!r} (${amount:.2f})") + print(f"[rk_aws_costs] unmapped service: {service!r} / {usage_type!r} (${amount:.2f})") rows.append({ 'date': date, 'service': service, @@ -322,26 +351,32 @@ def get_costs_overview(date_from: str, date_to: str) -> dict: for r in aws_rows: d = r['date'] if d not in by_date: - by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0} + by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0, 'aws_one_time': 0} cat = r['category'] by_date[d]['aws'][cat] = by_date[d]['aws'].get(cat, 0) + r['amount_usd'] by_date[d]['aws_total'] += r['amount_usd'] + if cat in _ONE_TIME_CATEGORIES: + by_date[d]['aws_one_time'] += r['amount_usd'] for d, cats in gcp_by_date.items(): if d not in by_date: - by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0} + by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0, 'aws_one_time': 0} by_date[d]['gcp'] = cats by_date[d]['gcp_total'] = sum(cats.values()) sorted_dates = sorted(by_date.values(), key=lambda x: x['date']) aws_total = sum(d['aws_total'] for d in sorted_dates) + aws_one_time = sum(d['aws_one_time'] for d in sorted_dates) gcp_total = sum(d['gcp_total'] for d in sorted_dates) return { 'by_date': sorted_dates, 'totals': { 'aws': round(aws_total, 2), + 'aws_operational': round(aws_total - aws_one_time, 2), + 'aws_one_time': round(aws_one_time, 2), 'gcp': round(gcp_total, 2), 'combined': round(aws_total + 
gcp_total, 2), + 'combined_operational': round(aws_total - aws_one_time + gcp_total, 2), } } diff --git a/ci3/ci-metrics/db.py b/ci3/ci-metrics/db.py index 93e970fe3a56..e19380902825 100644 --- a/ci3/ci-metrics/db.py +++ b/ci3/ci-metrics/db.py @@ -3,11 +3,14 @@ Stores test events (from Redis pub/sub) and merge queue daily stats (backfilled from GitHub API). """ +import json import os import sqlite3 import threading +import time -_DB_PATH = os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db') +_DB_PATH = os.getenv('METRICS_DB_PATH', + os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db')) _local = threading.local() SCHEMA = """ @@ -34,6 +37,7 @@ CREATE INDEX IF NOT EXISTS idx_test_events_ts ON test_events(timestamp); CREATE INDEX IF NOT EXISTS idx_test_events_cmd ON test_events(test_cmd); CREATE INDEX IF NOT EXISTS idx_test_events_dashboard ON test_events(dashboard); +CREATE INDEX IF NOT EXISTS idx_test_events_status_ts ON test_events(status, timestamp); CREATE TABLE IF NOT EXISTS merge_queue_daily ( date TEXT PRIMARY KEY, @@ -64,6 +68,84 @@ CREATE INDEX IF NOT EXISTS idx_ci_runs_ts ON ci_runs(timestamp_ms); CREATE INDEX IF NOT EXISTS idx_ci_runs_name ON ci_runs(name); CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard); + +CREATE TABLE IF NOT EXISTS test_daily_stats ( + date TEXT NOT NULL, + test_cmd TEXT NOT NULL, + dashboard TEXT NOT NULL DEFAULT '', + passed INTEGER NOT NULL DEFAULT 0, + failed INTEGER NOT NULL DEFAULT 0, + flaked INTEGER NOT NULL DEFAULT 0, + total_secs REAL NOT NULL DEFAULT 0, + count_timed INTEGER NOT NULL DEFAULT 0, + min_secs REAL, + max_secs REAL, + PRIMARY KEY (date, test_cmd, dashboard) +); +CREATE INDEX IF NOT EXISTS idx_tds_date ON test_daily_stats(date); +CREATE INDEX IF NOT EXISTS idx_tds_dashboard ON test_daily_stats(dashboard); + +CREATE TABLE IF NOT EXISTS merge_queue_snapshots ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL, + depth INTEGER NOT NULL, + 
entries_json TEXT +); +CREATE INDEX IF NOT EXISTS idx_mqs_ts ON merge_queue_snapshots(timestamp); + +CREATE TABLE IF NOT EXISTS ci_run_daily_stats ( + date TEXT NOT NULL, + dashboard TEXT NOT NULL, + run_count INTEGER NOT NULL DEFAULT 0, + passed INTEGER NOT NULL DEFAULT 0, + failed INTEGER NOT NULL DEFAULT 0, + sum_duration REAL NOT NULL DEFAULT 0, + min_duration REAL, + max_duration REAL, + p50_duration REAL, + p95_duration REAL, + PRIMARY KEY (date, dashboard) +); +CREATE INDEX IF NOT EXISTS idx_crds_date ON ci_run_daily_stats(date); + +CREATE TABLE IF NOT EXISTS ci_phases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + phase TEXT NOT NULL, + duration_secs REAL NOT NULL, + exit_code INTEGER, + run_id TEXT, + job_id TEXT, + dashboard TEXT NOT NULL DEFAULT '', + ref_name TEXT, + commit_hash TEXT, + timestamp TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_ci_phases_run ON ci_phases(run_id); +CREATE INDEX IF NOT EXISTS idx_ci_phases_ts ON ci_phases(timestamp); +CREATE INDEX IF NOT EXISTS idx_ci_phases_phase ON ci_phases(phase); + +CREATE TABLE IF NOT EXISTS pr_authors ( + pr_number INTEGER PRIMARY KEY, + author TEXT NOT NULL, + title TEXT NOT NULL DEFAULT '', + branch TEXT NOT NULL DEFAULT '', + additions INTEGER DEFAULT 0, + deletions INTEGER DEFAULT 0, + fetched_at TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS api_cache ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + created_at REAL NOT NULL, + ttl_secs INTEGER NOT NULL DEFAULT 300 +); + +CREATE TABLE IF NOT EXISTS pr_cache ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at REAL NOT NULL +); """ @@ -73,6 +155,16 @@ "ALTER TABLE ci_runs ADD COLUMN job_id TEXT DEFAULT ''", "ALTER TABLE ci_runs ADD COLUMN arch TEXT DEFAULT ''", "CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard)", + "ALTER TABLE test_events ADD COLUMN test_hash TEXT", + "CREATE INDEX IF NOT EXISTS idx_test_events_hash ON test_events(test_hash)", + "ALTER TABLE merge_queue_daily ADD COLUMN avg_depth REAL", + "ALTER 
TABLE merge_queue_daily ADD COLUMN peak_depth INTEGER", + "CREATE INDEX IF NOT EXISTS idx_test_events_duration_ts ON test_events(timestamp) WHERE duration_secs IS NOT NULL AND duration_secs > 0", + "ALTER TABLE test_daily_stats ADD COLUMN total_secs REAL NOT NULL DEFAULT 0", + "ALTER TABLE test_daily_stats ADD COLUMN count_timed INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE test_daily_stats ADD COLUMN min_secs REAL", + "ALTER TABLE test_daily_stats ADD COLUMN max_secs REAL", + "CREATE INDEX IF NOT EXISTS idx_test_events_duration ON test_events(duration_secs DESC) WHERE duration_secs IS NOT NULL AND duration_secs > 0", ] @@ -105,3 +197,31 @@ def execute(sql: str, params=()): conn = get_db() conn.execute(sql, params) conn.commit() + + +def cache_get(key: str): + """Return cached value (parsed JSON) if not expired, else None.""" + rows = query('SELECT value, created_at, ttl_secs FROM api_cache WHERE key = ?', (key,)) + if rows and time.time() - rows[0]['created_at'] < rows[0]['ttl_secs']: + return json.loads(rows[0]['value']) + return None + + +def cache_set(key: str, data, ttl_secs: int = 300) -> None: + """Store data as JSON in the cache with a TTL.""" + execute( + 'INSERT OR REPLACE INTO api_cache (key, value, created_at, ttl_secs) VALUES (?, ?, ?, ?)', + (key, json.dumps(data, default=str), time.time(), ttl_secs), + ) + + +def cache_invalidate_prefix(prefix: str) -> None: + """Delete all cache entries whose key starts with prefix.""" + execute('DELETE FROM api_cache WHERE key LIKE ?', (prefix + '%',)) + + +def cache_cleanup() -> None: + """Remove expired entries.""" + execute( + "DELETE FROM api_cache WHERE created_at + ttl_secs < unixepoch('now')" + ) diff --git a/ci3/ci-metrics/ec2_pricing.py b/ci3/ci-metrics/ec2_pricing.py index ace55ea4f40a..96e0561d0d70 100644 --- a/ci3/ci-metrics/ec2_pricing.py +++ b/ci3/ci-metrics/ec2_pricing.py @@ -16,12 +16,20 @@ # ---- Hardcoded fallback rates (us-east-2, USD/hr) ---- _HARDCODED_RATES = { - ('m6a.48xlarge', True): 8.31, # 
spot - ('m6a.48xlarge', False): 16.56, # on-demand - ('m6a.32xlarge', True): 5.54, - ('m6a.32xlarge', False): 11.04, + ('m6a.xlarge', True): 0.07, # spot + ('m6a.xlarge', False): 0.1728, # on-demand + ('m6a.4xlarge', True): 0.28, + ('m6a.4xlarge', False): 0.6912, + ('m6a.8xlarge', True): 0.55, + ('m6a.8xlarge', False): 1.3824, ('m6a.16xlarge', True): 2.77, ('m6a.16xlarge', False): 5.52, + ('m6a.24xlarge', True): 1.66, + ('m6a.24xlarge', False): 4.1472, + ('m6a.32xlarge', True): 5.54, + ('m6a.32xlarge', False): 11.04, + ('m6a.48xlarge', True): 8.31, + ('m6a.48xlarge', False): 16.56, ('m7a.48xlarge', True): 8.31, ('m7a.48xlarge', False): 16.56, ('m7a.16xlarge', True): 2.77, @@ -145,8 +153,19 @@ def _fetch_all_spot(instance_types: list[str]) -> dict[str, float]: # ---- Cache refresh ---- def _get_known_instance_types() -> list[str]: - """Return the set of instance types we need pricing for.""" - return sorted({itype for itype, _ in _HARDCODED_RATES}) + """Return the set of instance types we need pricing for (hardcoded + from DB).""" + types = {itype for itype, _ in _HARDCODED_RATES} + try: + import db + conn = db.get_db() + rows = conn.execute( + "SELECT DISTINCT instance_type FROM ci_runs " + "WHERE instance_type IS NOT NULL AND instance_type != '' AND instance_type != 'unknown'" + ).fetchall() + types.update(r['instance_type'] for r in rows) + except Exception: + pass + return sorted(types) def _refresh_cache(): diff --git a/ci3/ci-metrics/github_data.py b/ci3/ci-metrics/github_data.py index 8824d187cb81..9c36a708025d 100644 --- a/ci3/ci-metrics/github_data.py +++ b/ci3/ci-metrics/github_data.py @@ -1,15 +1,20 @@ """GitHub API polling with in-memory cache. -Fetches PR lifecycle, deployment runs, branch lag, and merge queue stats via `gh` CLI. +Fetches PR lifecycle, deployment runs, branch lag, and merge queue stats via +the GitHub REST API (using requests + GH_TOKEN env var). Most data cached in memory with TTL. Merge queue stats persisted to SQLite daily. 
""" import json -import subprocess +import os +import requests import threading import time from datetime import datetime, timedelta, timezone +import db as _db + REPO = 'AztecProtocol/aztec-packages' +_GH_API = 'https://api.github.com' BRANCH_PAIRS = [ ('next', 'staging-public'), @@ -25,41 +30,126 @@ _CACHE_TTL = 3600 # 1 hour _pr_cache = {'data': [], 'ts': 0} +_commits_cache: dict = {} # keyed by branch +_commits_lock = threading.Lock() _deploy_cache = {'data': [], 'ts': 0} _lag_cache = {'data': [], 'ts': 0} -_pr_author_cache = {} # {pr_number: {'author': str, 'title': str, 'branch': str}} _pr_lock = threading.Lock() _deploy_lock = threading.Lock() _lag_lock = threading.Lock() -def _gh(args: list[str]) -> str | None: +def _gh_headers() -> dict: + token = os.environ.get('GH_TOKEN') or os.environ.get('GITHUB_TOKEN', '') + h = {'Accept': 'application/vnd.github+json', 'X-GitHub-Api-Version': '2022-11-28'} + if token: + h['Authorization'] = f'Bearer {token}' + return h + + +def _github_get(path: str, paginate: bool = False) -> list | dict | None: + """GET from GitHub REST API. Returns parsed JSON (list or dict). 
+ If paginate=True, follows Link: next headers and merges array results.""" + url = f'{_GH_API}/{path}' if not path.startswith('http') else path + headers = _gh_headers() try: - result = subprocess.run( - ['gh'] + args, - capture_output=True, text=True, timeout=30 - ) - if result.returncode == 0: - return result.stdout.strip() - except (FileNotFoundError, subprocess.TimeoutExpired) as e: - print(f"[rk_github] gh error: {e}") - return None + if not paginate: + resp = requests.get(url, headers=headers, timeout=30) + if resp.status_code != 200: + print(f"[rk_github] API {resp.status_code}: {url}") + return None + return resp.json() + # Paginated: collect all pages + all_items = [] + while url: + resp = requests.get(url, headers=headers, timeout=30) + if resp.status_code != 200: + print(f"[rk_github] API {resp.status_code}: {url}") + break + data = resp.json() + if isinstance(data, list): + all_items.extend(data) + elif isinstance(data, dict): + # For endpoints like /actions/workflows/.../runs that wrap in an object + all_items.append(data) + # Follow Link: ; rel="next" + link = resp.headers.get('Link', '') + url = None + for part in link.split(','): + if 'rel="next"' in part: + url = part.split('<')[1].split('>')[0] + return all_items + except Exception as e: + print(f"[rk_github] API error: {e}") + return None + + +def _github_graphql(query: str, variables: dict = None) -> dict | None: + """Execute a GitHub GraphQL query.""" + headers = _gh_headers() + try: + resp = requests.post(f'{_GH_API}/graphql', headers=headers, + json={'query': query, 'variables': variables or {}}, + timeout=30) + if resp.status_code != 200: + print(f"[rk_github] GraphQL {resp.status_code}") + return None + data = resp.json() + if 'errors' in data: + print(f"[rk_github] GraphQL errors: {data['errors']}") + return data.get('data') + except Exception as e: + print(f"[rk_github] GraphQL error: {e}") + return None # ---- PR lifecycle ---- +_PR_GQL = ''' +query($owner: String!, $repo: String!, 
$cursor: String) { + repository(owner: $owner, name: $repo) { + pullRequests(states: MERGED, first: 100, after: $cursor, orderBy: {field: UPDATED_AT, direction: DESC}) { + pageInfo { hasNextPage endCursor } + nodes { + number + author { login } + title + createdAt + mergedAt + closedAt + baseRefName + headRefName + additions + deletions + changedFiles + isDraft + reviewDecision + labels(first: 20) { nodes { name } } + } + } + } +}''' + + def _fetch_and_process_prs() -> list[dict]: - out = _gh([ - 'pr', 'list', '--repo', REPO, '--state', 'merged', - '--limit', '500', - '--json', 'number,author,title,createdAt,mergedAt,closedAt,baseRefName,' - 'headRefName,additions,deletions,changedFiles,isDraft,reviewDecision,labels' - ]) - if not out: - return [] - try: - prs = json.loads(out) - except json.JSONDecodeError: + owner, repo = REPO.split('/') + prs = [] + cursor = None + for _ in range(5): # max 5 pages = 500 PRs + data = _github_graphql(_PR_GQL, {'owner': owner, 'repo': repo, 'cursor': cursor}) + if not data: + break + pr_data = data.get('repository', {}).get('pullRequests', {}) + nodes = pr_data.get('nodes', []) + for node in nodes: + node['author'] = (node.get('author') or {}).get('login', 'unknown') + node['labels'] = [l['name'] for l in (node.get('labels') or {}).get('nodes', [])] + prs.extend(nodes) + page_info = pr_data.get('pageInfo', {}) + if not page_info.get('hasNextPage'): + break + cursor = page_info.get('endCursor') + if not prs: return [] for pr in prs: @@ -87,9 +177,20 @@ def _fetch_and_process_prs() -> list[dict]: def _ensure_prs(): + import db as _db now = time.time() if _pr_cache['data'] and now - _pr_cache['ts'] < _CACHE_TTL: return + # Try SQLite cache before hitting GitHub + if not _pr_cache['data']: + try: + rows = _db.query("SELECT value, updated_at FROM pr_cache WHERE key = 'prs'") + if rows and now - rows[0]['updated_at'] < _CACHE_TTL: + _pr_cache['data'] = json.loads(rows[0]['value']) + _pr_cache['ts'] = rows[0]['updated_at'] + return + 
except Exception: + pass if not _pr_lock.acquire(blocking=False): return try: @@ -97,6 +198,13 @@ def _ensure_prs(): if prs: _pr_cache['data'] = prs _pr_cache['ts'] = now + try: + _db.execute( + "INSERT OR REPLACE INTO pr_cache (key, value, updated_at) VALUES ('prs', ?, ?)", + (json.dumps(prs, default=str), now), + ) + except Exception: + pass finally: _pr_lock.release() @@ -106,20 +214,14 @@ def _ensure_prs(): def _fetch_all_deploys() -> list[dict]: all_runs = [] for workflow in DEPLOY_WORKFLOWS: - out = _gh([ - 'run', 'list', '--repo', REPO, - '--workflow', workflow, '--limit', '50', - '--json', 'databaseId,status,conclusion,createdAt,updatedAt,headBranch,name' - ]) - if not out: - continue - try: - runs = json.loads(out) - except json.JSONDecodeError: + data = _github_get( + f'repos/{REPO}/actions/workflows/{workflow}/runs?per_page=50&status=completed') + if not data: continue + runs = data.get('workflow_runs', []) for run in runs: - started = run.get('createdAt', '') - completed = run.get('updatedAt') + started = run.get('created_at', '') + completed = run.get('updated_at') duration = None if started and completed: try: @@ -129,9 +231,9 @@ def _fetch_all_deploys() -> list[dict]: except (ValueError, TypeError): pass all_runs.append({ - 'run_id': str(run.get('databaseId', '')), + 'run_id': str(run.get('id', '')), 'workflow_name': workflow.replace('.yml', ''), - 'ref_name': run.get('headBranch', ''), + 'ref_name': run.get('head_branch', ''), 'status': run.get('conclusion', run.get('status', 'unknown')), 'started_at': started, 'completed_at': completed, @@ -162,26 +264,22 @@ def _fetch_branch_lag() -> list[dict]: results = [] today = datetime.now(timezone.utc).date().isoformat() for source, target in BRANCH_PAIRS: - out = _gh([ - 'api', f'repos/{REPO}/compare/{target}...{source}', - '--jq', '.ahead_by' - ]) - if not out: + data = _github_get(f'repos/{REPO}/compare/{target}...{source}') + if not data: continue try: - commits_behind = int(out) + commits_behind = 
int(data.get('ahead_by', 0)) except (ValueError, TypeError): continue days_behind = None - out2 = _gh([ - 'api', f'repos/{REPO}/compare/{target}...{source}', - '--jq', '.commits[0].commit.committer.date' - ]) - if out2: + commits = data.get('commits', []) + if commits: try: - oldest = datetime.fromisoformat(out2.replace('Z', '+00:00')) - days_behind = round((datetime.now(timezone.utc) - oldest).total_seconds() / 86400, 1) + oldest_date = commits[0].get('commit', {}).get('committer', {}).get('date', '') + if oldest_date: + oldest = datetime.fromisoformat(oldest_date.replace('Z', '+00:00')) + days_behind = round((datetime.now(timezone.utc) - oldest).total_seconds() / 86400, 1) except (ValueError, TypeError): pass @@ -291,71 +389,106 @@ def get_branch_lag(date_from: str, date_to: str) -> dict: return {'pairs': pairs} +def _cache_pr_author(pr_number: int, info: dict): + """Write PR author info to SQLite cache.""" + _db.execute(''' + INSERT OR REPLACE INTO pr_authors (pr_number, author, title, branch, additions, deletions, fetched_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (pr_number, info['author'], info.get('title', ''), info.get('branch', ''), + info.get('additions', 0), info.get('deletions', 0), + datetime.now(timezone.utc).isoformat())) + + +def _get_cached_pr_author(pr_number: int) -> dict | None: + """Read PR author info from SQLite cache.""" + rows = _db.query('SELECT * FROM pr_authors WHERE pr_number = ?', (pr_number,)) + if rows: + r = rows[0] + return {'author': r['author'], 'title': r['title'], 'branch': r['branch'], + 'additions': r['additions'], 'deletions': r['deletions']} + return None + + def get_pr_author(pr_number) -> dict | None: - """Look up PR author/title by number. Results are cached permanently (PR data doesn't change).""" + """Look up PR author/title by number. 
Results cached in SQLite.""" pr_number = int(pr_number) if pr_number else None if not pr_number: return None - if pr_number in _pr_author_cache: - return _pr_author_cache[pr_number] - # Check merged PR cache first (already fetched) + # Check SQLite cache + cached = _get_cached_pr_author(pr_number) + if cached: + return cached + + # Check merged PR cache (already fetched in-memory) for pr in _pr_cache.get('data', []): if pr.get('number') == pr_number: info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), 'branch': pr.get('headRefName', ''), 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} - _pr_author_cache[pr_number] = info + _cache_pr_author(pr_number, info) return info - # Fetch from GitHub API - out = _gh(['pr', 'view', str(pr_number), '--repo', REPO, - '--json', 'author,title,headRefName,additions,deletions']) - if out: + # Fetch from GitHub REST API + data = _github_get(f'repos/{REPO}/pulls/{pr_number}') + if data: try: - data = json.loads(out) - author = data.get('author', {}) - if isinstance(author, dict): - author = author.get('login', 'unknown') + author = (data.get('user') or {}).get('login', 'unknown') info = {'author': author, 'title': data.get('title', ''), - 'branch': data.get('headRefName', ''), + 'branch': (data.get('head') or {}).get('ref', ''), 'additions': data.get('additions', 0), 'deletions': data.get('deletions', 0)} - _pr_author_cache[pr_number] = info + _cache_pr_author(pr_number, info) return info - except (json.JSONDecodeError, KeyError): + except (KeyError, TypeError): pass return None def batch_get_pr_authors(pr_numbers: set) -> dict: - """Fetch authors for multiple PR numbers, using cache. Returns {pr_number: info}.""" + """Fetch authors for multiple PR numbers, using SQLite cache. 
Returns {pr_number: info}.""" result = {} - to_fetch = [] - for prn in pr_numbers: - if not prn: - continue - prn = int(prn) - if prn in _pr_author_cache: - result[prn] = _pr_author_cache[prn] - else: - to_fetch.append(prn) - - # Check merged PR cache first - for pr in _pr_cache.get('data', []): - num = pr.get('number') - if num in to_fetch: - info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), - 'branch': pr.get('headRefName', ''), - 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} - _pr_author_cache[num] = info - result[num] = info - to_fetch.remove(num) - - # Fetch remaining individually (with a cap to avoid API abuse) - for prn in to_fetch[:50]: - info = get_pr_author(prn) - if info: - result[prn] = info + # Batch fetch from SQLite cache in a single query + clean = [int(prn) for prn in pr_numbers if prn] + if not clean: + return result + placeholders = ','.join('?' * len(clean)) + cached_rows = _db.query( + f'SELECT * FROM pr_authors WHERE pr_number IN ({placeholders})', clean) + cached_set = set() + for r in cached_rows: + prn = r['pr_number'] + result[prn] = {'author': r['author'], 'title': r['title'], 'branch': r['branch'], + 'additions': r['additions'], 'deletions': r['deletions']} + cached_set.add(prn) + to_fetch = [prn for prn in clean if prn not in cached_set] + + # Check merged PR cache (in-memory) + if to_fetch: + to_fetch_set = set(to_fetch) + for pr in _pr_cache.get('data', []): + num = pr.get('number') + if num in to_fetch_set: + info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), + 'branch': pr.get('headRefName', ''), + 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} + _cache_pr_author(num, info) + result[num] = info + to_fetch_set.discard(num) + to_fetch = list(to_fetch_set) + + # Fetch remaining concurrently (with a cap to avoid API abuse) + if to_fetch: + from concurrent.futures import ThreadPoolExecutor, as_completed + with 
ThreadPoolExecutor(max_workers=10) as pool: + futures = {pool.submit(get_pr_author, prn): prn for prn in to_fetch[:50]} + for fut in as_completed(futures): + prn = futures[fut] + try: + info = fut.result() + if info: + result[prn] = info + except Exception: + pass return result @@ -495,33 +628,29 @@ def _median(vals): def _fetch_merge_queue_runs(date_str: str) -> dict: """Fetch merge_group workflow runs for a single date. Returns daily summary.""" - out = _gh([ - 'api', '--paginate', + pages = _github_get( f'repos/{REPO}/actions/workflows/{CI3_WORKFLOW}/runs' f'?event=merge_group&created={date_str}&per_page=100', - '--jq', '.workflow_runs[] | [.conclusion, .status] | @tsv', - ]) + paginate=True) summary = {'date': date_str, 'total': 0, 'success': 0, 'failure': 0, 'cancelled': 0, 'in_progress': 0} - if not out: + if not pages: return summary - for line in out.strip().split('\n'): - if not line.strip(): - continue - parts = line.split('\t') - conclusion = parts[0] if parts[0] else '' - status = parts[1] if len(parts) > 1 else '' - summary['total'] += 1 - if conclusion == 'success': - summary['success'] += 1 - elif conclusion == 'failure': - summary['failure'] += 1 - elif conclusion == 'cancelled': - summary['cancelled'] += 1 - elif status in ('in_progress', 'queued', 'waiting'): - summary['in_progress'] += 1 - else: - summary['failure'] += 1 # treat unknown conclusions as failures + for page in pages: + for run in (page.get('workflow_runs') or []) if isinstance(page, dict) else []: + conclusion = run.get('conclusion') or '' + status = run.get('status') or '' + summary['total'] += 1 + if conclusion == 'success': + summary['success'] += 1 + elif conclusion == 'failure': + summary['failure'] += 1 + elif conclusion == 'cancelled': + summary['cancelled'] += 1 + elif status in ('in_progress', 'queued', 'waiting'): + summary['in_progress'] += 1 + else: + summary['failure'] += 1 # treat unknown conclusions as failures return summary @@ -597,13 +726,14 @@ def 
_backfill_merge_queue(): def refresh_merge_queue_today(): - """Refresh today's (and yesterday's) merge queue stats. Called periodically.""" + """Refresh recent merge queue stats. Re-fetches the last 7 days to fix any + zero rows written during transient API failures.""" import db conn = db.get_db() - today = datetime.now(timezone.utc).date().isoformat() - yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date().isoformat() + today = datetime.now(timezone.utc).date() - for ds in [yesterday, today]: + for i in range(7): + ds = (today - timedelta(days=i)).isoformat() summary = _fetch_merge_queue_runs(ds) conn.execute( 'INSERT OR REPLACE INTO merge_queue_daily (date, total, success, failure, cancelled, in_progress) ' @@ -613,6 +743,80 @@ def refresh_merge_queue_today(): conn.commit() +_MQ_DEPTH_GQL = ''' +query($owner: String!, $repo: String!, $branch: String!) { + repository(owner: $owner, name: $repo) { + mergeQueue(branch: $branch) { + entries(first: 100) { + totalCount + nodes { position state enqueuedAt pullRequest { number title author { login } } } + } + } + } +}''' + +_MQ_BRANCH = 'next' + + +def poll_merge_queue_depth(): + """Snapshot the current merge queue depth into SQLite.""" + import db + owner, repo = REPO.split('/') + data = _github_graphql(_MQ_DEPTH_GQL, + {'owner': owner, 'repo': repo, 'branch': _MQ_BRANCH}) + if not data: + return + mq = (data.get('repository') or {}).get('mergeQueue') + if mq is None: + return + entries = mq.get('entries', {}) + depth = entries.get('totalCount', 0) + nodes = entries.get('nodes', []) + entries_json = json.dumps([{ + 'position': n.get('position'), + 'state': n.get('state'), + 'pr': (n.get('pullRequest') or {}).get('number'), + 'author': ((n.get('pullRequest') or {}).get('author') or {}).get('login'), + } for n in nodes]) if nodes else None + + now = datetime.now(timezone.utc).isoformat() + db.execute('INSERT INTO merge_queue_snapshots (timestamp, depth, entries_json) VALUES (?, ?, ?)', + (now, depth, 
entries_json)) + + +def _aggregate_depth_stats(): + """Aggregate merge_queue_snapshots into avg/peak depth on merge_queue_daily.""" + import db + conn = db.get_db() + rows = conn.execute(''' + SELECT substr(timestamp, 1, 10) as date, + ROUND(AVG(depth), 1) as avg_depth, + MAX(depth) as peak_depth + FROM merge_queue_snapshots + GROUP BY substr(timestamp, 1, 10) + ''').fetchall() + for row in rows: + conn.execute(''' + UPDATE merge_queue_daily SET avg_depth = ?, peak_depth = ? + WHERE date = ? + ''', (row['avg_depth'], row['peak_depth'], row['date'])) + conn.commit() + + +def start_merge_queue_poller(): + """Start background thread that polls merge queue depth every 5 minutes.""" + def loop(): + while True: + try: + poll_merge_queue_depth() + except Exception as e: + print(f"[rk_github] queue depth poll error: {e}") + time.sleep(300) # 5 minutes + t = threading.Thread(target=loop, daemon=True, name='mq-depth-poller') + t.start() + return t + + _mq_backfill_lock = threading.Lock() _mq_last_refresh = 0 _MQ_REFRESH_TTL = 3600 # refresh today's data every hour @@ -629,6 +833,7 @@ def ensure_merge_queue_data(): try: _backfill_merge_queue() refresh_merge_queue_today() + _aggregate_depth_stats() _mq_last_refresh = now finally: _mq_backfill_lock.release() @@ -646,7 +851,7 @@ def get_merge_queue_stats(date_from: str, date_to: str) -> dict: threading.Thread(target=ensure_merge_queue_data, daemon=True).start() rows = db.query( - 'SELECT date, total, success, failure, cancelled, in_progress ' + 'SELECT date, total, success, failure, cancelled, in_progress, avg_depth, peak_depth ' 'FROM merge_queue_daily WHERE date >= ? AND date <= ? 
ORDER BY date', (date_from, date_to)) @@ -664,3 +869,144 @@ def get_merge_queue_stats(date_from: str, date_to: str) -> dict: 'days': len([r for r in rows if r['total'] > 0]), }, } + + +import re as _re + +_COMMIT_TYPE_RE = _re.compile( + r'^(fix|feat|chore|refactor|docs|style|test|perf|ci|build|revert)(\([^)]+\))?(!)?: ' +) +_PR_NUM_RE = _re.compile(r'\(#(\d+)\)\s*$') +_MERGE_TRAIN_RE = _re.compile(r'merge-train/([^\s]+)') + + +def _parse_commit(raw: dict) -> dict: + """Normalise a GitHub REST commit object into a compact dict.""" + sha = raw.get('sha', '') + msg = raw.get('commit', {}).get('message', '') or '' + subject = msg.split('\n')[0] + c_author = raw.get('commit', {}).get('author', {}) or {} + # Prefer committer login if available (shows GitHub username not git display name) + login = (raw.get('author') or {}).get('login', '') + author = login or c_author.get('name', '') + date = c_author.get('date', '') # ISO-8601 + + # Parse conventional commit type + scope + m = _COMMIT_TYPE_RE.match(subject) + commit_type = m.group(1) if m else 'other' + breaking = bool(m and m.group(3)) + scope_raw = m.group(2) if m else '' + scope = scope_raw[1:-1] if scope_raw else '' # strip parens + + # Extract PR number from "(#NNNNN)" at end of subject + pr_m = _PR_NUM_RE.search(subject) + pr_number = int(pr_m.group(1)) if pr_m else None + clean_subject = _PR_NUM_RE.sub('', subject).rstrip() + + # Detect merge-train commits + mt_m = _MERGE_TRAIN_RE.search(subject) + merge_train = mt_m.group(1) if mt_m else None + is_merge = len(raw.get('parents', [])) > 1 + + return { + 'sha': sha, + 'subject': clean_subject, + 'type': commit_type, + 'scope': scope, + 'breaking': breaking, + 'pr': pr_number, + 'author': author, + 'date': date, + 'merge_train': merge_train, + 'is_merge': is_merge, + 'dirs': None, # populated by caller if Redis cache available + } + + +_pr_dirs_cache: dict = {} # {pr_number: [dirs]} in-memory cache (long TTL) +_pr_dirs_lock = threading.Lock() +_pr_dirs_fetch_queue: 
set = set() +_pr_dirs_worker_started = False + + +def _compute_pr_dirs(pr_number: int) -> list[str]: + """Fetch changed files for a PR and return 2-level path buckets.""" + data = _github_get(f'repos/{REPO}/pulls/{pr_number}/files?per_page=100') + if not data or not isinstance(data, list): + return [] + dirs: set[str] = set() + for f in data: + filename = f.get('filename', '') + if not filename: + continue + parts = filename.split('/') + top = parts[0] + dirs.add(top) + # For yarn-project, include 2nd level for sub-project drill-down + if top == 'yarn-project' and len(parts) > 1: + dirs.add(f'yarn-project/{parts[1]}') + return sorted(dirs) + + +def _pr_dirs_worker(): + """Background worker: drains the fetch queue, caches results.""" + while True: + time.sleep(2) + with _pr_dirs_lock: + if not _pr_dirs_fetch_queue: + continue + pr_number = _pr_dirs_fetch_queue.pop() + try: + dirs = _compute_pr_dirs(pr_number) + with _pr_dirs_lock: + _pr_dirs_cache[pr_number] = dirs + except Exception as e: + print(f'[github_data] pr_dirs fetch error for #{pr_number}: {e}') + + +def start_pr_dirs_worker(): + """Start the background PR dirs fetcher (call once at startup).""" + global _pr_dirs_worker_started + if _pr_dirs_worker_started: + return + _pr_dirs_worker_started = True + t = threading.Thread(target=_pr_dirs_worker, daemon=True, name='pr-dirs-fetcher') + t.start() + + +def get_pr_dirs(pr_number: int) -> list[str] | None: + """Return cached dirs for a PR, or None if not yet fetched (queues async fetch).""" + with _pr_dirs_lock: + if pr_number in _pr_dirs_cache: + return _pr_dirs_cache[pr_number] + _pr_dirs_fetch_queue.add(pr_number) + return None + + +def get_recent_commits(branch: str = 'next', page: int = 1, per_page: int = 50) -> list[dict]: + """Fetch a page of commits from GitHub API with 5-minute in-memory cache.""" + per_page = min(per_page, 100) + cache_key = f'{branch}:{page}:{per_page}' + now = time.time() + with _commits_lock: + cached = _commits_cache.get(cache_key) 
+ if cached and now - cached['ts'] < 300: + return cached['data'] + + data = _github_get( + f'repos/{REPO}/commits?sha={branch}&per_page={per_page}&page={page}' + ) + if not data or not isinstance(data, list): + result = [] + else: + result = [_parse_commit(raw) for raw in data] + + with _commits_lock: + _commits_cache[cache_key] = {'data': result, 'ts': now} + + # Enrich with cached dirs (non-blocking) + for c in result: + if c.get('pr'): + c['dirs'] = get_pr_dirs(c['pr']) + + return result diff --git a/ci3/ci-metrics/metrics.py b/ci3/ci-metrics/metrics.py index 5c0d1610e06b..8582ef0ad90c 100644 --- a/ci3/ci-metrics/metrics.py +++ b/ci3/ci-metrics/metrics.py @@ -1,9 +1,11 @@ -"""CI metrics: direct Redis reads + test event listener. +"""CI metrics: SQLite source of truth + Redis ingestion + test event listener. -Reads CI run data directly from Redis sorted sets on each request. +CI runs are ingested from Redis (written by log_ci_run on CI instances) and +stored in SQLite. All reads go through SQLite so enriched fields (instance_type +from CloudTrail, recalculated costs) are preserved. Test events stored in SQLite since they only arrive via pub/sub. -CI runs periodically synced from Redis to SQLite for flake correlation. """ +import hashlib import json import re import time @@ -21,6 +23,18 @@ _URL_PR_RE = re.compile(r'/pull/(\d+)') +def hash_str_orig(s: str) -> str: + """Replicate bash's `echo "$s" | git hash-object --stdin | cut -c1-16`. + + git hash-object computes SHA-1 of "blob \\0" where content + includes the trailing newline from echo. Length is byte length, not + Unicode code points. 
+ """ + content = (s + "\n").encode('utf-8') + blob = f"blob {len(content)}\0".encode('utf-8') + content + return hashlib.sha1(blob).hexdigest()[:16] + + def compute_run_cost(data: dict) -> float | None: complete = data.get('complete') ts = data.get('timestamp') @@ -31,7 +45,9 @@ def compute_run_cost(data: dict) -> float | None: is_spot = bool(data.get('spot')) rate = ec2_pricing.get_instance_rate(instance_type, is_spot) if not rate: - vcpus = data.get('instance_vcpus', 192) + vcpus = data.get('instance_vcpus') + if not vcpus: + return None # unknown instance type and no vCPU data rate = vcpus * ec2_pricing.get_fallback_vcpu_rate(is_spot) return round(hours * rate, 4) @@ -116,31 +132,37 @@ def _get_ci_runs_from_sqlite(date_from_ms=None, date_to_ms=None): return runs -def get_ci_runs(redis_conn, date_from_ms=None, date_to_ms=None): - """Read CI runs from Redis, backfilled with SQLite for data that Redis has flushed.""" - redis_runs = _get_ci_runs_from_redis(redis_conn, date_from_ms, date_to_ms) - - # Find the earliest timestamp in Redis to know what SQLite needs to fill - redis_keys = set() - redis_min_ts = float('inf') - for run in redis_runs: - ts = run.get('timestamp', 0) - redis_keys.add((run.get('dashboard', ''), ts, run.get('name', ''))) - if ts < redis_min_ts: - redis_min_ts = ts - - # If requesting data older than what Redis has, backfill from SQLite - sqlite_runs = [] - need_sqlite = (date_from_ms is not None and date_from_ms < redis_min_ts) or not redis_runs - if need_sqlite: - sqlite_to = int(redis_min_ts) if redis_runs else date_to_ms - sqlite_runs = _get_ci_runs_from_sqlite(date_from_ms, sqlite_to) - # Deduplicate: only include SQLite runs not already in Redis - sqlite_runs = [r for r in sqlite_runs - if (r.get('dashboard', ''), r.get('timestamp', 0), r.get('name', '')) - not in redis_keys] - - return sqlite_runs + redis_runs +def get_ci_runs(date_from_ms=None, date_to_ms=None): + """Read CI runs from SQLite (the source of truth). 
+ + Redis is only an ingestion pipe — sync_ci_runs_to_sqlite() copies data in. + All reads go through SQLite so enriched fields (instance_type from CloudTrail, + recalculated costs) are always reflected. + """ + return _get_ci_runs_from_sqlite(date_from_ms, date_to_ms) + + +def get_ci_runs_for_pr(pr_number: int, limit: int = 100) -> list: + """Return CI runs for a specific PR, most recent first.""" + rows = db.query( + 'SELECT * FROM ci_runs WHERE pr_number = ? ORDER BY timestamp_ms DESC LIMIT ?', + (pr_number, limit) + ) + return [{ + 'dashboard': row['dashboard'], + 'name': row['name'], + 'timestamp': row['timestamp_ms'], + 'complete': row['complete_ms'], + 'status': row['status'], + 'author': row['author'], + 'pr_number': row['pr_number'], + 'instance_type': row['instance_type'], + 'instance_vcpus': row.get('instance_vcpus'), + 'spot': bool(row['spot']), + 'cost_usd': row['cost_usd'], + 'job_id': row.get('job_id', ''), + 'arch': row.get('arch', ''), + } for row in rows] def _ts_to_date(ts_ms): @@ -149,6 +171,32 @@ def _ts_to_date(ts_ms): # ---- Test event handling (only thing needing SQLite) ---- +def _upsert_daily_stats(status: str, test_cmd: str, dashboard: str, timestamp: str, duration_secs=None): + """Increment the daily counter for a test status.""" + date = timestamp[:10] # 'YYYY-MM-DD' + col = status if status in ('passed', 'failed', 'flaked') else None + if not col: + return + d = duration_secs if duration_secs and duration_secs > 0 else None + if d: + db.execute(f''' + INSERT INTO test_daily_stats (date, test_cmd, dashboard, {col}, total_secs, count_timed, min_secs, max_secs) + VALUES (?, ?, ?, 1, ?, 1, ?, ?) 
+ ON CONFLICT(date, test_cmd, dashboard) DO UPDATE SET + {col} = {col} + 1, + total_secs = total_secs + excluded.total_secs, + count_timed = count_timed + 1, + min_secs = CASE WHEN min_secs IS NULL OR excluded.min_secs < min_secs THEN excluded.min_secs ELSE min_secs END, + max_secs = CASE WHEN max_secs IS NULL OR excluded.max_secs > max_secs THEN excluded.max_secs ELSE max_secs END + ''', (date, test_cmd, dashboard, d, d, d)) + else: + db.execute(f''' + INSERT INTO test_daily_stats (date, test_cmd, dashboard, {col}) + VALUES (?, ?, ?, 1) + ON CONFLICT(date, test_cmd, dashboard) DO UPDATE SET {col} = {col} + 1 + ''', (date, test_cmd, dashboard)) + + def _handle_test_event(channel: str, data: dict): status = channel.split(':')[-1] # Handle field name mismatches: run_test_cmd publishes 'cmd' for failed/flaked @@ -157,12 +205,19 @@ def _handle_test_event(channel: str, data: dict): log_url = data.get('log_url') or data.get('log_key') if log_url and not log_url.startswith('http'): log_url = f'http://ci.aztec-labs.com/{log_url}' + dashboard = data.get('dashboard', '') + timestamp = data.get('timestamp', datetime.now(timezone.utc).isoformat()) + test_hash = hash_str_orig(test_cmd) if test_cmd else None + + # Always update daily stats (lightweight aggregate) + _upsert_daily_stats(status, test_cmd, dashboard, timestamp, data.get('duration_secs')) + db.execute(''' INSERT INTO test_events (status, test_cmd, log_url, ref_name, commit_hash, commit_author, commit_msg, exit_code, duration_secs, is_scenario, owners, - flake_group_id, dashboard, timestamp) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + flake_group_id, dashboard, timestamp, test_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
''', ( status, test_cmd, @@ -176,14 +231,15 @@ def _handle_test_event(channel: str, data: dict): 1 if data.get('is_scenario_test') else 0, json.dumps(data['owners']) if data.get('owners') else None, data.get('flake_group_id'), - data.get('dashboard', ''), - data.get('timestamp', datetime.now(timezone.utc).isoformat()), + dashboard, + timestamp, + test_hash, )) def start_test_listener(redis_conn): """Subscribe to test event channels only. Reconnects on failure.""" - channels = [b'ci:test:started', b'ci:test:passed', b'ci:test:failed', b'ci:test:flaked'] + channels = [b'ci:test:passed', b'ci:test:failed', b'ci:test:flaked'] def listener(): backoff = 1 @@ -215,6 +271,163 @@ def listener(): return t +# ---- CI Phase timing listener ---- + +def _handle_phase_event(data: dict): + """Insert a CI phase timing event into SQLite.""" + db.execute(''' + INSERT INTO ci_phases + (phase, duration_secs, exit_code, run_id, job_id, dashboard, + ref_name, commit_hash, timestamp) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ''', ( + data.get('phase', ''), + data.get('duration_secs', 0), + data.get('exit_code'), + data.get('run_id', ''), + data.get('job_id', ''), + data.get('dashboard', ''), + data.get('ref_name', ''), + data.get('commit_hash', ''), + datetime.now(timezone.utc).isoformat(), + )) + + +def start_phase_listener(redis_conn): + """Subscribe to ci:phase:complete and store in ci_phases table.""" + def listener(): + backoff = 1 + while True: + try: + pubsub = redis_conn.pubsub() + pubsub.subscribe(b'ci:phase:complete') + backoff = 1 + for message in pubsub.listen(): + if message['type'] != 'message': + continue + try: + payload = message['data'] + if isinstance(payload, bytes): + payload = payload.decode() + _handle_phase_event(json.loads(payload)) + except Exception as e: + print(f"[rk_metrics] Error parsing phase event: {e}") + except Exception as e: + print(f"[rk_metrics] Phase listener error (reconnecting in {backoff}s): {e}") + time.sleep(backoff) + backoff = min(backoff * 2, 60) + + t = threading.Thread(target=listener, daemon=True, name='phase-listener') + t.start() + return t + + +def get_phases(date_from: str, date_to: str, dashboard: str = '', + run_id: str = '') -> dict: + """Query CI phase timing data for the API.""" + conditions = ['timestamp >= ?', 'timestamp < ?'] + params: list = [date_from, date_to + 'T23:59:59'] + if dashboard: + conditions.append('dashboard = ?') + params.append(dashboard) + if run_id: + conditions.append('run_id = ?') + params.append(run_id) + where = 'WHERE ' + ' AND '.join(conditions) + + # Aggregate by phase name + by_phase = db.query(f''' + SELECT phase, + COUNT(*) as count, + ROUND(AVG(duration_secs), 1) as avg_secs, + ROUND(MIN(duration_secs), 1) as min_secs, + ROUND(MAX(duration_secs), 1) as max_secs, + ROUND(SUM(duration_secs), 0) as total_secs + FROM ci_phases {where} + GROUP BY phase + ORDER BY total_secs DESC + ''', params) + + # Aggregate by date: avg duration per phase per day + date_rows = db.query(f''' + SELECT 
substr(timestamp, 1, 10) as date, phase, + ROUND(AVG(duration_secs), 1) as avg_secs, + COUNT(*) as count + FROM ci_phases {where} + GROUP BY date, phase + ORDER BY date + ''', params) + by_date: dict[str, dict] = {} + for row in date_rows: + d = row['date'] + if d not in by_date: + by_date[d] = {'date': d, 'phases': {}} + by_date[d]['phases'][row['phase']] = row['avg_secs'] + + # Recent individual runs with their phases + recent_runs = db.query(f''' + SELECT run_id, job_id, dashboard, ref_name, commit_hash, + phase, duration_secs, exit_code, timestamp + FROM ci_phases {where} + ORDER BY timestamp DESC + LIMIT 500 + ''', params) + runs_map: dict[str, dict] = {} + for row in recent_runs: + rid = row['run_id'] or row['timestamp'] + if rid not in runs_map: + runs_map[rid] = { + 'run_id': row['run_id'], 'job_id': row['job_id'], + 'dashboard': row['dashboard'], 'ref_name': row['ref_name'], + 'commit_hash': row['commit_hash'], 'phases': [], + } + runs_map[rid]['phases'].append({ + 'phase': row['phase'], + 'duration_secs': row['duration_secs'], + 'exit_code': row['exit_code'], + }) + + # Aggregate by dashboard: P95 duration per phase per pipeline. + # Step 1: sum durations within each (dashboard, phase, run_id) — multiple machines + # running the same phase in one run are summed, not counted separately. + # Step 2: compute P95 across run_ids in Python. 
+ per_run_rows = db.query(f''' + SELECT dashboard, phase, run_id, + ROUND(SUM(duration_secs), 3) as run_total + FROM ci_phases {where} + AND dashboard != '' + AND run_id != '' + GROUP BY dashboard, phase, run_id + ''', params) + + import math + from collections import defaultdict + run_totals: dict[tuple, list] = defaultdict(list) + for row in per_run_rows: + run_totals[(row['dashboard'], row['phase'])].append(row['run_total']) + + by_dashboard: dict[str, dict] = {} + for (dash, phase), totals in sorted(run_totals.items()): + totals_s = sorted(totals) + n = len(totals_s) + p95_idx = min(math.ceil(0.95 * n) - 1, n - 1) + p95 = round(totals_s[p95_idx], 1) + if dash not in by_dashboard: + by_dashboard[dash] = {'dashboard': dash, 'phases': {}, 'total_secs': 0, 'count': 0} + by_dashboard[dash]['phases'][phase] = p95 + by_dashboard[dash]['total_secs'] += sum(totals_s) + by_dashboard[dash]['count'] = max(by_dashboard[dash]['count'], n) + for d in by_dashboard.values(): + d['total_secs'] = round(d['total_secs'], 1) + + return { + 'by_phase': by_phase, + 'by_date': list(by_date.values()), + 'by_dashboard': list(by_dashboard.values()), + 'recent_runs': list(runs_map.values())[:50], + } + + # ---- Sync failed_tests_{section} lists from Redis into SQLite ---- _ANSI_STRIP = re.compile(r'\x1b\[[^m]*m|\x1b\]8;;[^\x07]*\x07') @@ -326,18 +539,18 @@ def sync_failed_tests_to_sqlite(redis_conn): _failed_tests_sync_ts = now conn = db.get_db() - # Track existing entries to avoid duplicates: log_url for entries that have one, - # (test_cmd, timestamp, dashboard) composite key for entries without log_url + # Track existing failed/flaked entries to avoid duplicates (this sync only + # processes failed/flaked from Redis lists, so no need to scan passed rows). 
existing_urls = {row['log_url'] for row in conn.execute( - "SELECT DISTINCT log_url FROM test_events WHERE log_url IS NOT NULL" + "SELECT DISTINCT log_url FROM test_events WHERE log_url IS NOT NULL AND status IN ('failed', 'flaked')" ).fetchall()} existing_keys = {(row['test_cmd'], row['timestamp'], row['dashboard']) for row in conn.execute( - "SELECT test_cmd, timestamp, dashboard FROM test_events WHERE log_url IS NULL" + "SELECT test_cmd, timestamp, dashboard FROM test_events WHERE log_url IS NULL AND status IN ('failed', 'flaked')" ).fetchall()} total = 0 - for section in SECTIONS: - key = f'failed_tests_{section}' + for section in SECTIONS + ['']: + key = f'failed_tests_{section}' if section else 'failed_tests' try: entries = redis_conn.lrange(key, 0, -1) except Exception as e: @@ -363,21 +576,27 @@ def sync_failed_tests_to_sqlite(redis_conn): INSERT INTO test_events (status, test_cmd, log_url, ref_name, commit_author, commit_msg, duration_secs, flake_group_id, dashboard, - timestamp) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + timestamp, test_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
''', ( parsed['status'], parsed['test_cmd'], parsed['log_url'], parsed['ref_name'], parsed['commit_author'], parsed['commit_msg'], parsed['duration_secs'], parsed['flake_group_id'], parsed['dashboard'], parsed['timestamp'], + hash_str_orig(parsed['test_cmd']) if parsed['test_cmd'] else None, )) + _upsert_daily_stats( + parsed['status'], parsed['test_cmd'], + parsed['dashboard'], parsed['timestamp']) total += 1 except Exception as e: print(f"[rk_metrics] Error inserting test event: {e}") conn.commit() if total: print(f"[rk_metrics] Synced {total} test events from Redis lists") + db.cache_invalidate_prefix('flakes:') + db.cache_invalidate_prefix('timings:') # ---- Seed loading ---- @@ -437,15 +656,16 @@ def _load_seed_data(): events = data['test_events'] for ev in events: try: + te_cmd = ev.get('test_cmd', '') conn.execute(''' INSERT OR IGNORE INTO test_events (status, test_cmd, log_url, ref_name, commit_hash, commit_author, commit_msg, exit_code, duration_secs, is_scenario, owners, - flake_group_id, dashboard, timestamp) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + flake_group_id, dashboard, timestamp, test_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( ev.get('status', ''), - ev.get('test_cmd', ''), + te_cmd, ev.get('log_url'), ev.get('ref_name', ''), ev.get('commit_hash'), @@ -458,6 +678,7 @@ def _load_seed_data(): ev.get('flake_group_id'), ev.get('dashboard', ''), ev.get('timestamp', ''), + hash_str_orig(te_cmd) if te_cmd else None, )) except Exception: continue @@ -472,14 +693,19 @@ def _load_seed_data(): def sync_ci_runs_to_sqlite(redis_conn): - """Sync all CI runs from Redis into SQLite for persistence.""" + """Ingest CI runs from Redis into SQLite. + + Redis is the ingestion pipe (log_ci_run writes there from CI instances). + SQLite is the source of truth. Fields enriched post-ingestion (instance_type, + cost_usd from CloudTrail resolution) are preserved — only overwritten if + Redis has a non-empty value. 
+ """ global _ci_sync_ts now = time.time() if now - _ci_sync_ts < _CI_SYNC_TTL: return _ci_sync_ts = now - # Sync everything Redis has (not just 30 days) runs = _get_ci_runs_from_redis(redis_conn) now_iso = datetime.now(timezone.utc).isoformat() @@ -488,11 +714,32 @@ def sync_ci_runs_to_sqlite(redis_conn): for run in runs: try: conn.execute(''' - INSERT OR REPLACE INTO ci_runs + INSERT INTO ci_runs (dashboard, name, timestamp_ms, complete_ms, status, author, pr_number, instance_type, instance_vcpus, spot, cost_usd, job_id, arch, synced_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(dashboard, timestamp_ms, name) DO UPDATE SET + complete_ms = excluded.complete_ms, + status = excluded.status, + author = excluded.author, + pr_number = excluded.pr_number, + instance_vcpus = excluded.instance_vcpus, + spot = excluded.spot, + job_id = excluded.job_id, + arch = excluded.arch, + synced_at = excluded.synced_at, + -- Preserve enriched fields: only overwrite if Redis has real data + instance_type = CASE + WHEN excluded.instance_type IS NOT NULL AND excluded.instance_type != '' + THEN excluded.instance_type + ELSE ci_runs.instance_type + END, + cost_usd = CASE + WHEN excluded.instance_type IS NOT NULL AND excluded.instance_type != '' + THEN excluded.cost_usd + ELSE ci_runs.cost_usd + END ''', ( run.get('dashboard', ''), run.get('name', ''), @@ -514,17 +761,372 @@ def sync_ci_runs_to_sqlite(redis_conn): print(f"[rk_metrics] Error syncing run: {e}") conn.commit() print(f"[rk_metrics] Synced {count} CI runs to SQLite") + db.cache_invalidate_prefix('perf:') + + +def _backfill_daily_stats(): + """Populate test_daily_stats from existing test_events rows. + + Uses INSERT OR IGNORE to fill gaps without overwriting data from the + real-time listener. Safe to call repeatedly — skips dates/tests that + already have rows. 
+ """ + conn = db.get_db() + cur = conn.execute(''' + INSERT OR IGNORE INTO test_daily_stats (date, test_cmd, dashboard, passed, failed, flaked) + SELECT substr(timestamp, 1, 10) as date, test_cmd, dashboard, + SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END), + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), + SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) + FROM test_events + GROUP BY substr(timestamp, 1, 10), test_cmd, dashboard + ''') + conn.commit() + if cur.rowcount and cur.rowcount > 0: + print(f"[rk_metrics] Backfilled {cur.rowcount} daily stat rows from test_events") + + +def _materialize_ci_run_daily_stats(): + """Recompute ci_run_daily_stats from ci_runs. + + Replaces all rows — safe to call repeatedly. Stores pre-aggregated + duration percentiles so the API doesn't need to scan raw rows. + """ + conn = db.get_db() + # Fetch raw daily durations grouped by date + dashboard + rows = conn.execute(''' + SELECT + strftime('%Y-%m-%d', timestamp_ms / 1000, 'unixepoch') AS date, + dashboard, + (complete_ms - timestamp_ms) / 60000.0 AS dur_mins + FROM ci_runs + WHERE status IN ('PASSED', 'FAILED') + AND complete_ms IS NOT NULL AND complete_ms > timestamp_ms + ''').fetchall() + + # Group durations: {(date, dashboard): [dur_mins, ...]} + groups = {} + for r in rows: + key = (r['date'], r['dashboard']) + groups.setdefault(key, {'passed': 0, 'failed': 0, 'durs': []}) + groups[key]['durs'].append(r['dur_mins']) + + # Also count pass/fail per group + status_rows = conn.execute(''' + SELECT + strftime('%Y-%m-%d', timestamp_ms / 1000, 'unixepoch') AS date, + dashboard, status, COUNT(*) as cnt + FROM ci_runs + WHERE status IN ('PASSED', 'FAILED') + GROUP BY date, dashboard, status + ''').fetchall() + for r in status_rows: + key = (r['date'], r['dashboard']) + if key not in groups: + groups[key] = {'passed': 0, 'failed': 0, 'durs': []} + if r['status'] == 'PASSED': + groups[key]['passed'] = r['cnt'] + else: + groups[key]['failed'] = r['cnt'] + + 
conn.execute('DELETE FROM ci_run_daily_stats') + inserted = 0 + for (date, dashboard), g in groups.items(): + durs = sorted(g['durs']) + n = len(durs) + conn.execute(''' + INSERT INTO ci_run_daily_stats + (date, dashboard, run_count, passed, failed, + sum_duration, min_duration, max_duration, p50_duration, p95_duration) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', ( + date, dashboard, g['passed'] + g['failed'], + g['passed'], g['failed'], + round(sum(durs), 2) if durs else 0, + round(min(durs), 1) if durs else None, + round(max(durs), 1) if durs else None, + round(durs[n // 2], 1) if durs else None, + round(durs[int(n * 0.95)], 1) if durs else None, + )) + inserted += 1 + conn.commit() + print(f"[rk_metrics] Materialized {inserted} ci_run_daily_stats rows") + + +def _backfill_test_hashes(): + """Populate test_hash for existing test_events rows that are missing it.""" + conn = db.get_db() + rows = conn.execute( + "SELECT DISTINCT test_cmd FROM test_events WHERE test_hash IS NULL AND test_cmd != ''" + ).fetchall() + if not rows: + return + for row in rows: + cmd = row['test_cmd'] + h = hash_str_orig(cmd) + conn.execute( + "UPDATE test_events SET test_hash = ? WHERE test_cmd = ? 
AND test_hash IS NULL", + (h, cmd)) + conn.commit() + print(f"[rk_metrics] Backfilled test_hash for {len(rows)} distinct test commands") + + +# ---- CloudTrail instance type resolution ---- + +_ct_resolve_ts = 0 +_CT_RESOLVE_TTL = 6 * 3600 # 6 hours + + +def _fetch_cloudtrail_daily(ct, event_name, start_time, end_time, max_per_day=10000): + """Fetch CloudTrail events in daily chunks to avoid the 5000-event global limit.""" + events = [] + day = start_time.replace(hour=0, minute=0, second=0, microsecond=0) + while day < end_time: + day_end = min(day + timedelta(days=1), end_time) + kwargs = { + 'LookupAttributes': [ + {'AttributeKey': 'EventName', 'AttributeValue': event_name}, + ], + 'StartTime': day, + 'EndTime': day_end, + 'MaxResults': 50, + } + while True: + resp = ct.lookup_events(**kwargs) + events.extend(resp.get('Events', [])) + token = resp.get('NextToken') + if not token or len(events) >= max_per_day: + break + kwargs['NextToken'] = token + day += timedelta(days=1) + return events + + +# Name tag format: _[_] +_NAME_TAG_RE = re.compile(r'^(.+)_(amd64|arm64)(?:_.*)?$') + + +def _normalize_branch_name(name): + """Normalize a branch name the same way bootstrap_ec2 does for the EC2 Name tag.""" + m = re.match(r'^gh-readonly-queue/[^/]+/pr-(\d+)', name) + if m: + return f'pr-{m.group(1)}' + name = re.sub(r'\s*\(queue\)$', '', name) + return re.sub(r'[^a-zA-Z0-9-]', '_', name[:50]) + + +def resolve_unknown_instance_types(): + """Query CloudTrail for RunInstances + CreateTags events to resolve unknown instance types. + + Strategy: + 1. Fetch RunInstances events (daily chunks) → instance_id → instance_type + launch_time + 2. Fetch CreateTags events (daily chunks) → instance_id → {Name, Group, Dashboard, ...} + Tags are accumulated across multiple events then filtered to Group=build-instance. + 3. Join by instance_id, then match to ci_runs by normalized branch name + arch + time window. 
+ """ + global _ct_resolve_ts + now = time.time() + if now - _ct_resolve_ts < _CT_RESOLVE_TTL: + return + _ct_resolve_ts = now + + conn = db.get_db() + unknown_runs = conn.execute(''' + SELECT dashboard, name, timestamp_ms, complete_ms, instance_vcpus, spot, + cost_usd, arch, pr_number + FROM ci_runs + WHERE (instance_type IS NULL OR instance_type = '' OR instance_type = 'unknown') + AND timestamp_ms > ? + ''', (int((time.time() - 90 * 86400) * 1000),)).fetchall() + + if not unknown_runs: + return + + try: + import boto3 + except ImportError: + return + + try: + ct = boto3.client('cloudtrail', region_name='us-east-2') + start_time = datetime.fromtimestamp( + min(r['timestamp_ms'] for r in unknown_runs) / 1000 - 300, tz=timezone.utc) + end_time = datetime.now(timezone.utc) + + # Step 1: Fetch RunInstances events in daily chunks → instance_id → type + launch time + run_events = _fetch_cloudtrail_daily(ct, 'RunInstances', start_time, end_time) + instance_types = {} + instance_launch_times = {} + for event in run_events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + itype = detail.get('requestParameters', {}).get('instanceType', '') + items = (detail.get('responseElements') or {}).get('instancesSet', {}).get('items', []) + for item in items: + iid = item.get('instanceId', '') + item_type = item.get('instanceType', '') or itype + if iid and item_type: + instance_types[iid] = item_type + instance_launch_times[iid] = int(event['EventTime'].timestamp() * 1000) + except Exception: + continue + + if not instance_types: + print("[rk_metrics] CloudTrail: no RunInstances events found") + return + + # Step 2: Fetch CreateTags events in daily chunks. + # Accumulate ALL tags per instance first, then filter to build instances. + # This handles the case where Name, Group, and Dashboard are set in separate + # create-tags API calls (aws_request_instance_type lines 97, 126, 127). 
+ tag_events = _fetch_cloudtrail_daily(ct, 'CreateTags', start_time, end_time) + all_instance_tags = {} + for event in tag_events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + req = detail.get('requestParameters', {}) + resources = req.get('resourcesSet', {}).get('items', []) + tags = req.get('tagSet', {}).get('items', []) + tag_dict = {t.get('key', ''): t.get('value', '') for t in tags} + for res in resources: + rid = res.get('resourceId', '') + if rid.startswith('i-'): + if rid not in all_instance_tags: + all_instance_tags[rid] = {} + all_instance_tags[rid].update(tag_dict) + except Exception: + continue + + # Filter to build instances + instance_tags = { + iid: tags for iid, tags in all_instance_tags.items() + if tags.get('Group') == 'build-instance' + } + + # Step 3: Join RunInstances + CreateTags by instance_id + instances = [] + for iid, itype in instance_types.items(): + tags = instance_tags.get(iid, {}) + if not tags.get('Name'): + continue + instances.append({ + 'instance_type': itype, + 'launch_ms': instance_launch_times.get(iid, 0), + 'dashboard': tags.get('Dashboard', ''), + 'name_tag': tags.get('Name', ''), + }) + + # Build index: normalized branch name → [instances] + tag_index = {} + for inst in instances: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m: + tag_index.setdefault(m.group(1), []).append(inst) + else: + tag_index.setdefault(inst['name_tag'], []).append(inst) + + # Step 4: Match unknown runs to instances + updated = 0 + for run in unknown_runs: + run_name = run['name'] + run_arch = run['arch'] or '' + run_ts = run['timestamp_ms'] + run_dashboard = run['dashboard'] + + expected_name = _normalize_branch_name(run_name) + candidates = tag_index.get(expected_name, []) + + best = None + for inst in candidates: + # Verify arch matches + if run_arch: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m and m.group(2) != run_arch: + continue + + # Verify dashboard matches (if tag present) + if inst['dashboard'] and 
inst['dashboard'] != run_dashboard: + continue + + # CI run starts after instance launch; allow up to 90 min (instance lifetime) + delta = run_ts - inst['launch_ms'] + if delta < -60_000 or delta > 5400_000: + continue + + # Prefer most recently launched instance before the run + if delta >= 0 and (best is None or inst['launch_ms'] > best['launch_ms']): + best = inst + elif best is None and abs(delta) < 60_000: + best = inst + + if best: + itype = best['instance_type'] + is_spot = bool(run['spot']) + rate = ec2_pricing.get_instance_rate(itype, is_spot) + new_cost = run['cost_usd'] + if rate and run['complete_ms'] and run['timestamp_ms']: + hours = (run['complete_ms'] - run['timestamp_ms']) / 3_600_000 + new_cost = round(hours * rate, 4) + conn.execute(''' + UPDATE ci_runs SET instance_type = ?, cost_usd = ? + WHERE dashboard = ? AND timestamp_ms = ? AND name = ? + ''', (itype, new_cost, run['dashboard'], run['timestamp_ms'], run['name'])) + updated += 1 + + conn.commit() + if updated: + print(f"[rk_metrics] CloudTrail: resolved {updated}/{len(unknown_runs)} unknown instance types") + else: + print(f"[rk_metrics] CloudTrail: {len(instances)} instances, " + f"0/{len(unknown_runs)} matched") + except Exception as e: + print(f"[rk_metrics] CloudTrail resolution failed: {e}") + + +def recalculate_all_costs(): + """Recalculate cost_usd for all ci_runs based on current instance_type and pricing.""" + conn = db.get_db() + runs = conn.execute(''' + SELECT dashboard, name, timestamp_ms, complete_ms, instance_type, + instance_vcpus, spot, cost_usd + FROM ci_runs + WHERE complete_ms IS NOT NULL AND complete_ms > 0 + ''').fetchall() + updated = 0 + for run in runs: + cost = compute_run_cost({ + 'complete': run['complete_ms'], + 'timestamp': run['timestamp_ms'], + 'instance_type': run['instance_type'] or 'unknown', + 'spot': run['spot'], + 'instance_vcpus': run['instance_vcpus'], + }) + if cost is not None and cost != run['cost_usd']: + conn.execute(''' + UPDATE ci_runs SET 
cost_usd = ? + WHERE dashboard = ? AND timestamp_ms = ? AND name = ? + ''', (cost, run['dashboard'], run['timestamp_ms'], run['name'])) + updated += 1 + conn.commit() + print(f"[rk_metrics] Recalculated costs: {updated}/{len(runs)} changed") + return updated def start_ci_run_sync(redis_conn): """Start periodic CI run + test event sync thread.""" _load_seed_data() + _backfill_daily_stats() + _backfill_test_hashes() + _materialize_ci_run_daily_stats() def loop(): while True: try: sync_ci_runs_to_sqlite(redis_conn) sync_failed_tests_to_sqlite(redis_conn) + resolve_unknown_instance_types() + _materialize_ci_run_daily_stats() + db.cache_cleanup() except Exception as e: print(f"[rk_metrics] sync error: {e}") time.sleep(600) # check every 10 min (TTL gates actual work) @@ -600,3 +1202,21 @@ def get_flakes_by_command(date_from, date_to, dashboard=''): 'total_failures': sum(failures_by_command.values()), }, } + + +def get_test_history(test_hash: str, branch: str = '', limit: int = 1000) -> list[dict]: + """Get test event history by test_hash, matching Redis history_{hash}[_{branch}] lists.""" + conditions = ['test_hash = ?'] + params: list = [test_hash] + if branch: + conditions.append('ref_name = ?') + params.append(branch) + where = 'WHERE ' + ' AND '.join(conditions) + params.append(limit) + return db.query(f''' + SELECT status, test_cmd, log_url, ref_name, commit_author, + commit_msg, duration_secs, dashboard, timestamp + FROM test_events {where} + ORDER BY timestamp DESC + LIMIT ? 
+ ''', params) diff --git a/ci3/ci-metrics/requirements.txt b/ci3/ci-metrics/requirements.txt index d6516263133f..310ecadf230a 100644 --- a/ci3/ci-metrics/requirements.txt +++ b/ci3/ci-metrics/requirements.txt @@ -6,3 +6,4 @@ Flask-HTTPAuth requests google-cloud-bigquery boto3 +pytest diff --git a/ci3/ci-metrics/rk.py b/ci3/ci-metrics/rk.py new file mode 100644 index 000000000000..d099b92dbbd7 --- /dev/null +++ b/ci3/ci-metrics/rk.py @@ -0,0 +1 @@ +from app import app diff --git a/ci3/ci-metrics/test_cache.py b/ci3/ci-metrics/test_cache.py new file mode 100644 index 000000000000..5538a3810958 --- /dev/null +++ b/ci3/ci-metrics/test_cache.py @@ -0,0 +1,154 @@ +"""Automated performance tests: SQLite response cache makes 1-year ci-insights fast. + +Usage: + pip install pytest + METRICS_DB_PATH=/tmp/t.db DASHBOARD_PASSWORD=test REDIS_HOST=invalid pytest test_cache.py -v + +All 18 parametrised tests should pass. Cold requests may take several seconds; +warm (cached) requests must be < 100 ms each. 
+""" +import base64 +import json +import os +import tempfile +import time +from datetime import date, timedelta + +# Set env vars BEFORE importing the app so db path and Redis host are fixed +_db_path = tempfile.mktemp(suffix='.db') +os.environ.setdefault('METRICS_DB_PATH', _db_path) +os.environ.setdefault('DASHBOARD_PASSWORD', 'test') +os.environ.setdefault('REDIS_HOST', 'invalid') # causes Redis errors, swallowed silently + +import pytest + +# Import app after env vars are set; background threads start but Redis fails gracefully +from app import app +import db + +# Basic-auth header for 'test:test' +_AUTH = {'Authorization': 'Basic ' + base64.b64encode(b'test:test').decode()} + +YEAR_FROM = '2025-02-24' +YEAR_TO = '2026-02-24' + +ENDPOINTS = [ + f'/api/ci/performance?from={YEAR_FROM}&to={YEAR_TO}&granularity=daily', + f'/api/ci/phases?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/ci/flakes-by-command?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/tests/timings?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/merge-queue/stats?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/prs/metrics?from={YEAR_FROM}&to={YEAR_TO}', +] + + +def _seed(): + """Insert one year of synthetic data covering all 6 ci-insights endpoints.""" + conn = db.get_db() + dashboards = ['next', 'prs', 'master'] + start = date(2025, 2, 24) + end = date(2026, 2, 24) + ts_base = int(time.mktime(start.timetuple())) * 1000 + ms_per_day = 86_400_000 + + for i, day in enumerate( + start + timedelta(days=n) for n in range((end - start).days + 1) + ): + ds = day.isoformat() + ts = ts_base + i * ms_per_day + + # merge_queue_daily — one row per day + conn.execute( + 'INSERT OR IGNORE INTO merge_queue_daily (date, total, success, failure) VALUES (?,10,8,2)', + (ds,), + ) + + for dash in dashboards: + # ci_runs — 5 per pipeline per day + for j in range(5): + conn.execute( + '''INSERT OR IGNORE INTO ci_runs + (dashboard, name, timestamp_ms, complete_ms, status, author, synced_at) + VALUES (?,?,?,?,?,?,?)''', + ( + dash, 
f'run-{i}-{dash}-{j}', + ts + j * 60_000, + ts + j * 60_000 + 3_600_000, + 'PASSED' if j % 5 != 0 else 'FAILED', + 'ci-bot', ds, + ), + ) + + # test_daily_stats + conn.execute( + '''INSERT OR IGNORE INTO test_daily_stats + (date, test_cmd, dashboard, passed, failed, flaked) VALUES (?,?,?,80,5,2)''', + (ds, f'test_{dash}', dash), + ) + + # test_events — 3 per pipeline per day (one flaked for flakes endpoint) + for j in range(3): + conn.execute( + '''INSERT OR IGNORE INTO test_events + (status, test_cmd, ref_name, dashboard, timestamp, duration_secs) + VALUES (?,?,?,?,?,?)''', + ( + 'passed' if j < 2 else 'flaked', + f'test_{dash}', + 'main', dash, + f'{ds}T12:00:0{j}', + 30.0 + j, + ), + ) + + # ci_phases — build + test phases per pipeline per day + for phase in ('build', 'test'): + conn.execute( + '''INSERT OR IGNORE INTO ci_phases + (phase, duration_secs, dashboard, timestamp) VALUES (?,?,?,?)''', + (phase, 1200.0, dash, f'{ds}T12:00:00'), + ) + + conn.commit() + + +@pytest.fixture(scope='session', autouse=True) +def seeded_db(): + _seed() + + +@pytest.fixture(scope='session') +def client(): + app.config['TESTING'] = True + with app.test_client() as c: + yield c + + +@pytest.mark.parametrize('url', ENDPOINTS) +def test_cold_returns_valid_json(client, url): + """First request computes from SQLite and returns valid JSON.""" + r = client.get(url, headers=_AUTH) + assert r.status_code == 200, f'HTTP {r.status_code}: {r.data[:200]}' + data = json.loads(r.data) + assert data # non-empty response + + +@pytest.mark.parametrize('url', ENDPOINTS) +def test_warm_hit_under_100ms(client, url): + """Second request is served from cache and completes in < 100 ms.""" + # Ensure cold request ran (order not guaranteed across parametrised tests) + client.get(url, headers=_AUTH) + # Warm request — must hit cache + t0 = time.perf_counter() + r = client.get(url, headers=_AUTH) + elapsed_ms = (time.perf_counter() - t0) * 1000 + assert r.status_code == 200 + assert elapsed_ms < 100, 
f'{url}: cache hit took {elapsed_ms:.1f} ms (limit 100 ms)' + + +@pytest.mark.parametrize('url', ENDPOINTS) +def test_cached_response_matches_original(client, url): + """Cached response is byte-for-byte identical to the original.""" + r1 = client.get(url, headers=_AUTH) + r2 = client.get(url, headers=_AUTH) + assert r1.data == r2.data diff --git a/ci3/ci-metrics/test_cloudtrail.py b/ci3/ci-metrics/test_cloudtrail.py new file mode 100644 index 000000000000..8acd71925cec --- /dev/null +++ b/ci3/ci-metrics/test_cloudtrail.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 +"""Test CloudTrail instance type resolution against real data + SQLite. + +Usage: + python3 test_cloudtrail.py /path/to/metrics.db --dry-run # preview matches + python3 test_cloudtrail.py /path/to/metrics.db # apply updates + python3 test_cloudtrail.py --days 7 --dry-run # only last 7 days +""" +import json +import os +import re +import sqlite3 +import sys +from datetime import datetime, timedelta, timezone + +try: + import boto3 +except ImportError: + print("ERROR: boto3 not installed") + sys.exit(1) + +DB_PATH = os.getenv('METRICS_DB_PATH', + os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db')) +for arg in sys.argv[1:]: + if not arg.startswith('-') and os.path.exists(arg): + DB_PATH = arg + break + +dry_run = '--dry-run' in sys.argv +days_back = 90 +for i, arg in enumerate(sys.argv): + if arg == '--days' and i + 1 < len(sys.argv): + days_back = int(sys.argv[i + 1]) + +ct = boto3.client('cloudtrail', region_name='us-east-2') + + +def fetch_events(event_name, start, end, max_events=10000): + events = [] + kwargs = { + 'LookupAttributes': [{'AttributeKey': 'EventName', 'AttributeValue': event_name}], + 'StartTime': start, 'EndTime': end, 'MaxResults': 50, + } + while True: + resp = ct.lookup_events(**kwargs) + events.extend(resp.get('Events', [])) + token = resp.get('NextToken') + if not token or len(events) >= max_events: + break + kwargs['NextToken'] = token + return events + + +def 
normalize_branch_name(name): + """Normalize a branch name the same way bootstrap_ec2 does for the EC2 Name tag.""" + # Strip merge queue prefix: gh-readonly-queue/next/pr-123-... → pr-123 + m = re.match(r'^gh-readonly-queue/[^/]+/pr-(\d+)', name) + if m: + return f'pr-{m.group(1)}' + # Strip " (queue)" suffix from log_ci_run simplified names + name = re.sub(r'\s*\(queue\)$', '', name) + # Same as: echo -n "$REF_NAME" | head -c 50 | tr -c 'a-zA-Z0-9-' '_' + return re.sub(r'[^a-zA-Z0-9-]', '_', name[:50]) + + +# ---- Step 1: Fetch RunInstances events in daily chunks ---- +end_time = datetime.now(timezone.utc) +start_time = end_time - timedelta(days=days_back) + +print(f"Fetching RunInstances events in daily chunks ({start_time.date()} to {end_time.date()})...") +instance_types = {} # instance_id → instance_type +instance_times = {} # instance_id → launch_time_ms +total_run_events = 0 + +day_start = start_time.replace(hour=0, minute=0, second=0, microsecond=0) +while day_start < end_time: + day_end = min(day_start + timedelta(days=1), end_time) + events = fetch_events('RunInstances', day_start, day_end) + total_run_events += len(events) + + for event in events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + itype = detail.get('requestParameters', {}).get('instanceType', '') + items = (detail.get('responseElements') or {}).get('instancesSet', {}).get('items', []) + for item in items: + iid = item.get('instanceId', '') + item_type = item.get('instanceType', '') or itype + if iid and item_type: + instance_types[iid] = item_type + instance_times[iid] = int(event['EventTime'].timestamp() * 1000) + except Exception: + continue + + day_start = day_start + timedelta(days=1) + sys.stdout.write(f"\r {day_start.strftime('%Y-%m-%d')}: {total_run_events} events, {len(instance_types)} instances") + sys.stdout.flush() + +print(f"\n Total: {total_run_events} RunInstances events, {len(instance_types)} unique instances") + +if not instance_types: + print("No 
RunInstances data. Exiting.") + sys.exit(1) + +# ---- Step 2: Fetch CreateTags events in daily chunks ---- +# NOTE: Tags are applied to CI instances in multiple create-tags calls: +# 1. aws_request_instance_type line 97: Name + Group + GithubActor + CICommand + Dashboard (all at once) +# 2. aws_request_instance_type line 126: Name only (redundant, after SSH) +# 3. aws_request_instance_type line 127: Group only (redundant, after SSH) +# CloudTrail sometimes misses event #1, so we must accumulate tags from ALL events +# for each instance, then filter to build instances afterwards. +print(f"\nFetching CreateTags events in daily chunks...") +all_instance_tags = {} # instance_id → accumulated tags (unfiltered) +total_tag_events = 0 + +day_start = start_time.replace(hour=0, minute=0, second=0, microsecond=0) +while day_start < end_time: + day_end = min(day_start + timedelta(days=1), end_time) + events = fetch_events('CreateTags', day_start, day_end) + total_tag_events += len(events) + + for event in events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + req = detail.get('requestParameters', {}) + resources = req.get('resourcesSet', {}).get('items', []) + tags = req.get('tagSet', {}).get('items', []) + tag_dict = {t.get('key', ''): t.get('value', '') for t in tags} + for res in resources: + rid = res.get('resourceId', '') + if rid.startswith('i-'): + if rid not in all_instance_tags: + all_instance_tags[rid] = {} + all_instance_tags[rid].update(tag_dict) + except Exception: + continue + + day_start = day_start + timedelta(days=1) + sys.stdout.write(f"\r {day_start.strftime('%Y-%m-%d')}: {total_tag_events} events, {len(all_instance_tags)} instances") + sys.stdout.flush() + +# Filter to build instances (those with Group=build-instance tag) +instance_tags = { + iid: tags for iid, tags in all_instance_tags.items() + if tags.get('Group') == 'build-instance' +} +print(f"\n Total: {total_tag_events} CreateTags events, {len(all_instance_tags)} total instances, 
{len(instance_tags)} build instances") + +# ---- Step 3: Join RunInstances + CreateTags by instance_id ---- +instances = [] +joined_count = 0 +for iid, itype in instance_types.items(): + tags = instance_tags.get(iid, {}) + has_tags = bool(tags.get('Name')) + if has_tags: + joined_count += 1 + instances.append({ + 'instance_id': iid, + 'instance_type': itype, + 'launch_ms': instance_times.get(iid, 0), + 'dashboard': tags.get('Dashboard', ''), + 'name_tag': tags.get('Name', ''), + 'actor': tags.get('GithubActor', ''), + }) + +print(f"\n Joined: {len(instances)} total RunInstances, {joined_count} with Name tag from CreateTags") +print(f" CreateTags instances NOT in RunInstances: {len(instance_tags) - joined_count}") + +# Show type distribution +type_counts = {} +for inst in instances: + if inst['name_tag']: + type_counts[inst['instance_type']] = type_counts.get(inst['instance_type'], 0) + 1 +print(f"\n Instance types (from joined data):") +for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t}: {c}") + +# ---- Step 4: Load SQLite and match ---- +if not os.path.exists(DB_PATH): + print(f"\nNo database at {DB_PATH}. Exiting after CloudTrail summary.") + sys.exit(0) + +conn = sqlite3.connect(DB_PATH) +conn.row_factory = sqlite3.Row + +cutoff_ms = int((datetime.now(timezone.utc) - timedelta(days=days_back)).timestamp() * 1000) +unknown_runs = conn.execute(''' + SELECT dashboard, name, timestamp_ms, complete_ms, instance_vcpus, spot, + cost_usd, arch, pr_number + FROM ci_runs + WHERE (instance_type IS NULL OR instance_type = '' OR instance_type = 'unknown') + AND timestamp_ms > ? 
+''', (cutoff_ms,)).fetchall() +print(f"\n{len(unknown_runs)} unknown ci_runs in last {days_back} days") + +# Build lookup: normalized_name → [instances] for fast matching +# Name tag format: _[_] +# Examples: +# next_amd64 → branch=next +# merge-train_spartan_amd64_17 → branch=merge-train_spartan +# pr-20419_arm64_a1-fast → branch=pr-20419 +# cl_all_in_makefile_arm64_a1-fast → branch=cl_all_in_makefile +_NAME_TAG_RE = re.compile(r'^(.+)_(amd64|arm64)(?:_.*)?$') +tag_index = {} +for inst in instances: + if inst['name_tag']: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m: + branch = m.group(1) + tag_index.setdefault(branch, []).append(inst) + else: + # No arch suffix found — use name as-is + tag_index.setdefault(inst['name_tag'], []).append(inst) + +updated = 0 +unmatched_dashboards = {} +matches = [] +for run in unknown_runs: + run_name = run['name'] + run_arch = run['arch'] or '' + run_ts = run['timestamp_ms'] + run_dashboard = run['dashboard'] + + # Compute expected EC2 instance name (same as bootstrap_ec2) + expected_name = normalize_branch_name(run_name) + + # Look up by normalized name + candidates = tag_index.get(expected_name, []) + + best = None + best_delta = float('inf') + for inst in candidates: + # Verify arch matches — Name tag format: branch_[_postfix] + if run_arch: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m and m.group(2) != run_arch: + continue + # Verify dashboard matches (if tag present) + if inst['dashboard'] and inst['dashboard'] != run_dashboard: + continue + # CI run should start AFTER instance launch. Instance runs multiple steps + # over its ~90-minute lifetime (default shutdown timer). 
+ delta = run_ts - inst['launch_ms'] + if delta < -60_000: # run shouldn't start >1 min before launch + continue + if delta > 5400_000: # 90 min max lifetime + continue + # Prefer the most recently launched instance (closest launch BEFORE run) + if delta >= 0 and (best is None or inst['launch_ms'] > best['launch_ms']): + best_delta = delta + best = inst + elif best is None and abs(delta) < 60_000: + # Allow small negative delta (clock skew) + best_delta = abs(delta) + best = inst + + if best: + matches.append({ + 'dashboard': run_dashboard, + 'name': run_name, + 'timestamp_ms': run_ts, + 'new_type': best['instance_type'], + 'delta_s': round(best_delta / 1000), + 'tag': best['name_tag'], + 'iid': best['instance_id'], + }) + if not dry_run: + conn.execute(''' + UPDATE ci_runs SET instance_type = ? + WHERE dashboard = ? AND timestamp_ms = ? AND name = ? + ''', (best['instance_type'], run_dashboard, run_ts, run_name)) + updated += 1 + else: + unmatched_dashboards[run_dashboard] = unmatched_dashboards.get(run_dashboard, 0) + 1 + +if not dry_run and updated: + conn.commit() + +print(f"\n{'Would resolve' if dry_run else 'Resolved'} {updated}/{len(unknown_runs)} unknown instance types") + +if matches: + print(f"\nSample matches:") + for m in matches[:30]: + print(f" [{m['dashboard']:6s}] {m['name']:45s} -> {m['new_type']:15s} " + f"(dt={m['delta_s']:4d}s, tag={m['tag']}, id={m['iid']})") + if len(matches) > 30: + print(f" ... 
and {len(matches) - 30} more") + + # Summary by type + type_counts = {} + for m in matches: + type_counts[m['new_type']] = type_counts.get(m['new_type'], 0) + 1 + print(f"\nResolved types:") + for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t}: {c}") + +if unmatched_dashboards: + print(f"\nUnmatched by dashboard:") + for d, c in sorted(unmatched_dashboards.items(), key=lambda x: -x[1]): + print(f" {d}: {c}") + +conn.close() diff --git a/ci3/ci-metrics/views/ci-health-report.html b/ci3/ci-metrics/views/ci-health-report.html new file mode 100644 index 000000000000..e23b0165116e --- /dev/null +++ b/ci3/ci-metrics/views/ci-health-report.html @@ -0,0 +1,999 @@ + + + + + +Aztec CI Health Report + + + + + +
+ + +
+
aztec-packages · Jan 21 – Feb 23, 2026
+

CI Health Report

+
34-day view · spend · flake · pipeline health · actions
+
+
$6,107
AWS CI EC2 (34d · metrics.db)
+
$22,738
AWS operational total (34d · Cost Explorer, ex-tax, ex-contract)
+
$9,972
GCP compute (30d · 31 namespaces)
+
63.5%
merge queue success (712 attempts)
+
92.3%
next pipeline pass rate
+
33.6 min
next P50 · +8% in 3 weeks
+
+
source: ci.aztec-labs.com metrics.db + BigQuery · 5,136 tracked runs · Jan 21–Feb 23, 2026
+
+ + +
+
§1 spend · overview
+

$6,107 AWS CI EC2 in 34 days — network is the cost hotspot at 30% of spend, 18.4% spot

+
Jan 21–Feb 23 · metrics.db cost_usd · CI pipelines only (GCP on next slide)
+
+
+
+
+
+
+
network pipeline
+
$5.03/run avg
+
523 runs · only 15.5% spot · $2,632 total = 43% of all CI spend
+
+
+
prs pipeline
+
$0.72/run avg
+
1,864 runs · 94.4% spot · $1,340 total · most cost-efficient pipeline
+
+
Network spot rate 15.5% vs 87–99% everywhere else. Spot costs ~86% less than on-demand. Network pipeline is the single largest AWS cost lever.
+
+
+
+ + +
+
§1 spend · full picture
+

$16K/month total: $9K dev/testing across both clouds, $6K always-on production

+
AWS CI EC2 $6,107 (34d · metrics.db) · AWS operational $22,738 (34d · Cost Explorer, ex-tax, ex-contract) · GCP $9,972 (30d)
+
+
+
+
+
+
+
+
+
+
GCP production clusters
+
$6,379/mo
+
testnet $997 · next-net $859 · mainnet $816 · staging $792 · fisherman $334. Always-on regardless of CI activity.
+
+
$6,107 = CI run EC2 tracked in metrics.db — what we directly pay per CI job. Gap to $16,221 total EC2 bill = always-on infra (NAT, VPC, bastion, EFS) not tagged as CI runs. $22,738 = full AWS operational (Cost Explorer, 34d): EC2 $16,221 + CloudFront $1,829 + EC2-Other $2,177 + ElastiCache $637 + misc $1,874. Excluded: $17,520 annual Savings Plan commitment (one-time contract) + $6,172 tax → $46,430 total AWS bill. GCP: $2,129 CI namespaces + $6,379 prod clusters (always-on, CI-independent).
+
+
+
+ + +
+
§1 spend · weekly trend
+

W04 network anomaly: $999 in one week — Feb 14 instance change cut it 91%

+
Stacked weekly CI spend by pipeline · m6a.48xlarge eliminated from network CI Feb 14, replaced by m6a.4xlarge
+
+
+
+
+
+
+
W04 network (m6a.48xlarge, on-demand)
+
$9.79/run
+
192 vCPUs, on-demand pricing. Long-failing jobs during p2p/epoch instability. $999 in one week.
+
+
+
W07 network (m6a.4xlarge)
+
$0.85/run
+
$94 total for the week. Network tests no longer run on the 192-vCPU on-demand beast.
+
+
W07 total: $988 vs W06 peak $1,947. 49% weekly reduction driven by eliminating m6a.48xlarge from network tests. kind tests also moved to spot this period.
+
+
+
+ + +
+
§1 spend · spot discipline
+

Network pipeline: 15.5% spot — every other pipeline runs 87–99% spot

+
Spot instances cost ~86% less than on-demand at equivalent capacity. The gap is isolated to network.
+
+
+
+
+
+
Network at 15.5% spot: 442 of 523 runs were on-demand. At prs pipeline spot rates (94.4%) the network bill drops from $2,632 to ~$370. ~$2,200/month opportunity.
+
GCP spot migration: $1,397/month in 0%-spot namespaces (eth-mainnet, sepolia, monitoring). Ops complexity — preemption handling for archive nodes. See §4.
+
kind tests moved to spot (done). Network pipeline is the next highest-ROI action on AWS.
+
+
+
+ + +
+
§2 flake · weekly pattern
+

One cluster drove every spike — p2p/epoch cleared completely in W07

+
Jan 14–Feb 23 · stacked: p2p/epoch (orange) · other flakes (blue) · hard fails (red)
+
+
+
+
+
+
+
p2p/epoch cluster (W02–W06)
+
2,034 events
+
56 distinct tests · all classified as flakes (0 hard fails) · 747 events in W06 → 2 in W07
+
+
W06 (Feb 9): 515 hard fails — highest of any week. Three high-risk PRs merged same day (see slide 8).
+
W08 (Feb 23, partial): 36 events, 0 p2p/epoch. Too early to call — partial week, low volume.
+
+
+
+ + +
+
§2 flake · anatomy
+

2,034 labeled events across 56 tests — and 1,912 unlabeled failures behind them

+
The e2e-p2p-epoch-flakes cluster is tagged. Everything else is anonymous.
+
+
+
+
+
+
+
e2e-p2p-epoch-flakes · 2,034 events · 56 tests · 0 hard fails
+

Known cluster. All events classified as flakes — timing-driven, not code bugs. Cleared in W07 after Santiago PRs #20351 (mbps test fix) and #20462 (remove hardcoded 10s timeout), plus ludamad #20613 (CI parallelism). Root cause: PXE using latest (not checkpointed) block across distributed epoch transitions. Will recur.

+
+
+
Unlabeled · 1,912 events · 938 distinct tests · merge queue failures
+

These are failures in the merge queue (next pipeline). In the queue, any failure blocks a PR — whether it's a real bug, infrastructure noise, or a timing race. 938 distinct failing tests with no cluster assignment and no assigned owner. This is the long tail of test rot that grows with every feature sprint.

+
+
The labeled cluster cleared. The unlabeled tail didn't. 938 distinct failing tests is residual risk that accumulates without active maintenance.
+
+
+
+ + +
+
§2 flake · signal quality
+

New flakes keep appearing — the floor doesn't clear, and 4 grinds didn't catch W04

+
0.12% overall flake rate (6,632 flakes / 5.5M test runs) — low headline, lumpy reality
+
+
+
W04: the gate failed
+
+
valid_epoch_pruned_slash: 0 events W03 → 346 events W04
+

This test passed the 4-run deflake gate and was merged. In its first full week in CI it produced 346 fail/flake events — 33% of the entire W04 spike. Similarly, tx_proposal_collector (180 events) and inactivity_slash_with_consequence (74 events) debuted in W04 with immediate instability. All passed the gate.

+
+
+
team-alpha now running 10 grinds
+

A stricter informal standard adopted by one team. Not yet universal policy. Reduces risk of introducing flaky tests but increases CI time per new test by 2.5×. Proposal: targeted deflake — instrument tests for determinism before setting a grind count. Outstanding from offsite.

+
+
+
+
W07: new regressions after p2p fix
+
+
profile_gates.test.ts (12 events) + compile.test.ts (11 events)
+

Both appeared for the first time in W07 — the same week the p2p cluster cleared. CLI-layer tests, hash-prefixed job context. Not present in W06. New regressions introduced during the W07 fix sprint, or surface area exposed once p2p noise was removed.

+
+
+
~7% of CI run failures are genuine code bugs
+

Most failures are infrastructure noise (nightly 61% quick-fail) or p2p timing (labeled cluster). Merge-queue failures often behave like flakes regardless of their root cause — they block PRs that may be perfectly correct. New failing tests appear every week; old ones aren't fully removed.

+
+
+
+
+ + +
+
§2 flake · PR correlation
+

What caused the spikes — and who fixed them

+
Attribution via ci_runs × pr_authors. Santiago Palladino: 18+ fix PRs in 6 weeks.
+
+
+
spike causes
+
+ +
+
W02 — Jan 13 · 2,072 flakes
+
spalladino refactors (e2e setup, archiver, test setup). Changed patterns exposing latent timing races across p2p/epoch simultaneously. Tipping point for existing instability.
+
+
+
+ +
+
W04 — Jan 26 · 935 flakes
+
PhilWindle added cross-chain mbps e2e tests without pre-deflaking. valid_epoch_pruned_slash: 0→346 events. tx_proposal_collector: 180 events. inactivity_slash_with_consequence: 74 events. All absent the prior week.
+
+
+
+ +
+
W06 — Feb 9 · 850 flakes + 515 hard fails (worst week)
+
Three high-risk PRs same day: #20047 peer scoring (15:27, mrzeszutko), #20241 max checkpoints→32 (11:21), #20257 hash constants (14:38, LeilaWang). Spanning p2p + epoch config + consensus hashes — the exact subsystems where all flakes live.
+
+
+
+
+
fixes that worked
+
+ +
+
W03 — Jan 23 · −57% · root fix
+
spalladino: checkpointed chain tip for PXE. PXE was using latest (not checkpointed) block, causing epoch boundary races. 6 of 7 CI runs passed cleanly. Most impactful single fix of the period.
+
+
+
+ +
+
W05 — Feb 3–5 · −33%
+
spalladino #20088 slasher multi-block handling. PhilWindle #20140 deflake discv5. 20+ CI runs over 2 days before clean merge.
+
+
+
+ +
+
W07 — Feb 10–17 · −65% · p2p cluster cleared
+
spalladino #20351 fix mbps chain test (p2p_client 311→0 flakes). #20462 remove hardcoded 10s timeout. ludamad #20613 CI parallelism — insufficient workers caused timeout cascades misclassified as flakes.
+
+
+
+
+
Pattern: every spike triggered by new unstable tests or multi-subsystem config changes without pre-deflaking. Will recur without structural change.
+
+ + +
+
§2 flake · maintenance cost
+

30 fix/deflake PRs in 34 days — nearly one per day of ongoing maintenance overhead

+
PRs with "fix/flake/deflake" in title or branch · Jan 14–Feb 23 · engineers patching unstable tests continuously
+
+
+
recent (Feb)
+
+
#20636spypsy · Feb 18
+
#20609alexghr · Feb 17
+
#20439ludamad · Feb 12
+
#20335spypsy · Feb 10
+
#20330ludamad · Feb 10
+
#19322charlielye · Feb 10
+
#20246mverzilli · Feb 6
+
#20243mverzilli · Feb 6
+
#20240mverzilli · Feb 6
+
#20215ludamad · Feb 5
+
#20160mverzilli · Feb 4
+
#20095suyash67 · Feb 4
+
#20140PhilWindle · Feb 3
+
#20131alexghr · Feb 3
+
#20119alexghr · Feb 3
+
#20115danielntmd · Feb 2
+
+
+
+
earlier (Jan)
+
+
#20090mralj · Jan 30
+
#20070mralj · Jan 30
+
#20068mralj · Jan 30
+
#20040mralj · Jan 29
+
#20024spalladino · Jan 28
+
#20004spalladino · Jan 28
+
#19952ludamad · Jan 26
+
#19910spalladino · Jan 23
+
#19782alexghr · Jan 21
+
#19767spalladino · Jan 20
+
#19705PhilWindle · Jan 19
+
#19618alexghr · Jan 15
+
#19588spalladino · Jan 15
+
#19580PhilWindle · Jan 14
+
+
+
+
+
deflake maintenance rate
+
30
+
PRs in 34 days = 0.88/day. 10 distinct contributors. spalladino (5), alexghr (5), ludamad (4), mverzilli (4), mralj (4), PhilWindle (3).
+
+
This is the hidden cost of test rot — not paid in dollars but in senior engineer hours. Each deflake PR is an interrupt to feature work.
+
+
+
+ + +
+
§3 health · pass rates
+

next at 92% — nightly and network stuck at 58%, different failure modes

+
Jan 21–Feb 23 · ci_run_daily_stats · 4,870 total runs across 5 pipelines
+
+
+
+
+
+
+
nightly (573 runs)
+
58.1%
+
61% of failures complete in under 5 min — single job, crashes at startup before tests run. Environment instability, not code.
+
+
+
network (450 runs)
+
58.0%
+
9.5% quick-fail, 87.9min avg fail duration. Network runs often partially pass — a FAIL may have completed 80% of its test suite before one test times out.
+
+
Nightly failures are environment crashes (startup), not code regressions. Network failures are long-running timeouts — pass rate understates how much actually succeeds.
+
+
+
+ + +
+
§3 health · failure anatomy
+

61% of nightly failures complete in under 5 minutes — infra crashes, not code

+
Quick-fail proxy: FAILED runs completing in <5 min. Heuristic — some short unit tests legitimately fail fast.
+
+
+
+
+
+
nightly: 60.8% quick-fail (avg 5.2min to failure). Single job type (next), bailing at startup. The pipeline is crashing on environment setup — likely a Docker pull, secret mount, or k8s scheduling failure — before any test code runs.
+
prs: 30.3% quick-fail (168 of 555 failures). 1 in 3 PR CI failures is infrastructure, not the engineer's code. Significant developer friction that inflates apparent failure rates.
+
next: 9.0% quick-fail — next failures are mostly genuine test failures. The highest-signal pipeline for code quality.
+
+
+
+ + +
+
§3 health · build time
+

next P50 +8% in 3 weeks — P95 up 22%, tail latency accelerating

+
W05=Feb 2 · W06=Feb 9 · W07=Feb 16 · individual ci_runs PASSED records · exact percentiles
+
+
+
+
+
+
+
P95 growth (3 weeks)
+
+22%
+
45.3min (W05) → 55.1min (W07). At this rate: P95 exceeds 70min by May.
+
+
+
P50 growth (3 weeks)
+
+8%
+
31.0min (W05) → 33.6min (W07). Compounded across 1,100+ next runs/month.
+
+
W08 early data (Feb 23, 10 runs only): P50=32.9, P95=34.5. Too small a sample to conclude. No identified root cause for the growth trend.
+
+
+
+ + +
+
§3 health · merge queue
+

63.5% merge queue success — W04 bottomed at 57.6%, W07 still only 59.5%

+
712 total attempts · 452 successes · 251 failures · Jan 21–Feb 23
+
+
+
+
+
+
+
34-day success rate
+
63.5%
+
1 in 3 merge attempts needs a retry. Target for a healthy queue: 85%+.
+
+
W07 (59.5%) is worse than W06 (69.6%) despite the p2p fix. Hard test failures replaced flakes as merge blockers — the queue improved in one dimension and degraded in another.
+
W04 worst week: 91 failures out of 217 attempts (57.6%). Coincided with the p2p/epoch spike and the new unstable test introductions.
+
+
+
+ + +
+
§3 health · scheduling patterns
+

Tuesday 72% vs Thursday 82% — the Monday queue effect is measurable

+
Jan 21–Feb 23 · all pipelines · 4,870 runs · 9.5pp spread between worst and best weekday
+
+
+
+
+
+
Mon/Tue dip: PRs accumulate over the weekend and all enter the merge queue simultaneously Monday morning. Pipeline contention drives higher failure rates. By Thursday the queue has cleared and pass rates recover.
+
9.5pp spread (Tue 72.3% → Thu 81.8%). Thursday–Friday is measurably safer for large feature merges. Batch rollouts on Monday are higher risk.
+
+
+
+ + +
+
§4 GCP · optimization
+

$1,397/month in 0%-spot namespaces — ops complexity vs savings is the open question

+
Five GCP namespaces running 100% on-demand at 86% spot discount foregone · Jan 21–Feb 20
+
+
+
+
+
+
+
Monthly savings potential
+
$1,397/mo
+
eth-mainnet + sepolia ($800) · monitoring ($599). At 86% spot discount, matching cluster-wide spot rate.
+
+
eth-mainnet and sepolia: Ethereum archive nodes, 80% memory utilization. Spot preemption means re-syncing from chain tip. Engineering cost to handle graceful preemption needs evaluation.
+
metrics + public-telemetry: Prometheus/Grafana. Spot-tolerant with PVC persistence — a platform ops task, not a code change. Lower risk than archive nodes.
+
+
+
+ + +
+
§5 actions · done
+

Three improvements landed in February

+
Concrete changes with measured impact on cost and flake counts.
+
+
+
cost — instance type
+
+
m6a.48xlarge → m6a.4xlarge (Feb 14)
+

The 192-vCPU on-demand instance running network tests is gone. At W04's failure rate it cost $9.79/run and $999 in a single week. m6a.4xlarge costs $0.85/run. W07 network spend: $94 vs W04's $999. 91% per-run cost reduction. Network tests no longer run on an oversized on-demand machine. Done.

+
+
+
kind tests moved to spot
+

kind test runs moved from on-demand to spot instances. ~86% cost reduction on affected runs. Done.

+
+
+
+
flake — p2p/epoch cleared
+
+
e2e-p2p-epoch-flakes: 2,034 → ~0 events
+

The cluster that drove every spike since Jan 6 cleared in W07. Two p2p/epoch events in W07 vs 747 in W06. W08: zero p2p events so far.

+
+
+
Santiago Palladino: 18+ fix PRs (Jan–Feb)
+

#19914 checkpointed PXE chain tip (root fix), #20088 slasher multi-block, #20351 mbps chain test (311→0 flakes), #20462 remove hardcoded 10s timeout. ludamad #20613 CI parallelism fix.

+
+
+
+
open — not yet resolved
+
+
p2p cluster root cause
+

Cleared via targeted fixes, not architectural resolution. PXE epoch boundary races and test isolation issues remain. Will recur when new p2p/epoch features merge. No assigned owner for permanent fix.

+
+
+
Build time creep unresolved
+

next P50 +8%, P95 +22% in 3 weeks. No identified cause. At current rate P95 exceeds 70 min by May.

+
+
+
+
+ + +
+
§5 actions · in progress
+

The deflake gate exists — but W04 proved 4 grinds isn't enough for epoch/slashing tests

+
Outstanding from offsite: reassess deflake strategy. team-alpha already at 10 grinds.
+
+
+
current state
+
+
4-run deflake gate (universal, always existed)
+

All new e2e tests must pass 4 consecutive CI runs before merging. Has been standard for some time. Cost: ~$11 in deflake CI spend over 34 days — not a cost constraint.

+
+
+
W04 evidence: the gate failed
+

valid_epoch_pruned_slash passed 4 grinds and produced 346 events in its first week. tx_proposal_collector (180 events) and inactivity_slash_with_consequence (74 events) debuted similarly. The gate is not filtering unstable epoch/slashing tests adequately.

+
+
+
team-alpha: 10 grinds
+

Informal higher standard on one team. Not yet universal. Reduces risk but increases CI wait time by 2.5× per new test.

+
+
+
+
options
+
+
Option A: raise blanket gate to 10 grinds
+

Universal. Simple to enforce. team-alpha already doing this. Tradeoff: 2.5× CI cost per new e2e test; developers wait longer before merge. Not evidence-based — fast-passing tests still get 10 runs.

+ A-533 · set up hard grinding tests in spartan merge train +
+
+
Option B: targeted deflake (proposed)
+

Analyze test pass rate distribution over N runs. Flag tests above a variance threshold. Grind count becomes adaptive: stable tests get 4, timing-sensitive tests get 20+. More surgical, higher tooling cost to implement.

+
+
+
Decision needed from offsite follow-up
+

Who owns the deflake gate upgrade? What is the target flake rate threshold before a test is considered acceptable for CI merge?

+
+
+
+
+ + +
+
§5 actions · open questions
+

Three questions for discussion

+
Not answered here — require team input, owner assignment, or cost-benefit analysis.
+
+
+
deflake strategy
+
+
Targeted deflake vs blanket grinds?
+

team-alpha doing 10 grinds — should this become universal? Or build adaptive tooling that sets the grind count per-test based on variance? Who owns the implementation and the policy?

+ A-533 +
+
Evidence: W04 proved 4 isn't enough for epoch/slashing tests. 10 is pragmatic but not evidence-based. Targeted deflake is surgical but requires tooling investment.
+
+
+
GCP spot migration
+
+
$1,397/month savings — worth the ops complexity?
+

eth-mainnet and sepolia archive nodes: spot preemption means re-syncing from chain tip. metrics and public-telemetry: lower risk, Prometheus/Grafana with PVC persistence. What's the engineering cost to make each namespace preemption-safe?

+
+
AWS network pipeline spot migration (~$2,200/mo) is likely the easier, higher-ROI first step — no archive node complexity.
+
+
+
p2p/epoch ownership
+
+
Who owns the cluster to permanent resolution?
+

The cluster cleared through 18+ targeted fix PRs. Root cause — PXE epoch boundary races, test isolation, distributed timing — is not architecturally resolved. Next p2p feature merge will likely trigger another spike without a named owner and explicit SLO.

+
+
Without assignment: the next spike will be diagnosed reactively, same as this one. Expected recurrence: within 1–2 major p2p feature merges.
+
+
+
+ +
+ +
1 / 19
+ + + + + diff --git a/ci3/ci-metrics/views/ci-insights.html b/ci3/ci-metrics/views/ci-insights.html index 533b6bfb62cd..bb483af6f1d9 100644 --- a/ci3/ci-metrics/views/ci-insights.html +++ b/ci3/ci-metrics/views/ci-insights.html @@ -22,6 +22,15 @@ .msg { color:#888; padding:8px 0; } .msg.err { color:#f85149; } + /* Tabs */ + .tabs { display:flex; gap:0; margin:12px 0 0 0; } + .tab { background:#111; border:1px solid #333; border-bottom:none; color:#888; + font-family:monospace; font-size:13px; padding:6px 16px; cursor:pointer; } + .tab:hover { color:#ccc; } + .tab.active { background:#0a0a0a; color:#fff; border-color:#58a6ff; border-bottom:1px solid #0a0a0a; position:relative; z-index:1; } + .tab-content { display:none; border:1px solid #333; border-top:1px solid #333; padding:12px; background:#0a0a0a; margin-top:-1px; } + .tab-content.active { display:block; } + /* KPI strip */ .kpi-strip { display:flex; gap:12px; margin:16px 0; flex-wrap:wrap; } .kpi { background:#0a0a0a; border:1px solid #222; padding:12px 16px; flex:1; min-width:180px; max-height:120px; overflow:hidden; } @@ -53,23 +62,41 @@ .amt { font-variant-numeric:tabular-nums; text-align:right; } th.amt { text-align:right; } .detail-scroll { max-height:500px; overflow:auto; } + .detail-table { width:100%; border-collapse:collapse; font-size:12px; } + .detail-table th { text-align:left; padding:4px 6px; border-bottom:1px solid #333; color:#888; white-space:nowrap; position:sticky; top:0; background:#0a0a0a; } + .detail-table td { padding:4px 6px; border-bottom:1px solid #111; white-space:nowrap; } + .detail-table .amt { text-align:right; font-variant-numeric:tabular-nums; } + .detail-table th.amt { text-align:right; } + .stats { margin:12px 0; color:#888; } + .stats span { color:#ccc; } + /* Test details */ + .cmd { max-width:500px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; } + .pass { color:#3fb950; } + .fail { color:#f85149; } + .flake { color:#d29922; } -

ci insights

+

ci insights

- + + - + + + + | @@ -82,71 +109,205 @@

ci insights

- - -
-
daily ci spend
--
-
cost / merge
--
-
mq success rate
--
-
flakes / day
--
-
prs merged / day
--
+
+
Overview
+
Attribution
- -
-
-

daily ci cost + 7-day rolling cost per merge

-
+ +
+ -
-

merge queue: daily outcomes + success rate

-
+ +
+
mq success rate
--
+
flakes / day
--
+
prs merged / day
--
+
avg mq duration
--
-
-

flakes + test failures per day

-
+ +
+
+

merge queue: daily outcomes + success rate

+
+
+
+

test outcomes per day

+
+
+
+

ci run duration by pipeline (avg mins)

+
+
+
+

total ci time by pipeline (hours)

+
+
+
+

P95 build time by pipeline & phase (secs)

+
+
+
+ +
+
+
top flaky tests
+
+ + + +
testflakesaffected
+
+
+
+
top failing tests
+
+ + + +
testfailuresaffected
+
+
-
- -
flakes by pipeline
-
- - - -
+
flakes by pipeline
+
+ + + +
+
+ + +
+
+ test details + | + + + + +
+
+
+
+
+
+

avg duration by day

+
+
+
+

test run count by day

+
+
+
+

tests by duration

+
+ + + + + + + + + + + + + + + + +
test commandrunsavg (s)min (s)max (s)total (h)pass %passedfailedflaked
+
+

slowest individual runs

+
+ + + + + + + + + + + + + +
test commandduration (s)statusdateauthorpipelinelog
+
- -
author ci profile
-
- - - -
+ +
+
+ +
+
+
+

ci cost by run type (time series)

+
+
+
+

cost by user (AWS + GCP)

+
+
+
+

cost by run type

+
+
+
+ +
author ci profile
+
+ + + +
+
+ +
instances
+
+ + + +
+
- + diff --git a/ci3/ci-metrics/views/commits.html b/ci3/ci-metrics/views/commits.html new file mode 100644 index 000000000000..03143adff905 --- /dev/null +++ b/ci3/ci-metrics/views/commits.html @@ -0,0 +1,345 @@ + + + + + Commits — aztec-packages + + + + + +

commits — next

+ +
+ + + + + + + + + + +
+ +
Loading…
+ + + + + + diff --git a/ci3/ci-metrics/views/cost-overview.html b/ci3/ci-metrics/views/cost-overview.html index 53424a2d2d70..da8d73e0b7f8 100644 --- a/ci3/ci-metrics/views/cost-overview.html +++ b/ci3/ci-metrics/views/cost-overview.html @@ -71,17 +71,23 @@ -

cost overview

+

cost overview

- - + + + + + + | @@ -98,7 +104,6 @@

cost overview

Overview
Resource Details
-
CI Attribution
@@ -136,35 +141,6 @@

aws vs gcp split

-
-
- -
-
-
-

ci cost by run type (time series)

-
-
-
-

cost by user (AWS + GCP)

-
-
-
-

cost by run type

-
-
-
-

instances

-
- - - -
-
-
+ + diff --git a/ci3/ci-metrics/views/test-timings.html b/ci3/ci-metrics/views/test-timings.html index 0bf6c7213bd6..63cc54bb3690 100644 --- a/ci3/ci-metrics/views/test-timings.html +++ b/ci3/ci-metrics/views/test-timings.html @@ -1,289 +1,7 @@ - - - ACI - Test Timings - - +Redirecting... - - -

test timings

- -
- - - - - | - - - | - - - | - - -
- -
loading...
- -
- -
-
-

avg duration by day

-
-
-
-

test run count by day

-
-
-
- -

tests by duration

-
- - - - - - - - - - - - - - - - -
test commandrunsavg (s)min (s)max (s)total (h)pass %passedfailedflaked
-
- -

slowest individual runs

-
- - - - - - - - - - - - - -
test commandduration (s)statusdateauthorpipelinelog
-
- - - + diff --git a/ci3/dashboard/Dockerfile b/ci3/dashboard/Dockerfile index 2da7805ffa83..cd2e5b1f9b1d 100644 --- a/ci3/dashboard/Dockerfile +++ b/ci3/dashboard/Dockerfile @@ -24,4 +24,4 @@ RUN pip install --no-cache-dir -r ci-metrics/requirements.txt RUN git config --global --add safe.directory /aztec-packages COPY . . EXPOSE 8080 8081 -CMD ["gunicorn", "-w", "100", "-b", "0.0.0.0:8080", "rk:app"] +CMD ["gunicorn", "-w", "50", "-b", "0.0.0.0:8080", "rk:app"] diff --git a/ci3/dashboard/requirements.txt b/ci3/dashboard/requirements.txt index 9c1526f5b7a8..f3e1e9c53c08 100644 --- a/ci3/dashboard/requirements.txt +++ b/ci3/dashboard/requirements.txt @@ -5,3 +5,4 @@ ansi2html Flask-Compress requests Flask-HTTPAuth +boto3 diff --git a/ci3/dashboard/rk.py b/ci3/dashboard/rk.py index aedf35a824e2..12b9a414077f 100644 --- a/ci3/dashboard/rk.py +++ b/ci3/dashboard/rk.py @@ -1,6 +1,8 @@ from flask import Flask, render_template_string, request, Response, redirect from flask_compress import Compress from flask_httpauth import HTTPBasicAuth +import boto3 +from botocore.exceptions import ClientError import gzip import json import os @@ -9,6 +11,7 @@ import shlex import subprocess import threading +import time as _time import uuid from ansi2html import Ansi2HTMLConverter from pathlib import Path @@ -19,6 +22,10 @@ hyperlink, r, get_section_data, get_list_as_string ) LOGS_DISK_PATH = os.getenv('LOGS_DISK_PATH', '/logs-disk') +S3_LOGS_BUCKET = os.getenv('S3_LOGS_BUCKET', 'aztec-ci-artifacts') +S3_LOGS_PREFIX = os.getenv('S3_LOGS_PREFIX', 'logs') + +_s3 = boto3.client('s3', region_name='us-east-2') DASHBOARD_PASSWORD = os.getenv('DASHBOARD_PASSWORD', 'password') CI_METRICS_PORT = int(os.getenv('CI_METRICS_PORT', '8081')) CI_METRICS_URL = os.getenv('CI_METRICS_URL', f'http://localhost:{CI_METRICS_PORT}') @@ -27,37 +34,47 @@ Compress(app) auth = HTTPBasicAuth() -# Start the ci-metrics server as a subprocess -# Check sibling dir (repo layout) then subdirectory (Docker layout) +# 
Start the ci-metrics server as a subprocess (once across all workers). +# Uses a file lock so only the first gunicorn worker to import this module +# actually spawns the process; the rest skip silently. +import fcntl +import signal + _ci_metrics_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'ci-metrics') if not os.path.isdir(_ci_metrics_dir): _ci_metrics_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ci-metrics') if os.path.isdir(_ci_metrics_dir): - # Kill any stale process on the port (e.g. leftover from previous reload) - import signal + _lock_path = f'/tmp/ci-metrics-{CI_METRICS_PORT}.lock' try: - out = subprocess.check_output( - ['lsof', '-ti', f':{CI_METRICS_PORT}'], stderr=subprocess.DEVNULL, text=True) - for pid in out.strip().split('\n'): - if pid: - os.kill(int(pid), signal.SIGTERM) - import time; time.sleep(0.5) - except (subprocess.CalledProcessError, OSError): + _lock_fd = open(_lock_path, 'w') + fcntl.flock(_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + # We hold the lock — kill stale process and spawn fresh one + try: + out = subprocess.check_output( + ['lsof', '-ti', f':{CI_METRICS_PORT}'], stderr=subprocess.DEVNULL, text=True) + for pid in out.strip().split('\n'): + if pid: + os.kill(int(pid), signal.SIGTERM) + _time.sleep(0.5) + except (subprocess.CalledProcessError, OSError): + pass + _ci_metrics_env = {**os.environ, 'CI_METRICS_PORT': str(CI_METRICS_PORT)} + subprocess.Popen( + ['gunicorn', '-w', '1', '-b', f'0.0.0.0:{CI_METRICS_PORT}', + '--timeout', '120', 'app:app'], + cwd=_ci_metrics_dir, + env=_ci_metrics_env, + ) + print(f"[rk.py] ci-metrics server started on port {CI_METRICS_PORT}") + # Hold the lock until this process exits so other workers skip + except OSError: + # Another worker already holds the lock — nothing to do pass - _ci_metrics_env = {**os.environ, 'CI_METRICS_PORT': str(CI_METRICS_PORT)} - subprocess.Popen( - ['gunicorn', '-w', '4', '-b', f'0.0.0.0:{CI_METRICS_PORT}', '--timeout', '120', 
'app:app'], - cwd=_ci_metrics_dir, - env=_ci_metrics_env, - ) - print(f"[rk.py] ci-metrics server started on port {CI_METRICS_PORT}") def read_from_disk(key): - """Read log from disk as fallback when Redis key not found.""" + """Read log from disk.""" try: - # Use first 4 chars as subdirectory prefix = key[:4] - log_file = f"/logs-disk/{prefix}/{key}.log.gz" log_file = f"{LOGS_DISK_PATH}/{prefix}/{key}.log.gz" if os.path.exists(log_file): with gzip.open(log_file, 'rb') as f: @@ -66,6 +83,20 @@ def read_from_disk(key): print(f"Error reading from disk: {e}") return None +def read_from_s3(key): + """Read log from S3 (fallback when Redis and disk both miss).""" + try: + prefix = key[:4] + s3_key = f"{S3_LOGS_PREFIX}/{prefix}/{key}.log.gz" + obj = _s3.get_object(Bucket=S3_LOGS_BUCKET, Key=s3_key) + return gzip.decompress(obj['Body'].read()).decode('utf-8', errors='replace') + except ClientError as e: + if e.response['Error']['Code'] != 'NoSuchKey': + print(f"S3 error reading {key}: {e}") + except Exception as e: + print(f"Error reading from S3: {e}") + return None + def read_breakdown_from_disk(runtime, flow_name, sha): """Read benchmark breakdown JSON from disk.""" try: @@ -178,7 +209,6 @@ def root() -> str: f"{hyperlink('/cost-overview', 'cost overview (AWS + GCP)')}\n" f"{hyperlink('/namespace-billing', 'namespace billing')}\n" f"{hyperlink('/ci-insights', 'ci insights')}\n" - f"{hyperlink('/test-timings', 'test timings')}\n" f"{RESET}" ) @@ -528,32 +558,32 @@ def make_options(param_name, options, current_value, suffix=''): _proxy_session = requests.Session() _HOP_BY_HOP = frozenset([ 'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', - 'te', 'trailers', 'transfer-encoding', 'upgrade', 'content-length', - # `requests` auto-decompresses gzip responses, so Content-Encoding is - # stale — strip it so the browser doesn't try to decompress plain content. - # Flask-Compress on rkapp handles browser compression. 
- 'content-encoding', + 'te', 'trailers', 'transfer-encoding', 'upgrade', ]) -# Don't forward Accept-Encoding — let `requests` negotiate with ci-metrics -# (it adds its own and auto-decompresses). -_STRIP_REQUEST_HEADERS = frozenset(['host', 'accept-encoding']) +_STRIP_REQUEST_HEADERS = frozenset(['host']) def _proxy(path): - """Forward request to ci-metrics, streaming the response back.""" + """Forward request to ci-metrics, streaming the response back. + + Passes the browser's Accept-Encoding through to ci-metrics so it + compresses directly for the browser. We stream the raw (still + compressed) bytes back without decompression. + """ url = f'{CI_METRICS_URL}/{path.lstrip("/")}' try: + fwd_headers = {k: v for k, v in request.headers if k.lower() not in _STRIP_REQUEST_HEADERS} resp = _proxy_session.request( method=request.method, url=url, params=request.args, data=request.get_data(), - headers={k: v for k, v in request.headers if k.lower() not in _STRIP_REQUEST_HEADERS}, + headers=fwd_headers, stream=True, - timeout=60, + timeout=180, ) - # Strip hop-by-hop headers + # Stream raw bytes (skip requests auto-decompression) headers = {k: v for k, v in resp.headers.items() if k.lower() not in _HOP_BY_HOP} - return Response(resp.iter_content(chunk_size=8192), + return Response(resp.raw.stream(8192), status=resp.status_code, headers=headers) except Exception as e: return Response(json.dumps({'error': f'ci-metrics unavailable: {e}'}), @@ -564,10 +594,13 @@ def _proxy(path): @app.route('/ci-insights') @app.route('/cost-overview') @app.route('/test-timings') +@app.route('/ci-health-report') +@app.route('/flake-prs') @auth.login_required def proxy_dashboard(): return _proxy(request.path) + @app.route('/api/', methods=['GET', 'POST', 'PUT', 'DELETE']) @auth.login_required def proxy_api(path): @@ -583,11 +616,13 @@ def get_value(key): value = r.get(key) if value is None: - # Try disk fallback value = read_from_disk(key) - if value is None: - value = "Key not found" - else: + 
if value is None: + value = read_from_s3(key) + if value is None: + value = "Key not found" + elif isinstance(value, bytes): + # Redis returns raw bytes — decompress if gzip. try: if value.startswith(b"\x1f\x8b"): value = gzip.decompress(value).decode() diff --git a/l1-contracts/script/deploy/DeploymentConfiguration.sol b/l1-contracts/script/deploy/DeploymentConfiguration.sol index 1d27ac7aa72f..1a40c72aac4e 100644 --- a/l1-contracts/script/deploy/DeploymentConfiguration.sol +++ b/l1-contracts/script/deploy/DeploymentConfiguration.sol @@ -143,7 +143,7 @@ contract DeploymentConfiguration is IDeploymentConfiguration, Test { }), votingDelay: Timestamp.wrap(3 * 24 * 60 * 60), votingDuration: Timestamp.wrap(7 * 24 * 60 * 60), - executionDelay: Timestamp.wrap(7 * 24 * 60 * 60), + executionDelay: Timestamp.wrap(30 * 24 * 60 * 60), gracePeriod: Timestamp.wrap(7 * 24 * 60 * 60), quorum: 0.2e18, requiredYeaMargin: 0.33e18, diff --git a/l1-contracts/script/deploy/RollupConfiguration.sol b/l1-contracts/script/deploy/RollupConfiguration.sol index ed58853bef49..1ea7bdb1e452 100644 --- a/l1-contracts/script/deploy/RollupConfiguration.sol +++ b/l1-contracts/script/deploy/RollupConfiguration.sol @@ -71,7 +71,7 @@ contract RollupConfiguration is IRollupConfiguration, Test { uint16 sequencerBps; uint96 checkpointReward; sequencerBps = 7000; - checkpointReward = 400e18; + checkpointReward = 500e18; return RewardConfig({ rewardDistributor: _rewardDistributor, diff --git a/noir-projects/noir-contracts/contracts/app/auth_contract/src/main.nr b/noir-projects/noir-contracts/contracts/app/auth_contract/src/main.nr index 4f2b7dbeeb68..836d8b4af942 100644 --- a/noir-projects/noir-contracts/contracts/app/auth_contract/src/main.nr +++ b/noir-projects/noir-contracts/contracts/app/auth_contract/src/main.nr @@ -13,8 +13,8 @@ pub contract Auth { }; // docs:start:delayed_public_mutable_storage - // Authorizing a new address has a certain delay before it goes into effect. Set to 180 seconds. 
- pub(crate) global CHANGE_AUTHORIZED_DELAY: u64 = 180; + // Authorizing a new address has a certain delay before it goes into effect. Set to 360 seconds which is 5 slots. + pub(crate) global CHANGE_AUTHORIZED_DELAY: u64 = 360; #[storage] struct Storage { diff --git a/scripts/backport_to_staging.sh b/scripts/backport_to_staging.sh index 60fac941bc71..28d398ebfade 100755 --- a/scripts/backport_to_staging.sh +++ b/scripts/backport_to_staging.sh @@ -141,17 +141,16 @@ fi # Commit changes - base the commit details off of the PR title and body echo "Diff applied successfully! Committing changes..." -git config user.name "$PR_AUTHOR" -git config user.email "$PR_AUTHOR_EMAIL" - # Ensure commit subject contains PR reference for get_meaningful_commits COMMIT_SUBJECT="$PR_TITLE" if ! echo "$COMMIT_SUBJECT" | grep -qE '\(#[0-9]+\)'; then COMMIT_SUBJECT="$COMMIT_SUBJECT (#$PR_NUMBER)" fi +# Use --author to preserve original PR author while keeping the committer +# as whoever runs the script (so GPG signing works for local devs). git add -A -git commit -m "$COMMIT_SUBJECT +git commit --author="$PR_AUTHOR <$PR_AUTHOR_EMAIL>" -m "$COMMIT_SUBJECT $PR_BODY" diff --git a/spartan/.gitignore b/spartan/.gitignore index 792fa0ebb8b7..6fbca9faac92 100644 --- a/spartan/.gitignore +++ b/spartan/.gitignore @@ -29,4 +29,5 @@ environments/* !environments/tps-scenario.env !environments/kind-minimal.env !environments/kind-provers.env +!environments/alpha-net.env *.tfvars diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh index d769085d45ad..45e2144182df 100755 --- a/spartan/bootstrap.sh +++ b/spartan/bootstrap.sh @@ -99,6 +99,12 @@ function run_network_tests { source_network_env "$env_file" gcp_auth export SCENARIO_TESTS=1 + # Retrieve the admin API key stored as a K8s Secret during deployment. + # Exported so the test runner can authenticate against the admin RPC endpoint. 
+ export AZTEC_ADMIN_API_KEY + AZTEC_ADMIN_API_KEY=$(kubectl get secret aztec-admin-api-key \ + --namespace "$NAMESPACE" \ + -o jsonpath='{.data.key}' 2>/dev/null | base64 -d 2>/dev/null || true) local failed=() for test_file in "$@"; do echo_header "Running $test_file" diff --git a/spartan/environments/alpha-net.env b/spartan/environments/alpha-net.env new file mode 100644 index 000000000000..ba4758a6acc7 --- /dev/null +++ b/spartan/environments/alpha-net.env @@ -0,0 +1,89 @@ +NAMESPACE=${NAMESPACE:-alpha-net} +CLUSTER=aztec-gke-private +GCP_REGION=us-west1-a +DESTROY_NAMESPACE=true +DESTROY_ETH_DEVNET=true +CREATE_ETH_DEVNET=${CREATE_ETH_DEVNET:-true} +AZTEC_EPOCH_DURATION=8 +AZTEC_SLOT_DURATION=72 +AZTEC_PROOF_SUBMISSION_EPOCHS=2 +ETHEREUM_CHAIN_ID=1337 +LABS_INFRA_MNEMONIC="test test test test test test test test test test test junk" +FUNDING_PRIVATE_KEY="0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" +# CREATE_CHAOS_MESH=true + +# Install chaos mesh peer isolation after Aztec infra deploys. Validators, +# RPC nodes, and prover nodes can only peer with full-nodes, not each other. +# Requires P2P_PUBLIC_IP=false so P2P uses pod IPs that iptables rules can match. 
+P2P_PUBLIC_IP=false +CHAOS_MESH_SCENARIOS_FILE=network-requirements.yaml + +AZTEC_MANA_TARGET=2147483647 + +P2P_TX_POOL_DELETE_TXS_AFTER_REORG=true + +# For mbps +SEQ_BUILD_CHECKPOINT_IF_EMPTY=true +SEQ_BLOCK_DURATION_MS=6000 +SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT=5 + +CREATE_ROLLUP_CONTRACTS=true +REDEPLOY_ROLLUP_CONTRACTS=true +VERIFY_CONTRACTS=false +DESTROY_AZTEC_INFRA=true + +AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET=1 +AZTEC_LAG_IN_EPOCHS_FOR_RANDAO=1 + +OTEL_COLLECTOR_ENDPOINT=REPLACE_WITH_GCP_SECRET + +VALIDATOR_REPLICAS=12 +VALIDATORS_PER_NODE=4 +PUBLISHERS_PER_VALIDATOR_KEY=2 +VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 +VALIDATOR_RESOURCE_PROFILE="2-core-dedicated" + +REAL_VERIFIER=false + +RPC_REPLICAS=12 +RPC_INGRESS_ENABLED=false + +FULL_NODE_REPLICAS=500 +FULL_NODE_RESOURCE_PROFILE="2-core-spot" + +PUBLISHERS_PER_PROVER=2 +PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 +PROVER_REPLICAS=128 +PROVER_RESOURCE_PROFILE="hi-tps" +PROVER_AGENT_POLL_INTERVAL_MS=10000 + +RUN_TESTS=false + +PROVER_TEST_DELAY_TYPE=fixed + +AZTEC_SLASHING_ROUND_SIZE_IN_EPOCHS=1 +AZTEC_SLASHING_QUORUM=5 +AZTEC_SLASHING_EXECUTION_DELAY_IN_ROUNDS=0 +AZTEC_SLASHING_OFFSET_IN_ROUNDS=1 +AZTEC_LOCAL_EJECTION_THRESHOLD=90000000000000000000 + +SEQ_MAX_TX_PER_BLOCK=10 +SEQ_MIN_TX_PER_BLOCK=0 + +# Override L1 tx utils bump percentages for scenario tests +VALIDATOR_L1_PRIORITY_FEE_BUMP_PERCENTAGE=0 +VALIDATOR_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE=0 +PROVER_L1_PRIORITY_FEE_BUMP_PERCENTAGE=0 +PROVER_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE=0 + +# Enable latency mesaruement for p2p messages +DEBUG_P2P_INSTRUMENT_MESSAGES=true + +# Inject artificial delay of proof verification for all nodes +PROVER_TEST_VERIFICATION_DELAY_MS=250 + +# Reduce the amount of metrics produced by prover agents and full nodes +PROVER_AGENT_INCLUDE_METRICS="aztec.circuit" +FULL_NODE_INCLUDE_METRICS="aztec.p2p.gossip.agg_" +LOG_LEVEL=info + diff --git a/spartan/environments/network-defaults.yml 
b/spartan/environments/network-defaults.yml index 9291bc82795c..9eefd071c7ee 100644 --- a/spartan/environments/network-defaults.yml +++ b/spartan/environments/network-defaults.yml @@ -40,7 +40,7 @@ l1-contracts: &l1-contracts-defaults # How many seconds an L1 slot lasts (Ethereum consensus layer). ETHEREUM_SLOT_DURATION: 12 # How many seconds an L2 slot lasts (must be multiple of L1 slot duration). - AZTEC_SLOT_DURATION: 36 + AZTEC_SLOT_DURATION: 72 # How many L2 slots in an epoch. AZTEC_EPOCH_DURATION: 32 @@ -138,9 +138,9 @@ slasher: &slasher # Penalty for attesting to a descendant of an invalid block. SLASH_ATTEST_DESCENDANT_OF_INVALID_PENALTY: 10e18 # Penalty for proposing two different block or checkpoint proposal for the same position. - SLASH_DUPLICATE_PROPOSAL_PENALTY: 10e18 + SLASH_DUPLICATE_PROPOSAL_PENALTY: 0 # Penalty for signing attestations for different proposals at the same slot. - SLASH_DUPLICATE_ATTESTATION_PENALTY: 10e18 + SLASH_DUPLICATE_ATTESTATION_PENALTY: 0 # Penalty for unknown offenses. SLASH_UNKNOWN_PENALTY: 10e18 # Penalty for broadcasting an invalid block. @@ -171,8 +171,13 @@ _prodlike: &prodlike #--------------------------------------------------------------------------- # Minimum transactions to include in a block. SEQ_MIN_TX_PER_BLOCK: 0 + # Maximum transactions to include in a block # Build checkpoint even if block is empty. 
SEQ_BUILD_CHECKPOINT_IF_EMPTY: true + # 6 second block times + SEQ_BLOCK_DURATION_MS: 6000 + # Time allocated for publishing to L1 + SEQ_L1_PUBLISHING_TIME_ALLOWANCE_IN_SLOT: 36 # 3 Ethereum slots #--------------------------------------------------------------------------- # Database Map Sizes (in KB) @@ -216,7 +221,7 @@ networks: SPONSORED_FPC: true # Fund sponsored FPC with fee juice TRANSACTIONS_DISABLED: false # Sequencer - SEQ_MAX_TX_PER_BLOCK: 32 + SEQ_MAX_TX_PER_BLOCK: 18 # Prover PROVER_REAL_PROOFS: false # Use mock proofs PXE_PROVER_ENABLED: false # Disable PXE proving @@ -260,7 +265,11 @@ networks: AZTEC_SLASHING_QUORUM: 33 AZTEC_GOVERNANCE_PROPOSER_ROUND_SIZE: 100 AZTEC_GOVERNANCE_PROPOSER_QUORUM: 60 - AZTEC_MANA_TARGET: 150000000 + AZTEC_MANA_TARGET: 75000000 + AZTEC_PROVING_COST_PER_MANA: 25000000 + AZTEC_SLASH_AMOUNT_SMALL: 100000e18 + AZTEC_SLASH_AMOUNT_MEDIUM: 100000e18 + AZTEC_SLASH_AMOUNT_LARGE: 100000e18 # Network identity L1_CHAIN_ID: 11155111 # Sepolia # Genesis state @@ -268,7 +277,8 @@ networks: SPONSORED_FPC: true TRANSACTIONS_DISABLED: false # Sequencer - SEQ_MAX_TX_PER_BLOCK: 8 + # Gives ~0.1 TPS @ 72s slot time, 36s publish time, 6s block time - max 4 blocks per slot + SEQ_MAX_TX_PER_BLOCK: 2 # Prover PROVER_REAL_PROOFS: true # P2P @@ -291,10 +301,9 @@ networks: <<: *prodlike # L1 contract overrides - production parameters AZTEC_SLOT_DURATION: 72 - AZTEC_TARGET_COMMITTEE_SIZE: 24 AZTEC_ACTIVATION_THRESHOLD: 200000e18 AZTEC_EJECTION_THRESHOLD: 100000e18 - AZTEC_LOCAL_EJECTION_THRESHOLD: 196000e18 + AZTEC_LOCAL_EJECTION_THRESHOLD: 162000e18 AZTEC_SLASH_AMOUNT_SMALL: 2000e18 AZTEC_SLASH_AMOUNT_MEDIUM: 2000e18 AZTEC_SLASH_AMOUNT_LARGE: 2000e18 @@ -304,10 +313,15 @@ networks: AZTEC_SLASHING_QUORUM: 65 AZTEC_GOVERNANCE_PROPOSER_QUORUM: 600 AZTEC_GOVERNANCE_PROPOSER_ROUND_SIZE: 1000 - AZTEC_MANA_TARGET: 0 - AZTEC_PROVING_COST_PER_MANA: 0 + AZTEC_MANA_TARGET: 75000000 + AZTEC_PROVING_COST_PER_MANA: 25000000 AZTEC_EXIT_DELAY_SECONDS: 345600 # 4 
days AZTEC_SLASHING_DISABLE_DURATION: 259200 # 3 days + AZTEC_ENTRY_QUEUE_BOOTSTRAP_VALIDATOR_SET_SIZE: 500 + AZTEC_ENTRY_QUEUE_BOOTSTRAP_FLUSH_SIZE: 500 + AZTEC_ENTRY_QUEUE_FLUSH_SIZE_MIN: 1 + AZTEC_ENTRY_QUEUE_FLUSH_SIZE_QUOTIENT: 400 + AZTEC_ENTRY_QUEUE_MAX_FLUSH_SIZE: 4 # Network identity L1_CHAIN_ID: 1 # Ethereum mainnet # Genesis state - no test accounts, no sponsored FPC diff --git a/spartan/environments/next-net.env b/spartan/environments/next-net.env index 73b5331fbb89..1dbe575eae9f 100644 --- a/spartan/environments/next-net.env +++ b/spartan/environments/next-net.env @@ -24,11 +24,17 @@ TX_COLLECTION_FILE_STORE_URLS="https://aztec-labs-snapshots.com/${TX_FILE_STORE_ R2_ACCESS_KEY_ID=REPLACE_WITH_GCP_SECRET R2_SECRET_ACCESS_KEY=REPLACE_WITH_GCP_SECRET PROVER_FAILED_PROOF_STORE=gs://aztec-develop/next-net/failed-proofs +L1_TX_FAILED_STORE=gs://aztec-develop/next-net/failed-l1-txs TEST_ACCOUNTS=true SPONSORED_FPC=true + SEQ_MIN_TX_PER_BLOCK=0 -SEQ_MAX_TX_PER_BLOCK=8 -AZTEC_EPOCH_DURATION=32 +# Gives ~0.1 TPS @ 72s slot time, 36s publish time, 6s block time - max 4 blocks per slot +SEQ_MAX_TX_PER_BLOCK=2 + +# Build checkpoint even if block is empty. 
+SEQ_BUILD_CHECKPOINT_IF_EMPTY=true +SEQ_BLOCK_DURATION_MS=6000 AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET=2 AZTEC_LAG_IN_EPOCHS_FOR_RANDAO=2 @@ -60,3 +66,12 @@ RPC_INGRESS_SSL_CERT_NAMES='["nextnet-rpc-cert"]' VALIDATOR_HA_REPLICAS=1 VALIDATOR_RESOURCE_PROFILE="prod-spot" + +REAL_VERIFIER=true +AZTEC_SLOT_DURATION=72 +AZTEC_EPOCH_DURATION=32 +AZTEC_TARGET_COMMITTEE_SIZE=48 +AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET=2 +AZTEC_LAG_IN_EPOCHS_FOR_RANDAO=2 +AZTEC_PROOF_SUBMISSION_EPOCHS=1 + diff --git a/spartan/environments/staging-public.env b/spartan/environments/staging-public.env index 35ec3b1db6fb..100900a664b0 100644 --- a/spartan/environments/staging-public.env +++ b/spartan/environments/staging-public.env @@ -17,13 +17,21 @@ ETHERSCAN_API_KEY=REPLACE_WITH_GCP_SECRET DEPLOY_INTERNAL_BOOTNODE=true SNAPSHOT_BUCKET_DIRECTORY=${SNAPSHOT_BUCKET_DIRECTORY:-staging-public} BLOB_BUCKET_DIRECTORY=${BLOB_BUCKET_DIRECTORY:-staging-public/blobs} +TX_FILE_STORE_ENABLED=true +TX_FILE_STORE_BUCKET_DIRECTORY=${TX_FILE_STORE_BUCKET_DIRECTORY:-staging-public/txs} +TX_COLLECTION_FILE_STORE_URLS="https://aztec-labs-snapshots.com/${TX_FILE_STORE_BUCKET_DIRECTORY}" R2_ACCESS_KEY_ID=REPLACE_WITH_GCP_SECRET R2_SECRET_ACCESS_KEY=REPLACE_WITH_GCP_SECRET TEST_ACCOUNTS=false SPONSORED_FPC=true SEQ_MIN_TX_PER_BLOCK=0 -SEQ_MAX_TX_PER_BLOCK=4 +# Gives ~0.1 TPS @ 72s slot time, 36s publish time, 6s block time - max 4 blocks per slot +SEQ_MAX_TX_PER_BLOCK=2 + +# Build checkpoint even if block is empty. 
+SEQ_BUILD_CHECKPOINT_IF_EMPTY=true +SEQ_BLOCK_DURATION_MS=6000 CREATE_ROLLUP_CONTRACTS=${CREATE_ROLLUP_CONTRACTS:-false} P2P_TX_POOL_DELETE_TXS_AFTER_REORG=true diff --git a/spartan/environments/testnet.env b/spartan/environments/testnet.env index d57eefe37a96..e0dfb4b0e717 100644 --- a/spartan/environments/testnet.env +++ b/spartan/environments/testnet.env @@ -2,10 +2,6 @@ CREATE_ETH_DEVNET=false GCP_REGION=us-west1-a CLUSTER=aztec-gke-public NAMESPACE=${NAMESPACE:-testnet} -TEST_ACCOUNTS=false -SPONSORED_FPC=true -SEQ_MIN_TX_PER_BLOCK=0 -SEQ_MAX_TX_PER_BLOCK=8 NETWORK=testnet REAL_VERIFIER=true @@ -27,7 +23,11 @@ AZTEC_SLASHING_OFFSET_IN_ROUNDS=2 AZTEC_SLASHING_LIFETIME_IN_ROUNDS=5 AZTEC_SLASHING_EXECUTION_DELAY_IN_ROUNDS=2 AZTEC_SLASHING_VETOER=\"0xdfe19Da6a717b7088621d8bBB66be59F2d78e924\" -AZTEC_MANA_TARGET=150000000 +AZTEC_MANA_TARGET=75000000 +AZTEC_PROVING_COST_PER_MANA=25000000 +AZTEC_SLASH_AMOUNT_SMALL=100000e18 +AZTEC_SLASH_AMOUNT_MEDIUM=100000e18 +AZTEC_SLASH_AMOUNT_LARGE=100000e18 AZTEC_ACTIVATION_THRESHOLD=200000e18 AZTEC_EJECTION_THRESHOLD=100000e18 AZTEC_GOVERNANCE_PROPOSER_ROUND_SIZE=100 @@ -48,6 +48,9 @@ ETHERSCAN_API_KEY=REPLACE_WITH_GCP_SECRET SNAPSHOT_BUCKET_DIRECTORY=${SNAPSHOT_BUCKET_DIRECTORY:-testnet} BLOB_BUCKET_DIRECTORY=${BLOB_BUCKET_DIRECTORY:-testnet/blobs} +TX_FILE_STORE_ENABLED=true +TX_FILE_STORE_BUCKET_DIRECTORY=${TX_FILE_STORE_BUCKET_DIRECTORY:-testnet/txs} +TX_COLLECTION_FILE_STORE_URLS="https://aztec-labs-snapshots.com/${TX_FILE_STORE_BUCKET_DIRECTORY}" R2_ACCESS_KEY_ID=REPLACE_WITH_GCP_SECRET R2_SECRET_ACCESS_KEY=REPLACE_WITH_GCP_SECRET DEPLOY_INTERNAL_BOOTNODE=false @@ -69,9 +72,11 @@ RPC_INGRESS_SSL_CERT_NAMES='["testnet-rpc-cert"]' VALIDATOR_REPLICAS=4 -VALIDATORS_PER_NODE=20 -PUBLISHERS_PER_VALIDATOR_KEY=2 +VALIDATORS_PER_NODE=64 +PUBLISHERS_PER_VALIDATOR_KEY=1 VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 +VALIDATOR_HA_REPLICAS=1 +VALIDATOR_RESOURCE_PROFILE="prod-spot" PUBLISHERS_PER_PROVER=2 
PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 diff --git a/spartan/scripts/deploy_network.sh b/spartan/scripts/deploy_network.sh index 55af194f30b7..3f84610e781b 100755 --- a/spartan/scripts/deploy_network.sh +++ b/spartan/scripts/deploy_network.sh @@ -67,7 +67,7 @@ LABS_INFRA_INDICES=${LABS_INFRA_INDICES:-0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,1 ######################## # ROLLUP VARIABLES ######################## -CREATE_ROLLUP_CONTRACTS=${CREATE_ROLLUP_CONTRACTS:-false} +CREATE_ROLLUP_CONTRACTS=${CREATE_ROLLUP_CONTRACTS:-true} SPONSORED_FPC=${SPONSORED_FPC:-true} TEST_ACCOUNTS=${TEST_ACCOUNTS:-false} REAL_VERIFIER=${REAL_VERIFIER:-true} @@ -196,6 +196,12 @@ P2P_GOSSIPSUB_DHI=${P2P_GOSSIPSUB_DHI:-12} P2P_DROP_TX_CHANCE=${P2P_DROP_TX_CHANCE:-0} +# Chaos mesh scenarios values file (e.g., "network-requirements.yaml") +# If set, the experiment is installed after Aztec infra, rules are injected, +# then all pods are restarted so they come up clean with partition rules active. +# Requires the chaos mesh operator to already be running (see deploy_chaos_mesh.sh). +CHAOS_MESH_SCENARIOS_FILE=${CHAOS_MESH_SCENARIOS_FILE:-} + # Compute validator addresses (skip if no validators) if [[ $VALIDATOR_REPLICAS -gt 0 ]]; then VALIDATOR_ADDRESSES=$(echo "$VALIDATOR_INDICES" | tr ',' '\n' | xargs -I{} cast wallet address --mnemonic "$LABS_INFRA_MNEMONIC" --mnemonic-index {} | tr '\n' ',' | sed 's/,$//') @@ -458,6 +464,20 @@ else fi +# ------------------------------- +# Generate admin API key +# ------------------------------- +# Generate a fresh key on every deploy; the hash goes to validators and the +# raw key is stored as a K8s Secret for the test runner to retrieve later. +# The raw key is never logged. 
+ADMIN_API_KEY=$(openssl rand -hex 32) +ADMIN_API_KEY_HASH=$(printf '%s' "$ADMIN_API_KEY" | sha256sum | cut -d' ' -f1) +kubectl create secret generic aztec-admin-api-key \ + --from-literal=key="$ADMIN_API_KEY" \ + --namespace "${NAMESPACE}" \ + --dry-run=client -o yaml | kubectl apply -f - +unset ADMIN_API_KEY + # ------------------------------- # Deploy Aztec infra # ------------------------------- @@ -611,12 +631,54 @@ PROVER_AGENT_PROOF_TYPES = ${PROVER_AGENT_PROOF_TYPES:-[]} DEBUG_FORCE_TX_PROOF_VERIFICATION = ${DEBUG_FORCE_TX_PROOF_VERIFICATION:-false} WAIT_FOR_PROVER_DEPLOY = ${WAIT_FOR_PROVER_DEPLOY:-null} +ADMIN_API_KEY_HASH = "${ADMIN_API_KEY_HASH}" EOF k8s_denoise "tf_run "${DEPLOY_AZTEC_INFRA_DIR}" "${DESTROY_AZTEC_INFRA}" "${CREATE_AZTEC_INFRA}"" STAGE_TIMINGS[aztec_infra]=$(($(date +%s) - AZTEC_INFRA_START)) log "Deployed aztec infra" +# ------------------------------------------------------- +# Optionally install chaos mesh scenarios after Aztec infra +# ------------------------------------------------------- +# Chaos Mesh resolves pod selectors at experiment creation time, so the target +# pods must already exist. The chaos-daemon injects iptables DROP rules into +# each matched pod's network namespace. For partition experiments, this +# immediately blocks packets between the partitioned pods, causing existing +# TCP connections to timeout and preventing new ones from forming. +# +# IMPORTANT: Do NOT restart pods after chaos injection. Chaos Mesh does not +# automatically re-inject rules into recreated pods, leaving them unpartitioned. 
+if [[ -n "${CHAOS_MESH_SCENARIOS_FILE}" ]]; then + CHAOS_SCENARIOS_DIR="${SCRIPT_DIR}/../aztec-chaos-scenarios" + log "Installing chaos mesh scenarios from ${CHAOS_MESH_SCENARIOS_FILE}" + helm upgrade --install network-shaping "${CHAOS_SCENARIOS_DIR}" \ + --namespace "${NAMESPACE}" \ + --values "${CHAOS_SCENARIOS_DIR}/values/${CHAOS_MESH_SCENARIOS_FILE}" \ + --set "global.targetNamespace=${NAMESPACE}" \ + --wait --timeout=5m + log "Chaos mesh scenarios installed, waiting for rules to be injected..." + + # Wait for all NetworkChaos experiments to have their rules injected. + # The AllInjected condition confirms iptables rules are active on every matched pod. + CHAOS_WAIT_TIMEOUT=120 + CHAOS_WAITED=0 + while true; do + NOT_INJECTED=$(kubectl get networkchaos -n "${NAMESPACE}" -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="AllInjected")].status}{"\n"}{end}' 2>/dev/null | grep -c "False" || true) + if [[ "${NOT_INJECTED}" -eq 0 ]]; then + log "All chaos mesh rules injected" + break + fi + if [[ "${CHAOS_WAITED}" -ge "${CHAOS_WAIT_TIMEOUT}" ]]; then + log "WARNING: Timed out waiting for chaos mesh injection after ${CHAOS_WAIT_TIMEOUT}s (${NOT_INJECTED} experiments not yet injected)" + break + fi + sleep 5 + CHAOS_WAITED=$((CHAOS_WAITED + 5)) + done + log "Chaos mesh partition active — existing connections will break as packets are dropped" +fi + # Calculate total deployment time DEPLOY_END_TIME=$(date +%s) TOTAL_DEPLOY_TIME=$((DEPLOY_END_TIME - DEPLOY_START_TIME)) diff --git a/spartan/terraform/deploy-aztec-infra/main.tf b/spartan/terraform/deploy-aztec-infra/main.tf index b00d4f8447f6..bf07026f31d8 100644 --- a/spartan/terraform/deploy-aztec-infra/main.tf +++ b/spartan/terraform/deploy-aztec-infra/main.tf @@ -232,6 +232,7 @@ locals { "validator.node.env.WS_NUM_HISTORIC_CHECKPOINTS" = var.WS_NUM_HISTORIC_CHECKPOINTS "validator.node.env.TX_COLLECTION_FILE_STORE_URLS" = var.TX_COLLECTION_FILE_STORE_URLS 
"validator.node.env.SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT" = var.SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT + "validator.node.adminApiKeyHash" = var.ADMIN_API_KEY_HASH } # Note: nonsensitive() is required here because helm_releases is used in for_each, diff --git a/spartan/terraform/deploy-aztec-infra/variables.tf b/spartan/terraform/deploy-aztec-infra/variables.tf index 790bf238a9ff..fadca17716b5 100644 --- a/spartan/terraform/deploy-aztec-infra/variables.tf +++ b/spartan/terraform/deploy-aztec-infra/variables.tf @@ -229,6 +229,12 @@ variable "VALIDATOR_HA_REPLICAS" { default = 0 } +variable "ADMIN_API_KEY_HASH" { + description = "SHA-256 hex hash of the admin API key. When set, enables admin API authentication on validator nodes. Leave empty to disable admin auth (default)." + type = string + default = "" +} + variable "PROVER_MNEMONIC" { description = "The prover mnemonic" type = string diff --git a/yarn-project/archiver/src/l1/data_retrieval.ts b/yarn-project/archiver/src/l1/data_retrieval.ts index 4f5a529f1aae..599a33bc1bfb 100644 --- a/yarn-project/archiver/src/l1/data_retrieval.ts +++ b/yarn-project/archiver/src/l1/data_retrieval.ts @@ -265,6 +265,9 @@ async function processCheckpointProposedLogs( checkpointNumber, expectedHashes, ); + const { timestamp, parentBeaconBlockRoot } = await getL1Block(publicClient, log.l1BlockNumber); + const l1 = new L1PublishedData(log.l1BlockNumber, timestamp, log.l1BlockHash.toString()); + const checkpointBlobData = await getCheckpointBlobDataFromBlobs( blobClient, checkpoint.blockHash, @@ -272,12 +275,8 @@ async function processCheckpointProposedLogs( checkpointNumber, logger, isHistoricalSync, - ); - - const l1 = new L1PublishedData( - log.l1BlockNumber, - await getL1BlockTime(publicClient, log.l1BlockNumber), - log.l1BlockHash.toString(), + parentBeaconBlockRoot, + timestamp, ); retrievedCheckpoints.push({ ...checkpoint, checkpointBlobData, l1, chainId, version }); @@ -298,9 +297,12 @@ async function processCheckpointProposedLogs( 
return retrievedCheckpoints; } -export async function getL1BlockTime(publicClient: ViemPublicClient, blockNumber: bigint): Promise { +export async function getL1Block( + publicClient: ViemPublicClient, + blockNumber: bigint, +): Promise<{ timestamp: bigint; parentBeaconBlockRoot: string | undefined }> { const block = await publicClient.getBlock({ blockNumber, includeTransactions: false }); - return block.timestamp; + return { timestamp: block.timestamp, parentBeaconBlockRoot: block.parentBeaconBlockRoot }; } export async function getCheckpointBlobDataFromBlobs( @@ -310,8 +312,14 @@ export async function getCheckpointBlobDataFromBlobs( checkpointNumber: CheckpointNumber, logger: Logger, isHistoricalSync: boolean, + parentBeaconBlockRoot?: string, + l1BlockTimestamp?: bigint, ): Promise { - const blobBodies = await blobClient.getBlobSidecar(blockHash, blobHashes, { isHistoricalSync }); + const blobBodies = await blobClient.getBlobSidecar(blockHash, blobHashes, { + isHistoricalSync, + parentBeaconBlockRoot, + l1BlockTimestamp, + }); if (blobBodies.length === 0) { throw new NoBlobBodiesFoundError(checkpointNumber); } diff --git a/yarn-project/aztec-node/src/aztec-node/server.ts b/yarn-project/aztec-node/src/aztec-node/server.ts index 99481ba2373e..907a6a2c5e2c 100644 --- a/yarn-project/aztec-node/src/aztec-node/server.ts +++ b/yarn-project/aztec-node/src/aztec-node/server.ts @@ -79,7 +79,6 @@ import { import type { DebugLogStore, LogFilter, SiloedTag, Tag, TxScopedL2Log } from '@aztec/stdlib/logs'; import { InMemoryDebugLogStore, NullDebugLogStore } from '@aztec/stdlib/logs'; import { InboxLeaf, type L1ToL2MessageSource } from '@aztec/stdlib/messaging'; -import { P2PClientType } from '@aztec/stdlib/p2p'; import type { Offense, SlashPayloadRound } from '@aztec/stdlib/slashing'; import type { NullifierLeafPreimage, PublicDataTreeLeaf, PublicDataTreeLeafPreimage } from '@aztec/stdlib/trees'; import { MerkleTreeId, NullifierMembershipWitness, PublicDataWitness } from 
'@aztec/stdlib/trees'; @@ -194,7 +193,7 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { logger?: Logger; publisher?: SequencerPublisher; dateProvider?: DateProvider; - p2pClientDeps?: P2PClientDeps; + p2pClientDeps?: P2PClientDeps; proverNodeDeps?: Partial; } = {}, options: { @@ -325,9 +324,13 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { const proofVerifier = new QueuedIVCVerifier(config, circuitVerifier); + const proverOnly = config.enableProverNode && config.disableValidator; + if (proverOnly) { + log.info('Starting in prover-only mode: skipping validator, sequencer, sentinel, and slasher subsystems'); + } + // create the tx pool and the p2p client, which will need the l2 block source const p2pClient = await createP2PClient( - P2PClientType.Full, config, archiver, proofVerifier, @@ -342,7 +345,10 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { // We should really not be modifying the config object config.txPublicSetupAllowList = config.txPublicSetupAllowList ?? 
(await getDefaultAllowedSetupFunctions()); - // Create FullNodeCheckpointsBuilder for validator and non-validator block proposal handling + // We'll accumulate sentinel watchers here + const watchers: Watcher[] = []; + + // Create FullNodeCheckpointsBuilder for block proposal handling and tx validation const validatorCheckpointsBuilder = new FullNodeCheckpointsBuilder( { ...config, l1GenesisTime, slotDuration: Number(slotDuration) }, worldStateSynchronizer, @@ -351,47 +357,48 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { telemetry, ); - // We'll accumulate sentinel watchers here - const watchers: Watcher[] = []; - - // Create validator client if required - const validatorClient = await createValidatorClient(config, { - checkpointsBuilder: validatorCheckpointsBuilder, - worldState: worldStateSynchronizer, - p2pClient, - telemetry, - dateProvider, - epochCache, - blockSource: archiver, - l1ToL2MessageSource: archiver, - keyStoreManager, - blobClient, - }); - - // If we have a validator client, register it as a source of offenses for the slasher, - // and have it register callbacks on the p2p client *before* we start it, otherwise messages - // like attestations or auths will fail. 
- if (validatorClient) { - watchers.push(validatorClient); - if (!options.dontStartSequencer) { - await validatorClient.registerHandlers(); - } - } + let validatorClient: ValidatorClient | undefined; - // If there's no validator client but alwaysReexecuteBlockProposals is enabled, - // create a BlockProposalHandler to reexecute block proposals for monitoring - if (!validatorClient && config.alwaysReexecuteBlockProposals) { - log.info('Setting up block proposal reexecution for monitoring'); - createBlockProposalHandler(config, { + if (!proverOnly) { + // Create validator client if required + validatorClient = await createValidatorClient(config, { checkpointsBuilder: validatorCheckpointsBuilder, worldState: worldStateSynchronizer, + p2pClient, + telemetry, + dateProvider, epochCache, blockSource: archiver, l1ToL2MessageSource: archiver, - p2pClient, - dateProvider, - telemetry, - }).registerForReexecution(p2pClient); + keyStoreManager, + blobClient, + }); + + // If we have a validator client, register it as a source of offenses for the slasher, + // and have it register callbacks on the p2p client *before* we start it, otherwise messages + // like attestations or auths will fail. + if (validatorClient) { + watchers.push(validatorClient); + if (!options.dontStartSequencer) { + await validatorClient.registerHandlers(); + } + } + + // If there's no validator client but alwaysReexecuteBlockProposals is enabled, + // create a BlockProposalHandler to reexecute block proposals for monitoring + if (!validatorClient && config.alwaysReexecuteBlockProposals) { + log.info('Setting up block proposal reexecution for monitoring'); + createBlockProposalHandler(config, { + checkpointsBuilder: validatorCheckpointsBuilder, + worldState: worldStateSynchronizer, + epochCache, + blockSource: archiver, + l1ToL2MessageSource: archiver, + p2pClient, + dateProvider, + telemetry, + }).registerForReexecution(p2pClient); + } } // Start world state and wait for it to sync to the archiver. 
@@ -400,29 +407,33 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { // Start p2p. Note that it depends on world state to be running. await p2pClient.start(); - const validatorsSentinel = await createSentinel(epochCache, archiver, p2pClient, config); - if (validatorsSentinel && config.slashInactivityPenalty > 0n) { - watchers.push(validatorsSentinel); - } - + let validatorsSentinel: Awaited> | undefined; let epochPruneWatcher: EpochPruneWatcher | undefined; - if (config.slashPrunePenalty > 0n || config.slashDataWithholdingPenalty > 0n) { - epochPruneWatcher = new EpochPruneWatcher( - archiver, - archiver, - epochCache, - p2pClient.getTxProvider(), - validatorCheckpointsBuilder, - config, - ); - watchers.push(epochPruneWatcher); - } - - // We assume we want to slash for invalid attestations unless all max penalties are set to 0 let attestationsBlockWatcher: AttestationsBlockWatcher | undefined; - if (config.slashProposeInvalidAttestationsPenalty > 0n || config.slashAttestDescendantOfInvalidPenalty > 0n) { - attestationsBlockWatcher = new AttestationsBlockWatcher(archiver, epochCache, config); - watchers.push(attestationsBlockWatcher); + + if (!proverOnly) { + validatorsSentinel = await createSentinel(epochCache, archiver, p2pClient, config); + if (validatorsSentinel && config.slashInactivityPenalty > 0n) { + watchers.push(validatorsSentinel); + } + + if (config.slashPrunePenalty > 0n || config.slashDataWithholdingPenalty > 0n) { + epochPruneWatcher = new EpochPruneWatcher( + archiver, + archiver, + epochCache, + p2pClient.getTxProvider(), + validatorCheckpointsBuilder, + config, + ); + watchers.push(epochPruneWatcher); + } + + // We assume we want to slash for invalid attestations unless all max penalties are set to 0 + if (config.slashProposeInvalidAttestationsPenalty > 0n || config.slashAttestDescendantOfInvalidPenalty > 0n) { + attestationsBlockWatcher = new AttestationsBlockWatcher(archiver, epochCache, config); + 
watchers.push(attestationsBlockWatcher); + } } // Start p2p-related services once the archiver has completed sync @@ -844,8 +855,9 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { } await this.p2pClient!.sendTx(tx); - this.metrics.receivedTx(timer.ms(), true); - this.log.info(`Received tx ${txHash}`, { txHash }); + const duration = timer.ms(); + this.metrics.receivedTx(duration, true); + this.log.info(`Received tx ${txHash} in ${duration}ms`, { txHash }); } public async getTxReceipt(txHash: TxHash): Promise { diff --git a/yarn-project/bb-prover/src/bb/execute.ts b/yarn-project/bb-prover/src/bb/execute.ts index 06af043b4737..a2d5d2e1df73 100644 --- a/yarn-project/bb-prover/src/bb/execute.ts +++ b/yarn-project/bb-prover/src/bb/execute.ts @@ -52,6 +52,8 @@ type BBExecResult = { signal: string | undefined; }; +export const DEFAULT_BB_VERIFY_CONCURRENCY = 4; + /** * Invokes the Barretenberg binary with the provided command and args * @param pathToBB - The path to the BB binary @@ -398,7 +400,14 @@ export async function verifyProof( '--disable_zk', ...getArgs(ultraHonkFlavor), ]; - return await verifyProofInternal(pathToBB, `verify`, args, logger); + + let concurrency = DEFAULT_BB_VERIFY_CONCURRENCY; + + if (process.env.VERIFY_HARDWARE_CONCURRENCY) { + concurrency = parseInt(process.env.VERIFY_HARDWARE_CONCURRENCY, 10); + } + + return await verifyProofInternal(pathToBB, `verify`, args, logger, concurrency); } export async function verifyAvmProof( diff --git a/yarn-project/blob-client/src/client/http.test.ts b/yarn-project/blob-client/src/client/http.test.ts index c5c7d8eb832f..ceb41961c796 100644 --- a/yarn-project/blob-client/src/client/http.test.ts +++ b/yarn-project/blob-client/src/client/http.test.ts @@ -85,16 +85,22 @@ describe('HttpBlobClient', () => { return; } - if (req.url?.includes('/eth/v1/beacon/headers/')) { + if (req.url?.includes('/eth/v1/config/genesis')) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + 
res.end(JSON.stringify({ data: { genesisTime: '1000' } })); + } else if (req.url?.includes('/eth/v1/config/spec')) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ data: { secondsPerSlot: '12' } })); + } else if (req.url?.includes('/eth/v1/beacon/headers/')) { res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ data: { header: { message: { slot: latestSlotNumber } } } })); - } else if (req.url?.includes('/eth/v1/beacon/blob_sidecars/')) { + } else if (req.url?.includes('/eth/v1/beacon/blobs/')) { if (missedSlots.some(slot => req.url?.includes(`/${slot}`))) { res.writeHead(404, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ error: 'Not Found' })); } else { res.writeHead(200, { 'Content-Type': 'application/json' }); - res.end(JSON.stringify({ data: blobData })); + res.end(JSON.stringify({ data: blobData.map(b => b.blob) })); } } else { res.writeHead(404, { 'Content-Type': 'application/json' }); @@ -135,6 +141,61 @@ describe('HttpBlobClient', () => { expect(retrievedBlobs[1].commitment).toEqual(testBlobs[1].commitment); }); + it('should compute slot from l1BlockTimestamp without headers call when genesis config is cached', async () => { + await startExecutionHostServer(); + await startConsensusHostServer(); + + const client = new HttpBlobClient({ + l1RpcUrls: [`http://localhost:${executionHostPort}`], + l1ConsensusHostUrls: [`http://localhost:${consensusHostPort}`], + }); + + // Call start() to fetch and cache genesis config (genesis_time=1000, SECONDS_PER_SLOT=12) + await client.start(); + + const fetchSpy = jest.spyOn(client as any, 'fetch'); + + // slot = (l1BlockTimestamp - genesis_time) / seconds_per_slot = (1024 - 1000) / 12 = 2 + // so blobs should be fetched at slot 2 + const retrievedBlobs = await client.getBlobSidecar('0x1234', testBlobsHashes, { + l1BlockTimestamp: 1024n, + }); + + expect(retrievedBlobs).toHaveLength(2); + 
expect(retrievedBlobs[0].commitment).toEqual(testBlobs[0].commitment); + expect(retrievedBlobs[1].commitment).toEqual(testBlobs[1].commitment); + + // Headers call for slot resolution should NOT have been made + expect(fetchSpy).not.toHaveBeenCalledWith( + expect.stringContaining('/eth/v1/beacon/headers/0x'), + expect.anything(), + ); + // Blobs fetched at the computed slot (2) + expect(fetchSpy).toHaveBeenCalledWith(expect.stringContaining('/eth/v1/beacon/blobs/2'), expect.anything()); + }); + + it('should fall back to headers call when l1BlockTimestamp is not provided', async () => { + await startExecutionHostServer(); + await startConsensusHostServer(); + + const client = new HttpBlobClient({ + l1RpcUrls: [`http://localhost:${executionHostPort}`], + l1ConsensusHostUrls: [`http://localhost:${consensusHostPort}`], + }); + + // Call start() to cache genesis config, but do NOT pass l1BlockTimestamp + await client.start(); + + const fetchSpy = jest.spyOn(client as any, 'fetch'); + + // No l1BlockTimestamp — should fall back to headers call + const retrievedBlobs = await client.getBlobSidecar('0x1234', testBlobsHashes); + + expect(retrievedBlobs).toHaveLength(2); + // Headers call for slot resolution SHOULD have been made (via parentBeaconBlockRoot from execution RPC) + expect(fetchSpy).toHaveBeenCalledWith(expect.stringContaining('/eth/v1/beacon/headers/'), expect.anything()); + }); + it('should handle when multiple consensus hosts are provided', async () => { await startExecutionHostServer(); await startConsensusHostServer(); @@ -423,11 +484,11 @@ describe('HttpBlobClient', () => { // Verify we hit the 404 for slot 33 before trying slot 34, and that we use the api key header // (see issue https://github.com/AztecProtocol/aztec-packages/issues/13415) expect(fetchSpy).toHaveBeenCalledWith( - expect.stringContaining('/eth/v1/beacon/blob_sidecars/33'), + expect.stringContaining('/eth/v1/beacon/blobs/33'), expect.objectContaining({ headers: { ['X-API-KEY']: 'my-api-key' 
} }), ); expect(fetchSpy).toHaveBeenCalledWith( - expect.stringContaining('/eth/v1/beacon/blob_sidecars/34'), + expect.stringContaining('/eth/v1/beacon/blobs/34'), expect.objectContaining({ headers: { ['X-API-KEY']: 'my-api-key' } }), ); }); @@ -458,10 +519,7 @@ describe('HttpBlobClient', () => { expect(fetchSpy).toHaveBeenCalledTimes(latestSlotNumber - 33 + 2); for (let i = 33; i <= latestSlotNumber; i++) { - expect(fetchSpy).toHaveBeenCalledWith( - expect.stringContaining(`/eth/v1/beacon/blob_sidecars/${i}`), - expect.anything(), - ); + expect(fetchSpy).toHaveBeenCalledWith(expect.stringContaining(`/eth/v1/beacon/blobs/${i}`), expect.anything()); } }); @@ -519,27 +577,12 @@ describe('HttpBlobClient', () => { ]; expect(await client.getBlobSidecar('0x1234', [blobHash])).toEqual([]); - // Incorrect bytes for the commitment. - blobData = [ - ...originalBlobData, - { - ...blobJson, - // eslint-disable-next-line camelcase - kzg_commitment: 'abcdefghijk', - }, - ]; - expect(await client.getBlobSidecar('0x1234', [blobHash])).toEqual([]); - - // Commitment does not exist. - blobData = [ - ...originalBlobData, - { - blob: blobJson.blob, - } as BlobJson, - ]; + // Blob from a different hash, commitment is computed correctly but doesn't match requested hash. + const otherBlob = await makeRandomBlob(3); + blobData = [...originalBlobData, otherBlob.toJSON()]; expect(await client.getBlobSidecar('0x1234', [blobHash])).toEqual([]); - // Correct blob json. + // Correct blob hex json. 
blobData = [...originalBlobData, blobJson]; const result = await client.getBlobSidecar('0x1234', [blobHash]); expect(result).toHaveLength(1); @@ -906,9 +949,9 @@ describe('HttpBlobClient FileStore Integration', () => { if (req.url?.includes('/eth/v1/beacon/headers/')) { res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ data: { header: { message: { slot: 1 } } } })); - } else if (req.url?.includes('/eth/v1/beacon/blob_sidecars/')) { + } else if (req.url?.includes('/eth/v1/beacon/blobs/')) { res.writeHead(200, { 'Content-Type': 'application/json' }); - res.end(JSON.stringify({ data: blobData })); + res.end(JSON.stringify({ data: blobData.map(b => b.blob) })); } else { res.writeHead(404, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ error: 'Not Found' })); @@ -951,8 +994,8 @@ describe('HttpBlobClient FileStore Integration', () => { const retrievedBlobs = await client.getBlobSidecar('0x1234', testBlobsHashes); expect(retrievedBlobs).toHaveLength(2); - // Consensus should not be called for blob_sidecars since filestore had all blobs - expect(fetchSpy).not.toHaveBeenCalledWith(expect.stringContaining('blob_sidecars'), expect.anything()); + // Consensus should not be called for blobs since filestore had all blobs + expect(fetchSpy).not.toHaveBeenCalledWith(expect.stringContaining('/beacon/blobs/'), expect.anything()); }); it('should fall back to consensus when filestore has partial blobs', async () => { diff --git a/yarn-project/blob-client/src/client/http.ts b/yarn-project/blob-client/src/client/http.ts index 5d626933261a..ffa48d2cbe74 100644 --- a/yarn-project/blob-client/src/client/http.ts +++ b/yarn-project/blob-client/src/client/http.ts @@ -24,6 +24,11 @@ export class HttpBlobClient implements BlobClientInterface { private disabled = false; private healthcheckUploadIntervalId?: NodeJS.Timeout; + /** Cached beacon genesis time (seconds since Unix epoch). Fetched once at startup. 
*/ + private beaconGenesisTime?: bigint; + /** Cached beacon slot duration in seconds. Fetched once at startup. */ + private beaconSecondsPerSlot?: number; + constructor( config?: BlobClientConfig, private readonly opts: { @@ -251,7 +256,7 @@ export class HttpBlobClient implements BlobClientInterface { // The beacon api can query by slot number, so we get that first const consensusCtx = { l1ConsensusHostUrls, ...ctx }; this.log.trace(`Attempting to get slot number for block hash`, consensusCtx); - const slotNumber = await this.getSlotNumber(blockHash); + const slotNumber = await this.getSlotNumber(blockHash, opts?.parentBeaconBlockRoot, opts?.l1BlockTimestamp); this.log.debug(`Got slot number ${slotNumber} from consensus host for querying blobs`, consensusCtx); if (slotNumber) { @@ -268,7 +273,12 @@ export class HttpBlobClient implements BlobClientInterface { l1ConsensusHostUrl, ...ctx, }); - const blobs = await this.getBlobsFromHost(l1ConsensusHostUrl, slotNumber, l1ConsensusHostIndex); + const blobs = await this.getBlobsFromHost( + l1ConsensusHostUrl, + slotNumber, + l1ConsensusHostIndex, + getMissingBlobHashes(), + ); const result = await fillResults(blobs); this.log.debug( `Got ${blobs.length} blobs from consensus host (total: ${result.length}/${blobHashes.length})`, @@ -387,7 +397,7 @@ export class HttpBlobClient implements BlobClientInterface { blobHashes: Buffer[] = [], l1ConsensusHostIndex?: number, ): Promise { - const blobs = await this.getBlobsFromHost(hostUrl, blockHashOrSlot, l1ConsensusHostIndex); + const blobs = await this.getBlobsFromHost(hostUrl, blockHashOrSlot, l1ConsensusHostIndex, blobHashes); return (await processFetchedBlobs(blobs, blobHashes, this.log)).filter((b): b is Blob => b !== undefined); } @@ -395,11 +405,12 @@ export class HttpBlobClient implements BlobClientInterface { hostUrl: string, blockHashOrSlot: string | number, l1ConsensusHostIndex?: number, + blobHashes?: Buffer[], ): Promise { try { - let res = await 
this.fetchBlobSidecars(hostUrl, blockHashOrSlot, l1ConsensusHostIndex); + let res = await this.fetchBlobSidecars(hostUrl, blockHashOrSlot, l1ConsensusHostIndex, blobHashes); if (res.ok) { - return parseBlobJsonsFromResponse(await res.json(), this.log); + return await parseBlobJsonsFromResponse(await res.json(), this.log); } if (res.status === 404 && typeof blockHashOrSlot === 'number') { @@ -414,9 +425,9 @@ export class HttpBlobClient implements BlobClientInterface { let currentSlot = blockHashOrSlot + 1; while (res.status === 404 && maxRetries > 0 && latestSlot !== undefined && currentSlot <= latestSlot) { this.log.debug(`Trying slot ${currentSlot}`); - res = await this.fetchBlobSidecars(hostUrl, currentSlot, l1ConsensusHostIndex); + res = await this.fetchBlobSidecars(hostUrl, currentSlot, l1ConsensusHostIndex, blobHashes); if (res.ok) { - return parseBlobJsonsFromResponse(await res.json(), this.log); + return await parseBlobJsonsFromResponse(await res.json(), this.log); } currentSlot++; maxRetries--; @@ -439,8 +450,17 @@ export class HttpBlobClient implements BlobClientInterface { hostUrl: string, blockHashOrSlot: string | number, l1ConsensusHostIndex?: number, + blobHashes?: Buffer[], ): Promise { - const baseUrl = `${hostUrl}/eth/v1/beacon/blob_sidecars/${blockHashOrSlot}`; + let baseUrl = `${hostUrl}/eth/v1/beacon/blobs/${blockHashOrSlot}`; + + if (blobHashes && blobHashes.length > 0) { + const params = new URLSearchParams(); + for (const hash of blobHashes) { + params.append('versioned_hashes', `0x${hash.toString('hex')}`); + } + baseUrl += `?${params.toString()}`; + } const { url, ...options } = getBeaconNodeFetchOptions(baseUrl, this.config, l1ConsensusHostIndex); this.log.debug(`Fetching blob sidecar for ${blockHashOrSlot}`, { url, ...options }); @@ -482,34 +502,50 @@ export class HttpBlobClient implements BlobClientInterface { * @param blockHash - The block hash * @returns The slot number */ - private async getSlotNumber(blockHash: `0x${string}`): Promise 
{ + private async getSlotNumber( + blockHash: `0x${string}`, + parentBeaconBlockRoot?: string, + l1BlockTimestamp?: bigint, + ): Promise { const { l1ConsensusHostUrls, l1RpcUrls } = this.config; if (!l1ConsensusHostUrls || l1ConsensusHostUrls.length === 0) { this.log.debug('No consensus host url configured'); return undefined; } - if (!l1RpcUrls || l1RpcUrls.length === 0) { - this.log.debug('No execution host url configured'); - return undefined; + // Primary path: compute slot from timestamp if genesis config is cached (no network call needed) + if ( + l1BlockTimestamp !== undefined && + this.beaconGenesisTime !== undefined && + this.beaconSecondsPerSlot !== undefined + ) { + const slot = Number((l1BlockTimestamp - this.beaconGenesisTime) / BigInt(this.beaconSecondsPerSlot)); + this.log.debug(`Computed slot ${slot} from L1 block timestamp`, { l1BlockTimestamp }); + return slot; } - // Ping execution node to get the parentBeaconBlockRoot for this block - let parentBeaconBlockRoot: string | undefined; - const client = createPublicClient({ - transport: fallback(l1RpcUrls.map(url => http(url, { batch: false }))), - }); - try { - const res: RpcBlock = await client.request({ - method: 'eth_getBlockByHash', - params: [blockHash, /*tx flag*/ false], + if (!parentBeaconBlockRoot) { + // parentBeaconBlockRoot not provided by caller — fetch it from the execution RPC + if (!l1RpcUrls || l1RpcUrls.length === 0) { + this.log.debug('No execution host url configured'); + return undefined; + } + + const client = createPublicClient({ + transport: fallback(l1RpcUrls.map(url => http(url, { batch: false }))), }); + try { + const res: RpcBlock = await client.request({ + method: 'eth_getBlockByHash', + params: [blockHash, /*tx flag*/ false], + }); - if (res.parentBeaconBlockRoot) { - parentBeaconBlockRoot = res.parentBeaconBlockRoot; + if (res.parentBeaconBlockRoot) { + parentBeaconBlockRoot = res.parentBeaconBlockRoot; + } + } catch (err) { + this.log.error(`Error getting parent beacon 
block root`, err); } - } catch (err) { - this.log.error(`Error getting parent beacon block root`, err); } if (!parentBeaconBlockRoot) { @@ -555,9 +591,12 @@ export class HttpBlobClient implements BlobClientInterface { /** * Start the blob client. - * Uploads the initial healthcheck file (awaited) and starts periodic uploads. + * Fetches and caches beacon genesis config for timestamp-based slot resolution, + * then uploads the initial healthcheck file (awaited) and starts periodic uploads. */ public async start(): Promise { + await this.fetchBeaconConfig(); + if (!this.fileStoreUploadClient) { return; } @@ -582,6 +621,53 @@ export class HttpBlobClient implements BlobClientInterface { }, intervalMs); } + /** + * Fetches and caches beacon genesis time and slot duration from the first available consensus host. + * These static values enable timestamp-based slot resolution, eliminating the per-fetch headers call. + * Logs a warning and leaves fields undefined if all hosts fail, callers fall back gracefully. 
+ */ + private async fetchBeaconConfig(): Promise { + const { l1ConsensusHostUrls } = this.config; + if (!l1ConsensusHostUrls || l1ConsensusHostUrls.length === 0) { + return; + } + + for (let i = 0; i < l1ConsensusHostUrls.length; i++) { + try { + const { url: genesisUrl, ...genesisOptions } = getBeaconNodeFetchOptions( + `${l1ConsensusHostUrls[i]}/eth/v1/config/genesis`, + this.config, + i, + ); + const { url: specUrl, ...specOptions } = getBeaconNodeFetchOptions( + `${l1ConsensusHostUrls[i]}/eth/v1/config/spec`, + this.config, + i, + ); + + const [genesisRes, specRes] = await Promise.all([ + this.fetch(genesisUrl, genesisOptions), + this.fetch(specUrl, specOptions), + ]); + + if (genesisRes.ok && specRes.ok) { + const genesis = await genesisRes.json(); + const spec = await specRes.json(); + this.beaconGenesisTime = BigInt(genesis.data.genesisTime); + this.beaconSecondsPerSlot = parseInt(spec.data.secondsPerSlot); + this.log.debug(`Fetched beacon genesis config`, { + genesisTime: this.beaconGenesisTime, + secondsPerSlot: this.beaconSecondsPerSlot, + }); + return; + } + } catch (err) { + this.log.warn(`Failed to fetch beacon config from host ${l1ConsensusHostUrls[i]}`, err); + } + } + this.log.warn('Could not fetch beacon genesis config from any consensus host — will use headers call fallback'); + } + /** * Stop the blob client, clearing any periodic tasks. 
*/ @@ -593,10 +679,9 @@ export class HttpBlobClient implements BlobClientInterface { } } -function parseBlobJsonsFromResponse(response: any, logger: Logger): BlobJson[] { +async function parseBlobJsonsFromResponse(response: any, logger: Logger): Promise { try { - const blobs = response.data.map(parseBlobJson); - return blobs; + return await Promise.all((response.data as string[]).map(parseBlobJson)); } catch (err) { logger.error(`Error parsing blob json from response`, err); return []; @@ -607,10 +692,9 @@ function parseBlobJsonsFromResponse(response: any, logger: Logger): BlobJson[] { // https://ethereum.github.io/beacon-APIs/?urls.primaryName=dev#/Beacon/getBlobSidecars // Here we attempt to parse the response data to Buffer, and check the lengths (via Blob's constructor), to avoid // throwing an error down the line when calling Blob.fromJson(). -function parseBlobJson(data: any): BlobJson { - const blobBuffer = Buffer.from(data.blob.slice(2), 'hex'); - const commitmentBuffer = Buffer.from(data.kzg_commitment.slice(2), 'hex'); - const blob = new Blob(blobBuffer, commitmentBuffer); +async function parseBlobJson(rawHex: string): Promise { + const blobBuffer = Buffer.from(rawHex.slice(2), 'hex'); + const blob = await Blob.fromBlobBuffer(blobBuffer); return blob.toJSON(); } diff --git a/yarn-project/blob-client/src/client/interface.ts b/yarn-project/blob-client/src/client/interface.ts index b9dfd8e30728..6959e31edbdc 100644 --- a/yarn-project/blob-client/src/client/interface.ts +++ b/yarn-project/blob-client/src/client/interface.ts @@ -11,6 +11,17 @@ export interface GetBlobSidecarOptions { * - Near tip: FileStore first with no retries (data should exist), L1 consensus second (freshest data), then FileStore with retries, then archive (eg. blobscan) */ isHistoricalSync?: boolean; + /** + * The parent beacon block root for the L1 block containing the blobs. + * If provided, skips the eth_getBlockByHash execution RPC call inside getSlotNumber. 
+ */ + parentBeaconBlockRoot?: string; + /** + * The timestamp of the L1 execution block containing the blobs. + * When provided alongside a cached beacon genesis config (fetched at startup), allows computing + * the beacon slot directly via timestamp math, skipping the beacon headers network call entirely. + */ + l1BlockTimestamp?: bigint; } export interface BlobClientInterface { diff --git a/yarn-project/end-to-end/src/e2e_bot.test.ts b/yarn-project/end-to-end/src/e2e_bot.test.ts index 9ed66d5b27f6..00c5b06e16c3 100644 --- a/yarn-project/end-to-end/src/e2e_bot.test.ts +++ b/yarn-project/end-to-end/src/e2e_bot.test.ts @@ -134,6 +134,8 @@ describe('e2e_bot', () => { // TODO: this should be taken from the `setup` call above l1Mnemonic: new SecretValue('test test test test test test test test test test test junk'), flushSetupTransactions: true, + // Increase fee headroom to handle fee volatility from rapid block building in tests + minFeePadding: 9, }; { @@ -172,6 +174,8 @@ describe('e2e_bot', () => { // TODO: this should be taken from the `setup` call above l1Mnemonic: new SecretValue('test test test test test test test test test test test junk'), flushSetupTransactions: true, + // Increase fee headroom to handle fee volatility from rapid block building in tests + minFeePadding: 9, }; { diff --git a/yarn-project/end-to-end/src/e2e_contract_updates.test.ts b/yarn-project/end-to-end/src/e2e_contract_updates.test.ts index 8c8fd078c7de..10bf16f3e987 100644 --- a/yarn-project/end-to-end/src/e2e_contract_updates.test.ts +++ b/yarn-project/end-to-end/src/e2e_contract_updates.test.ts @@ -157,14 +157,16 @@ describe('e2e_contract_updates', () => { ); // Increases the delay so it should happen immediately - await contract.methods.set_update_delay(BigInt(MINIMUM_UPDATE_DELAY) + 1n).send({ from: defaultAccountAddress }); + await contract.methods + .set_update_delay(BigInt(DEFAULT_TEST_UPDATE_DELAY) + 1n) + .send({ from: defaultAccountAddress }); expect(await 
contract.methods.get_update_delay().simulate({ from: defaultAccountAddress })).toEqual( - BigInt(MINIMUM_UPDATE_DELAY) + 1n, + BigInt(DEFAULT_TEST_UPDATE_DELAY) + 1n, ); await contract.methods.update_to(updatedContractClassId).send({ from: defaultAccountAddress }); - await cheatCodes.warpL2TimeAtLeastBy(sequencer, aztecNode, BigInt(MINIMUM_UPDATE_DELAY) + 1n); + await cheatCodes.warpL2TimeAtLeastBy(sequencer, aztecNode, BigInt(DEFAULT_TEST_UPDATE_DELAY) + 1n); // Should be updated now await wallet.registerContract(instance, UpdatedContract.artifact); diff --git a/yarn-project/end-to-end/src/e2e_state_vars.test.ts b/yarn-project/end-to-end/src/e2e_state_vars.test.ts index 104c9f18a72e..c5b8d6f0635b 100644 --- a/yarn-project/end-to-end/src/e2e_state_vars.test.ts +++ b/yarn-project/end-to-end/src/e2e_state_vars.test.ts @@ -314,7 +314,7 @@ describe('e2e_state_vars', () => { from: defaultAccountAddress, }); - if (aztecSlotDuration !== 36) { + if (aztecSlotDuration !== 72) { throw new Error( 'Aztec slot duration changed and this will break this test. 
Update CHANGE_AUTHORIZED_DELAY constant in the Auth contract to be 5 slots again.', ); diff --git a/yarn-project/end-to-end/src/fixtures/setup.ts b/yarn-project/end-to-end/src/fixtures/setup.ts index 6def0cbc67f6..abfd4da901e0 100644 --- a/yarn-project/end-to-end/src/fixtures/setup.ts +++ b/yarn-project/end-to-end/src/fixtures/setup.ts @@ -49,7 +49,6 @@ import type { SequencerClient } from '@aztec/sequencer-client'; import { type ContractInstanceWithAddress, getContractInstanceFromInstantiationParams } from '@aztec/stdlib/contract'; import type { AztecNodeAdmin } from '@aztec/stdlib/interfaces/client'; import { tryStop } from '@aztec/stdlib/interfaces/server'; -import type { P2PClientType } from '@aztec/stdlib/p2p'; import type { PublicDataTreeLeaf } from '@aztec/stdlib/trees'; import { type TelemetryClient, @@ -456,7 +455,7 @@ export async function setup( } let mockGossipSubNetwork: MockGossipSubNetwork | undefined; - let p2pClientDeps: P2PClientDeps | undefined = undefined; + let p2pClientDeps: P2PClientDeps | undefined = undefined; if (opts.mockGossipSubNetwork) { mockGossipSubNetwork = new MockGossipSubNetwork(); @@ -503,7 +502,7 @@ export async function setup( const proverNodePrivateKeyHex: Hex = `0x${proverNodePrivateKey!.toString('hex')}`; const proverNodeDataDirectory = path.join(directoryToCleanup, randomBytes(8).toString('hex')); - const p2pClientDeps: Partial> = { + const p2pClientDeps: Partial = { p2pServiceFactory: mockGossipSubNetwork && getMockPubSubP2PServiceFactory(mockGossipSubNetwork!), rpcTxProviders: [aztecNodeService], }; @@ -719,7 +718,7 @@ export function createAndSyncProverNode( deps: { telemetry?: TelemetryClient; dateProvider: DateProvider; - p2pClientDeps?: P2PClientDeps; + p2pClientDeps?: P2PClientDeps; }, options: { prefilledPublicData: PublicDataTreeLeaf[]; dontStart?: boolean }, ): Promise<{ proverNode: AztecNodeService }> { diff --git a/yarn-project/end-to-end/src/spartan/n_tps.test.ts 
b/yarn-project/end-to-end/src/spartan/n_tps.test.ts index 95405e1b9e84..72a3d3002955 100644 --- a/yarn-project/end-to-end/src/spartan/n_tps.test.ts +++ b/yarn-project/end-to-end/src/spartan/n_tps.test.ts @@ -33,6 +33,7 @@ import { getChartDir, getGitProjectRoot, getRPCEndpoint, + hasDeployedHelmRelease, installChaosMeshChart, setupEnvironment, startPortForwardForPrometeheus, @@ -226,6 +227,32 @@ describe('sustained N TPS test', () => { }); const spartanDir = `${getGitProjectRoot()}/spartan`; + // Skip chaos mesh installation if it was already deployed by deploy_network.sh + // (via CHAOS_MESH_SCENARIOS_FILE). Installing before infra ensures partition + // rules are in place when pods start, preventing unwanted peer connections. + const alreadyDeployed = await hasDeployedHelmRelease(CHAOS_MESH_NAME, config.NAMESPACE); + if (alreadyDeployed) { + logger.info('Chaos mesh chart already deployed, skipping installation'); + } else { + logger.info('Installing chaos mesh chart', { + name: CHAOS_MESH_NAME, + namespace: config.NAMESPACE, + valuesFile: 'network-requirements.yaml', + }); + await installChaosMeshChart({ + logger, + targetNamespace: config.NAMESPACE, + instanceName: CHAOS_MESH_NAME, + valuesFile: 'network-requirements.yaml', + helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'), + }); + logger.info('Chaos mesh installation complete'); + + logger.info('Waiting for network to stabilize after chaos mesh installation...'); + await sleep(30 * 1000); + logger.info('Network stabilization wait complete'); + } + const rpcEndpoint = await getRPCEndpoint(config.NAMESPACE); endpoints.push(rpcEndpoint); const rpcUrl = rpcEndpoint.url; @@ -285,24 +312,6 @@ describe('sustained N TPS test', () => { }); logger.info('Benchmark contract deployed', { address: benchmarkContract.address.toString() }); - logger.info('Installing chaos mesh chart', { - name: CHAOS_MESH_NAME, - namespace: config.NAMESPACE, - valuesFile: 'network-requirements.yaml', - }); - await 
installChaosMeshChart({ - logger, - targetNamespace: config.NAMESPACE, - instanceName: CHAOS_MESH_NAME, - valuesFile: 'network-requirements.yaml', - helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'), - }); - logger.info('Chaos mesh installation complete'); - - logger.info('Waiting for network to stabilize after chaos mesh installation...'); - await sleep(30 * 1000); - logger.info('Network stabilization wait complete'); - logger.info(`Test setup complete`); }); @@ -328,7 +337,7 @@ describe('sustained N TPS test', () => { prototypeTxs.set(from.toString(), prototypeTx); } - const tx = await cloneTx(prototypeTx, priorytFee); + const tx = await cloneTx(prototypeTx, priorytFee, logger); return tx; }; @@ -345,15 +354,23 @@ describe('sustained N TPS test', () => { let lowValueTxs = 0; const lowValueSendTx = async (wallet: TestWallet) => { lowValueTxs++; - //const feeAmount = Number(randomBigInt(100n)) + 1; - //const feeAmount = 1; const feeAmount = Math.floor(lowValueTxs / 1000) + 1; const fee = new GasFees(0, feeAmount); - logger.info('Sending low value tx ' + lowValueTxs + ' with fee ' + feeAmount); + const t0 = performance.now(); const tx = await (config.REAL_VERIFIER ? submitProven(wallet, fee) : submitUnproven(wallet, fee)); + const t1 = performance.now(); const txHash = await tx.send({ wait: NO_WAIT }); + const t2 = performance.now(); + + logger.info('Low value tx sent', { + txNum: lowValueTxs, + feeAmount, + cloneMs: Math.round(t1 - t0), + sendMs: Math.round(t2 - t1), + totalMs: Math.round(t2 - t0), + }); return txHash.toString(); }; @@ -362,13 +379,23 @@ describe('sustained N TPS test', () => { highValueTxs++; const feeAmount = Number(randomBigInt(10n)) + 1000; const fee = new GasFees(0, feeAmount); - logger.info('Sending high value tx ' + highValueTxs + ' with fee ' + feeAmount); + const t0 = performance.now(); const tx = await (config.REAL_VERIFIER ? 
submitProven(wallet, fee) : submitUnproven(wallet, fee)); + const t1 = performance.now(); metrics.recordSentTx(tx, `high_value_${highValueTps}tps`); const txHash = await tx.send({ wait: NO_WAIT }); + const t2 = performance.now(); + + logger.info('High value tx sent', { + txNum: highValueTxs, + feeAmount, + cloneMs: Math.round(t1 - t0), + sendMs: Math.round(t2 - t1), + totalMs: Math.round(t2 - t0), + }); return txHash.toString(); }; @@ -514,9 +541,11 @@ function sendTxsAtTps( return txHashes; } -async function cloneTx(tx: ProvenTx, priorityFee: GasFees): Promise { - // Clone the transaction +async function cloneTx(tx: ProvenTx, priorityFee: GasFees, logger: Logger): Promise { + const t0 = performance.now(); const clonedTxData = Tx.clone(tx, false); + const t1 = performance.now(); + (clonedTxData.data.constants.txContext.gasSettings as any).maxPriorityFeesPerGas = priorityFee; if (clonedTxData.data.forRollup) { @@ -534,7 +563,17 @@ async function cloneTx(tx: ProvenTx, priorityFee: GasFees): Promise { clonedTxData.data.forPublic.nonRevertibleAccumulatedData.nullifiers[i] = Fr.random(); } } + const t2 = performance.now(); + const clonedTx = new ProvenTx((tx as any).node, clonedTxData, tx.offchainEffects, tx.stats); await clonedTx.recomputeHash(); + const t3 = performance.now(); + + logger.debug('cloneTx timing', { + cloneMs: Math.round(t1 - t0), + mutateMs: Math.round(t2 - t1), + rehashMs: Math.round(t3 - t2), + totalMs: Math.round(t3 - t0), + }); return clonedTx; } diff --git a/yarn-project/end-to-end/src/spartan/utils/config.ts b/yarn-project/end-to-end/src/spartan/utils/config.ts index 3794cba69156..f4dd9e885205 100644 --- a/yarn-project/end-to-end/src/spartan/utils/config.ts +++ b/yarn-project/end-to-end/src/spartan/utils/config.ts @@ -16,6 +16,7 @@ const testConfigSchema = z.object({ AZTEC_PROOF_SUBMISSION_WINDOW: z.coerce.number().optional().default(5), AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET: z.coerce.number().optional().default(2), FUNDING_PRIVATE_KEY: 
z.string().optional(), + AZTEC_ADMIN_API_KEY: z.string().optional(), }); export type TestConfig = z.infer; diff --git a/yarn-project/end-to-end/src/spartan/utils/index.ts b/yarn-project/end-to-end/src/spartan/utils/index.ts index b4ecc612825f..8f917cecdcdc 100644 --- a/yarn-project/end-to-end/src/spartan/utils/index.ts +++ b/yarn-project/end-to-end/src/spartan/utils/index.ts @@ -41,6 +41,9 @@ export { applyNetworkShaping, } from './chaos.js'; +// Helm +export { hasDeployedHelmRelease } from './helm.js'; + // Bot management export { restartBot, installTransferBot, uninstallTransferBot } from './bot.js'; diff --git a/yarn-project/end-to-end/src/spartan/utils/nodes.ts b/yarn-project/end-to-end/src/spartan/utils/nodes.ts index baba6c63de1a..c31c182f80aa 100644 --- a/yarn-project/end-to-end/src/spartan/utils/nodes.ts +++ b/yarn-project/end-to-end/src/spartan/utils/nodes.ts @@ -173,7 +173,7 @@ export async function withSequencersAdmin(env: TestConfig, fn: (node: AztecNo if (statusRes.status !== 200) { throw new Error(`Admin endpoint returned status ${statusRes.status}`); } - const client = createAztecNodeAdminClient(url); + const client = createAztecNodeAdminClient(url, {}, undefined, env.AZTEC_ADMIN_API_KEY); return { result: await fn(client), process }; } catch (err) { // Kill the port-forward before retrying diff --git a/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.test.ts b/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.test.ts index 8670113a7ed7..828a351bf5fb 100644 --- a/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.test.ts +++ b/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.test.ts @@ -138,6 +138,31 @@ describe('L1TxUtils', () => { await gasUtils.waitMonitoringStopped(1); }); + it('recovery send reuses nonce after sendRawTransaction fails', async () => { + // Send a successful tx first to advance the chain nonce + await gasUtils.sendAndMonitorTransaction(request); + + const expectedNonce = await l1Client.getTransactionCount({ + blockTag: 'pending', 
+ address: l1Client.account.address, + }); + + // Next send fails at sendRawTransaction (e.g. network error) + const originalSendRawTransaction = l1Client.sendRawTransaction.bind(l1Client); + using _sendSpy = jest + .spyOn(l1Client, 'sendRawTransaction') + .mockImplementationOnce(() => Promise.reject(new Error('network error'))) + .mockImplementation(originalSendRawTransaction); + + await expect(gasUtils.sendTransaction(request)).rejects.toThrow('network error'); + + // Recovery send should reuse the same nonce (not skip ahead) + const { txHash, state: recoveryState } = await gasUtils.sendTransaction(request); + + expect(recoveryState.nonce).toBe(expectedNonce); + expect((await l1Client.getTransaction({ hash: txHash })).nonce).toBe(expectedNonce); + }, 30_000); + // Regression for TMNT-312 it('speed-up of blob tx sets non-zero maxFeePerBlobGas', async () => { await cheatCodes.setAutomine(false); @@ -919,6 +944,8 @@ describe('L1TxUtils', () => { }); it('does not consume nonce when transaction times out before sending', async () => { + // first send a transaction to advance the nonce + await gasUtils.sendAndMonitorTransaction(request); // Get the expected nonce before any transaction const expectedNonce = await l1Client.getTransactionCount({ address: l1Client.account.address }); diff --git a/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.ts b/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.ts index 5c5c0f776db2..48b8dfc41aa5 100644 --- a/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.ts +++ b/yarn-project/ethereum/src/l1_tx_utils/l1_tx_utils.ts @@ -14,16 +14,13 @@ import { type Abi, type BlockOverrides, type Hex, - type NonceManager, type PrepareTransactionRequestRequest, type StateOverride, type TransactionReceipt, type TransactionSerializable, - createNonceManager, formatGwei, serializeTransaction, } from 'viem'; -import { jsonRpc } from 'viem/nonce'; import type { ViemClient } from '../types.js'; import { formatViemError } from '../utils.js'; @@ -47,7 
+44,6 @@ import { const MAX_L1_TX_STATES = 32; export class L1TxUtils extends ReadOnlyL1TxUtils { - protected nonceManager: NonceManager; protected txs: L1TxState[] = []; /** Tx delayer for testing. Only set when enableDelayer config is true. */ public delayer?: Delayer; @@ -68,7 +64,6 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { delayer?: Delayer, ) { super(client, logger, dateProvider, config, debugMaxGasLimit); - this.nonceManager = createNonceManager({ source: jsonRpc() }); this.kzg = kzg; // Set up delayer: use provided one or create new @@ -244,9 +239,6 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { throw new InterruptError(`Transaction sending is interrupted`); } - // Check timeout before consuming nonce to avoid leaking a nonce that was never sent. - // A leaked nonce creates a gap (e.g. nonce 107 consumed but unsent), so all subsequent - // transactions (108, 109, ...) can never be mined since the chain expects 107 first. const now = new Date(await this.getL1Timestamp()); if (gasConfig.txTimeoutAt && now > gasConfig.txTimeoutAt) { throw new TimeoutError( @@ -254,11 +246,7 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { ); } - const nonce = await this.nonceManager.consume({ - client: this.client, - address: account, - chainId: this.client.chain.id, - }); + const nonce = await this.client.getTransactionCount({ address: account, blockTag: 'pending' }); const baseState = { request, gasLimit, blobInputs, gasPrice, nonce }; const txData = this.makeTxData(baseState, { isCancelTx: false }); @@ -449,7 +437,6 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { { nonce, account, pendingNonce, timePassed }, ); await this.updateState(state, TxUtilsState.NOT_MINED); - this.nonceManager.reset({ address: account, chainId: this.client.chain.id }); throw new DroppedTransactionError(nonce, account); } @@ -541,12 +528,7 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { // Oh no, the transaction has timed out! 
if (isCancelTx || !gasConfig.cancelTxOnTimeout) { - // If this was already a cancellation tx, or we are configured to not cancel txs, we just mark it as NOT_MINED - // and reset the nonce manager, so the next tx that comes along can reuse the nonce if/when this tx gets dropped. - // This is the nastiest scenario for us, since the new tx could acquire the next nonce, but then this tx is dropped, - // and the new tx would never get mined. Eventually, the new tx would also drop. await this.updateState(state, TxUtilsState.NOT_MINED); - this.nonceManager.reset({ address: account, chainId: this.client.chain.id }); } else { // Otherwise we fire the cancellation without awaiting to avoid blocking the caller, // and monitor it in the background so we can speed it up as needed. @@ -685,7 +667,6 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { { nonce, account }, ); await this.updateState(state, TxUtilsState.NOT_MINED); - this.nonceManager.reset({ address: account, chainId: this.client.chain.id }); return; } @@ -697,7 +678,6 @@ export class L1TxUtils extends ReadOnlyL1TxUtils { { nonce, account, currentNonce }, ); await this.updateState(state, TxUtilsState.NOT_MINED); - this.nonceManager.reset({ address: account, chainId: this.client.chain.id }); return; } diff --git a/yarn-project/foundation/src/crypto/poseidon/index.test.ts b/yarn-project/foundation/src/crypto/poseidon/index.test.ts index ca9a932a9a74..84b6f39d6e06 100644 --- a/yarn-project/foundation/src/crypto/poseidon/index.test.ts +++ b/yarn-project/foundation/src/crypto/poseidon/index.test.ts @@ -1,11 +1,11 @@ -import { BarretenbergSync } from '@aztec/bb.js'; +import { Barretenberg } from '@aztec/bb.js'; import { Fr } from '../../curves/bn254/field.js'; import { poseidon2Permutation } from './index.js'; describe('poseidon2Permutation', () => { beforeAll(async () => { - await BarretenbergSync.initSingleton({ threads: 1 }); + await Barretenberg.initSingleton({ threads: 1 }); }); it('test vectors from cpp should 
match', async () => { diff --git a/yarn-project/foundation/src/crypto/poseidon/index.ts b/yarn-project/foundation/src/crypto/poseidon/index.ts index 4aba9a4a2029..601a1a000b65 100644 --- a/yarn-project/foundation/src/crypto/poseidon/index.ts +++ b/yarn-project/foundation/src/crypto/poseidon/index.ts @@ -1,4 +1,4 @@ -import { BarretenbergSync } from '@aztec/bb.js'; +import { Barretenberg } from '@aztec/bb.js'; import { Fr } from '../../curves/bn254/field.js'; import { type Fieldable, serializeToFields } from '../../serialize/serialize.js'; @@ -10,9 +10,9 @@ import { type Fieldable, serializeToFields } from '../../serialize/serialize.js' */ export async function poseidon2Hash(input: Fieldable[]): Promise { const inputFields = serializeToFields(input); - await BarretenbergSync.initSingleton(); - const api = BarretenbergSync.getSingleton(); - const response = api.poseidon2Hash({ + await Barretenberg.initSingleton(); + const api = Barretenberg.getSingleton(); + const response = await api.poseidon2Hash({ inputs: inputFields.map(i => i.toBuffer()), }); return Fr.fromBuffer(Buffer.from(response.hash)); @@ -27,9 +27,9 @@ export async function poseidon2Hash(input: Fieldable[]): Promise { export async function poseidon2HashWithSeparator(input: Fieldable[], separator: number): Promise { const inputFields = serializeToFields(input); inputFields.unshift(new Fr(separator)); - await BarretenbergSync.initSingleton(); - const api = BarretenbergSync.getSingleton(); - const response = api.poseidon2Hash({ + await Barretenberg.initSingleton(); + const api = Barretenberg.getSingleton(); + const response = await api.poseidon2Hash({ inputs: inputFields.map(i => i.toBuffer()), }); return Fr.fromBuffer(Buffer.from(response.hash)); @@ -44,9 +44,9 @@ export async function poseidon2Permutation(input: Fieldable[]): Promise { const inputFields = serializeToFields(input); // We'd like this assertion but it's not possible to use it in the browser. 
// assert(input.length === 4, 'Input state must be of size 4'); - await BarretenbergSync.initSingleton(); - const api = BarretenbergSync.getSingleton(); - const response = api.poseidon2Permutation({ + await Barretenberg.initSingleton(); + const api = Barretenberg.getSingleton(); + const response = await api.poseidon2Permutation({ inputs: inputFields.map(i => i.toBuffer()), }); // We'd like this assertion but it's not possible to use it in the browser. @@ -65,9 +65,9 @@ export async function poseidon2HashBytes(input: Buffer): Promise { inputFields.push(Fr.fromBuffer(fieldBytes)); } - await BarretenbergSync.initSingleton(); - const api = BarretenbergSync.getSingleton(); - const response = api.poseidon2Hash({ + await Barretenberg.initSingleton(); + const api = Barretenberg.getSingleton(); + const response = await api.poseidon2Hash({ inputs: inputFields.map(i => i.toBuffer()), }); diff --git a/yarn-project/foundation/src/curves/bn254/field.ts b/yarn-project/foundation/src/curves/bn254/field.ts index a5f81149ea42..80d228b470ac 100644 --- a/yarn-project/foundation/src/curves/bn254/field.ts +++ b/yarn-project/foundation/src/curves/bn254/field.ts @@ -118,14 +118,18 @@ abstract class BaseField { } cmp(rhs: BaseField): -1 | 0 | 1 { - const rhsBigInt = rhs.asBigInt; - return this.asBigInt === rhsBigInt ? 0 : this.asBigInt < rhsBigInt ? -1 : 1; + return BaseField.cmpAsBigInt(this.asBigInt, rhs.asBigInt); } static cmp(lhs: BaseField, rhs: BaseField): -1 | 0 | 1 { return lhs.cmp(rhs); } + // Actual bigint comparison. Arguments must have been validated previously. + static cmpAsBigInt(lhs: bigint, rhs: bigint): -1 | 0 | 1 { + return lhs === rhs ? 0 : lhs < rhs ? 
-1 : 1; + } + isZero(): boolean { return this.asBigInt === 0n; } diff --git a/yarn-project/p2p/src/client/factory.ts b/yarn-project/p2p/src/client/factory.ts index 497eb08d6a39..e8de6a8055a7 100644 --- a/yarn-project/p2p/src/client/factory.ts +++ b/yarn-project/p2p/src/client/factory.ts @@ -9,7 +9,6 @@ import type { L2BlockSource } from '@aztec/stdlib/block'; import type { ChainConfig } from '@aztec/stdlib/config'; import type { ContractDataSource } from '@aztec/stdlib/contract'; import type { AztecNode, ClientProtocolCircuitVerifier, WorldStateSynchronizer } from '@aztec/stdlib/interfaces/server'; -import { P2PClientType } from '@aztec/stdlib/p2p'; import { type TelemetryClient, getTelemetryClient } from '@aztec/telemetry-client'; import { P2PClient } from '../client/p2p_client.js'; @@ -27,14 +26,14 @@ import { NodeRpcTxSource, type TxSource, createNodeRpcTxSources } from '../servi import { TxFileStore } from '../services/tx_file_store/tx_file_store.js'; import { configureP2PClientAddresses, createLibP2PPeerIdFromPrivateKey, getPeerIdPrivateKey } from '../util.js'; -export type P2PClientDeps = { +export type P2PClientDeps = { txPool?: TxPoolV2; store?: AztecAsyncKVStore; attestationPool?: AttestationPoolApi; logger?: Logger; txCollectionNodeSources?: TxSource[]; rpcTxProviders?: AztecNode[]; - p2pServiceFactory?: (...args: Parameters<(typeof LibP2PService)['new']>) => Promise>; + p2pServiceFactory?: (...args: Parameters<(typeof LibP2PService)['new']>) => Promise; }; export const P2P_STORE_NAME = 'p2p'; @@ -42,8 +41,7 @@ export const P2P_ARCHIVE_STORE_NAME = 'p2p-archive'; export const P2P_PEER_STORE_NAME = 'p2p-peers'; export const P2P_ATTESTATION_STORE_NAME = 'p2p-attestation'; -export async function createP2PClient( - clientType: T, +export async function createP2PClient( inputConfig: P2PConfig & DataStoreConfig & ChainConfig, archiver: L2BlockSource & ContractDataSource, proofVerifier: ClientProtocolCircuitVerifier, @@ -52,7 +50,7 @@ export async function 
createP2PClient( packageVersion: string, dateProvider: DateProvider = new DateProvider(), telemetry: TelemetryClient = getTelemetryClient(), - deps: P2PClientDeps = {}, + deps: P2PClientDeps = {}, ) { const config = await configureP2PClientAddresses({ ...inputConfig, @@ -111,9 +109,8 @@ export async function createP2PClient( attestationPool: deps.attestationPool ?? new AttestationPool(attestationStore, telemetry), }; - const p2pService = await createP2PService( + const p2pService = await createP2PService( config, - clientType, archiver, proofVerifier, worldStateSynchronizer, @@ -171,7 +168,6 @@ export async function createP2PClient( ); return new P2PClient( - clientType, store, archiver, mempools, @@ -185,9 +181,8 @@ export async function createP2PClient( ); } -async function createP2PService( +async function createP2PService( config: P2PConfig & DataStoreConfig, - clientType: T, archiver: L2BlockSource & ContractDataSource, proofVerifier: ClientProtocolCircuitVerifier, worldStateSynchronizer: WorldStateSynchronizer, @@ -195,7 +190,7 @@ async function createP2PService( store: AztecAsyncKVStore, peerStore: AztecLMDBStoreV2, mempools: MemPools, - p2pServiceFactory: P2PClientDeps['p2pServiceFactory'], + p2pServiceFactory: P2PClientDeps['p2pServiceFactory'], packageVersion: string, logger: Logger, telemetry: TelemetryClient, @@ -211,7 +206,7 @@ async function createP2PService( const peerIdPrivateKey = await getPeerIdPrivateKey(config, store, logger); const peerId = await createLibP2PPeerIdFromPrivateKey(peerIdPrivateKey.getValue()); - const p2pService = await (p2pServiceFactory ?? LibP2PService.new)(clientType, config, peerId, { + const p2pService = await (p2pServiceFactory ?? 
LibP2PService.new)(config, peerId, { packageVersion, mempools, l2BlockSource: archiver, diff --git a/yarn-project/p2p/src/client/interface.ts b/yarn-project/p2p/src/client/interface.ts index 220802ee1b85..3ba683362f39 100644 --- a/yarn-project/p2p/src/client/interface.ts +++ b/yarn-project/p2p/src/client/interface.ts @@ -1,13 +1,7 @@ import type { SlotNumber } from '@aztec/foundation/branded-types'; import type { EthAddress, L2BlockId } from '@aztec/stdlib/block'; -import type { ITxProvider, P2PApiFull } from '@aztec/stdlib/interfaces/server'; -import type { - BlockProposal, - CheckpointAttestation, - CheckpointProposal, - P2PClientType, - TopicType, -} from '@aztec/stdlib/p2p'; +import type { ITxProvider, P2PClient } from '@aztec/stdlib/interfaces/server'; +import type { BlockProposal, CheckpointAttestation, CheckpointProposal, TopicType } from '@aztec/stdlib/p2p'; import type { BlockHeader, Tx, TxHash } from '@aztec/stdlib/tx'; import type { PeerId } from '@libp2p/interface'; @@ -54,7 +48,7 @@ export interface P2PSyncState { /** * Interface of a P2P client. **/ -export type P2P = P2PApiFull & { +export type P2P = P2PClient & { /** * Broadcasts a block proposal to other peers. 
* diff --git a/yarn-project/p2p/src/client/p2p_client.test.ts b/yarn-project/p2p/src/client/p2p_client.test.ts index 9dc1c59982f4..20a41e28ec11 100644 --- a/yarn-project/p2p/src/client/p2p_client.test.ts +++ b/yarn-project/p2p/src/client/p2p_client.test.ts @@ -7,7 +7,6 @@ import type { AztecAsyncKVStore } from '@aztec/kv-store'; import { openTmpStore } from '@aztec/kv-store/lmdb-v2'; import { L2Block } from '@aztec/stdlib/block'; import { EmptyL1RollupConstants, type L1RollupConstants } from '@aztec/stdlib/epoch-helpers'; -import { P2PClientType } from '@aztec/stdlib/p2p'; import { mockTx } from '@aztec/stdlib/testing'; import { TxHash } from '@aztec/stdlib/tx'; @@ -62,17 +61,7 @@ describe('P2P Client', () => { }); const createClient = (config: Partial = {}) => - new P2PClient( - P2PClientType.Full, - kvStore, - blockSource, - mempools, - p2pService, - txCollection, - undefined, - epochCache, - config, - ); + new P2PClient(kvStore, blockSource, mempools, p2pService, txCollection, undefined, epochCache, config); const advanceToProvenBlock = async (blockNumber: BlockNumber) => { blockSource.setProvenBlockNumber(blockNumber); diff --git a/yarn-project/p2p/src/client/p2p_client.ts b/yarn-project/p2p/src/client/p2p_client.ts index 0f6941d213e2..7996594ff9cb 100644 --- a/yarn-project/p2p/src/client/p2p_client.ts +++ b/yarn-project/p2p/src/client/p2p_client.ts @@ -20,13 +20,7 @@ import { import type { ContractDataSource } from '@aztec/stdlib/contract'; import { getTimestampForSlot } from '@aztec/stdlib/epoch-helpers'; import { type PeerInfo, tryStop } from '@aztec/stdlib/interfaces/server'; -import { - type BlockProposal, - CheckpointAttestation, - type CheckpointProposal, - type P2PClientType, - type TopicType, -} from '@aztec/stdlib/p2p'; +import { type BlockProposal, CheckpointAttestation, type CheckpointProposal, type TopicType } from '@aztec/stdlib/p2p'; import type { BlockHeader, Tx, TxHash } from '@aztec/stdlib/tx'; import { Attributes, type TelemetryClient, 
WithTracer, getTelemetryClient, trackSpan } from '@aztec/telemetry-client'; @@ -59,10 +53,7 @@ import { type P2P, P2PClientState, type P2PSyncState } from './interface.js'; /** * The P2P client implementation. */ -export class P2PClient - extends WithTracer - implements P2P, P2P -{ +export class P2PClient extends WithTracer implements P2P { /** The JS promise that will be running to keep the client's data in sync. Can be interrupted if the client is stopped. */ private runningPromise!: Promise; @@ -94,7 +85,6 @@ export class P2PClient private slotMonitor: RunningPromise | undefined; constructor( - _clientType: T, private store: AztecAsyncKVStore, private l2BlockSource: L2BlockSource & ContractDataSource, mempools: MemPools, diff --git a/yarn-project/p2p/src/client/test/tx_proposal_collector/proposal_tx_collector_worker.ts b/yarn-project/p2p/src/client/test/tx_proposal_collector/proposal_tx_collector_worker.ts index 655becaf4173..e1f054b98a02 100644 --- a/yarn-project/p2p/src/client/test/tx_proposal_collector/proposal_tx_collector_worker.ts +++ b/yarn-project/p2p/src/client/test/tx_proposal_collector/proposal_tx_collector_worker.ts @@ -8,7 +8,7 @@ import { openTmpStore } from '@aztec/kv-store/lmdb-v2'; import type { L2BlockSource } from '@aztec/stdlib/block'; import type { ContractDataSource } from '@aztec/stdlib/contract'; import type { ClientProtocolCircuitVerifier } from '@aztec/stdlib/interfaces/server'; -import { P2PClientType, PeerErrorSeverity } from '@aztec/stdlib/p2p'; +import { PeerErrorSeverity } from '@aztec/stdlib/p2p'; import type { Tx, TxValidationResult } from '@aztec/stdlib/tx'; import { type TelemetryClient, getTelemetryClient } from '@aztec/telemetry-client'; @@ -114,7 +114,6 @@ async function startClient(config: P2PConfig, clientIndex: number) { }; client = await createP2PClient( - P2PClientType.Full, config as P2PConfig & DataStoreConfig, l2BlockSource as L2BlockSource & ContractDataSource, proofVerifier as ClientProtocolCircuitVerifier, diff 
--git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/eviction_manager.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/eviction_manager.test.ts index 9ffb86919685..a1fda0cd2532 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/eviction_manager.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/eviction_manager.test.ts @@ -4,7 +4,7 @@ import { BlockHeader } from '@aztec/stdlib/tx'; import { type MockProxy, mock } from 'jest-mock-extended'; -import { type TxMetaData, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, stubTxMetaData } from '../tx_metadata.js'; import { EvictionManager } from './eviction_manager.js'; import { EvictionEvent, @@ -174,19 +174,7 @@ describe('EvictionManager', () => { let preAddRule: MockProxy; let poolAccess: MockProxy; - const createMeta = (txHash: string, priorityFee: bigint): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee, - feePayer: '0xfeepayer', - claimAmount: 0n, - feeLimit: 100n, - nullifiers: [`0x${txHash.slice(2)}null1`], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const createMeta = (txHash: string, priorityFee: bigint): TxMetaData => stubTxMetaData(txHash, { priorityFee }); beforeEach(() => { preAddRule = mock({ name: 'preAddRule' }); @@ -330,19 +318,7 @@ describe('EvictionManager', () => { const preAddRule2 = mock({ name: 'secondRule' }); const poolAccess = mock(); - const createMeta = (txHash: string, priorityFee: bigint): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee, - feePayer: '0xfeepayer', - claimAmount: 0n, - feeLimit: 100n, - nullifiers: [`0x${txHash.slice(2)}null1`], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const createMeta = (txHash: string, priorityFee: bigint): TxMetaData => stubTxMetaData(txHash, { priorityFee }); 
preAddRule1.check.mockRejectedValue(new Error('Rule failed')); preAddRule2.check.mockResolvedValue({ diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_eviction_rule.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_eviction_rule.test.ts index ecb671eb7d80..7341a099f5da 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_eviction_rule.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_eviction_rule.test.ts @@ -7,7 +7,7 @@ import { BlockHeader, GlobalVariables } from '@aztec/stdlib/tx'; import { jest } from '@jest/globals'; import { type MockProxy, mock } from 'jest-mock-extended'; -import { type TxMetaData, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, stubTxMetaData } from '../tx_metadata.js'; import { FeePayerBalanceEvictionRule } from './fee_payer_balance_eviction_rule.js'; import type { EvictionContext, PoolOperations } from './interfaces.js'; import { EvictionEvent } from './interfaces.js'; @@ -33,19 +33,7 @@ describe('FeePayerBalanceEvictionRule', () => { claimAmount?: bigint; feePayer?: string; } = {}, - ): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee: opts.priorityFee ?? 100n, - feePayer: opts.feePayer ?? feePayer1, - claimAmount: opts.claimAmount ?? 0n, - feeLimit: opts.feeLimit ?? 
100n, - nullifiers: [`0x${txHash.slice(2)}null1`], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + ) => stubTxMetaData(txHash, { feePayer: feePayer1, ...opts }); // Create mock pool operations const createPoolOps = (txsByFeePayer: Map): PoolOperations => { @@ -144,8 +132,8 @@ describe('FeePayerBalanceEvictionRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); // Low priority evicted - expect(deleteTxsMock).toHaveBeenCalledWith(['0x1111'], 'FeePayerBalanceEviction'); + expect(result.txsEvicted).toEqual([lowPriorityMeta.txHash]); // Low priority evicted + expect(deleteTxsMock).toHaveBeenCalledWith([lowPriorityMeta.txHash], 'FeePayerBalanceEviction'); }); it('evicts multiple low-priority txs when balance is insufficient', async () => { @@ -160,7 +148,7 @@ describe('FeePayerBalanceEvictionRule', () => { const context: EvictionContext = { event: EvictionEvent.TXS_ADDED, - newTxHashes: ['0x1111', '0x2222', '0x3333'], + newTxHashes: [lowMeta.txHash, medMeta.txHash, highMeta.txHash], feePayers: [feePayer1], }; @@ -168,9 +156,9 @@ describe('FeePayerBalanceEvictionRule', () => { expect(result.success).toBe(true); // Both low and medium priority should be evicted - expect(result.txsEvicted).toContain('0x1111'); - expect(result.txsEvicted).toContain('0x2222'); - expect(result.txsEvicted).not.toContain('0x3333'); + expect(result.txsEvicted).toContain(lowMeta.txHash); + expect(result.txsEvicted).toContain(medMeta.txHash); + expect(result.txsEvicted).not.toContain(highMeta.txHash); }); it('priority ordering is correct - highest priority gets funded first', async () => { @@ -186,15 +174,15 @@ describe('FeePayerBalanceEvictionRule', () => { const context: EvictionContext = { event: EvictionEvent.TXS_ADDED, - newTxHashes: ['0xaaaa', '0xbbbb', '0xcccc'], + newTxHashes: [tx10.txHash, tx50.txHash, tx100.txHash], feePayers: 
[feePayer1], }; const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0xaaaa']); // Only lowest priority evicted - expect(deleteTxsMock).toHaveBeenCalledWith(['0xaaaa'], 'FeePayerBalanceEviction'); + expect(result.txsEvicted).toEqual([tx10.txHash]); // Only lowest priority evicted + expect(deleteTxsMock).toHaveBeenCalledWith([tx10.txHash], 'FeePayerBalanceEviction'); }); it('considers claim amount when calculating available balance', async () => { @@ -249,7 +237,7 @@ describe('FeePayerBalanceEvictionRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x2222']); // Low priority evicted + expect(result.txsEvicted).toEqual([lowMeta.txHash]); // Low priority evicted }); it('handles zero balance', async () => { @@ -268,7 +256,7 @@ describe('FeePayerBalanceEvictionRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); + expect(result.txsEvicted).toEqual([meta.txHash]); }); it('handles empty fee payers list', async () => { @@ -347,7 +335,7 @@ describe('FeePayerBalanceEvictionRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); + expect(result.txsEvicted).toEqual([lowMeta.txHash]); }); }); @@ -396,7 +384,7 @@ describe('FeePayerBalanceEvictionRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); + expect(result.txsEvicted).toEqual([meta.txHash]); }); }); @@ -474,7 +462,7 @@ describe('FeePayerBalanceEvictionRule', () => { const context: EvictionContext = { event: EvictionEvent.TXS_ADDED, - newTxHashes: ['0xaaaa', '0xbbbb', '0xcccc'], + newTxHashes: [tx1.txHash, tx2.txHash, tx3.txHash], feePayers: [feePayer1], }; @@ -482,10 +470,10 @@ 
describe('FeePayerBalanceEvictionRule', () => { expect(result.success).toBe(true); // tx1 (lowest priority) should be evicted - expect(result.txsEvicted).toEqual(['0xaaaa']); + expect(result.txsEvicted).toEqual([tx1.txHash]); // tx2 and tx3 should be kept - expect(result.txsEvicted).not.toContain('0xbbbb'); - expect(result.txsEvicted).not.toContain('0xcccc'); + expect(result.txsEvicted).not.toContain(tx2.txHash); + expect(result.txsEvicted).not.toContain(tx3.txHash); }); it('uses txHash as tiebreaker when priorities are equal', async () => { diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.test.ts index af21423f39e2..5b8fd070dd8f 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.test.ts @@ -1,4 +1,4 @@ -import { type TxMetaData, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, stubTxMetaData } from '../tx_metadata.js'; import { FeePayerBalancePreAddRule } from './fee_payer_balance_pre_add_rule.js'; import { type PreAddPoolAccess, TxPoolRejectionCode } from './interfaces.js'; @@ -14,19 +14,7 @@ describe('FeePayerBalancePreAddRule', () => { claimAmount?: bigint; feePayer?: string; } = {}, - ): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee: opts.priorityFee ?? 100n, - feePayer: opts.feePayer ?? '0xfeepayer', - claimAmount: opts.claimAmount ?? 0n, - feeLimit: opts.feeLimit ?? 
100n, - nullifiers: [`0x${txHash.slice(2)}null1`], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + ) => stubTxMetaData(txHash, opts); // Mock pool access with configurable behavior const createPoolAccess = (balance: bigint, existingTxs: TxMetaData[] = []): PreAddPoolAccess => ({ @@ -127,7 +115,7 @@ describe('FeePayerBalancePreAddRule', () => { const result = await rule.check(incomingMeta, poolAccess); expect(result.shouldIgnore).toBe(false); - expect(result.txHashesToEvict).toContain('0x2222'); + expect(result.txHashesToEvict).toContain(existingMeta.txHash); }); it('evicts multiple lower-priority txs when high-priority tx is added', async () => { @@ -141,8 +129,8 @@ describe('FeePayerBalancePreAddRule', () => { const result = await rule.check(incomingMeta, poolAccess); expect(result.shouldIgnore).toBe(false); - expect(result.txHashesToEvict).toContain('0x2222'); - expect(result.txHashesToEvict).toContain('0x3333'); + expect(result.txHashesToEvict).toContain(existingMeta1.txHash); + expect(result.txHashesToEvict).toContain(existingMeta2.txHash); }); it('handles priority ordering correctly - highest priority gets funded first', async () => { @@ -157,9 +145,9 @@ describe('FeePayerBalancePreAddRule', () => { const result = await rule.check(incomingMeta, poolAccess); expect(result.shouldIgnore).toBe(false); - expect(result.txHashesToEvict).toContain('0x2222'); // Low priority evicted - expect(result.txHashesToEvict).not.toContain('0x4444'); // High priority kept - expect(result.txHashesToEvict).not.toContain('0x3333'); // Med priority kept + expect(result.txHashesToEvict).toContain(lowPriorityMeta.txHash); // Low priority evicted + expect(result.txHashesToEvict).not.toContain(highPriorityMeta.txHash); // High priority kept + expect(result.txHashesToEvict).not.toContain(medPriorityMeta.txHash); // Med priority kept }); }); @@ -245,7 +233,7 @@ describe('FeePayerBalancePreAddRule', () => { 
expect(result.shouldIgnore).toBe(false); expect(result.txHashesToEvict).toHaveLength(1); - expect(result.txHashesToEvict[0]).toEqual('0x2222'); + expect(result.txHashesToEvict[0]).toEqual(existingMeta.txHash); }); it('returns empty eviction list when no evictions needed', async () => { diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.ts index 0cdebb94948d..fee5bf80d4e5 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/fee_payer_balance_pre_add_rule.ts @@ -35,6 +35,7 @@ export class FeePayerBalancePreAddRule implements PreAddRule { // Create combined list with incoming tx const allTxs: Array<{ txHash: string; + txHashBigInt: bigint; priorityFee: bigint; feeLimit: bigint; claimAmount: bigint; @@ -42,6 +43,7 @@ export class FeePayerBalancePreAddRule implements PreAddRule { }> = [ ...existingTxs.map(t => ({ txHash: t.txHash, + txHashBigInt: t.txHashBigInt, priorityFee: t.priorityFee, feeLimit: t.feeLimit, claimAmount: t.claimAmount, @@ -49,6 +51,7 @@ export class FeePayerBalancePreAddRule implements PreAddRule { })), { txHash: incomingMeta.txHash, + txHashBigInt: incomingMeta.txHashBigInt, priorityFee: incomingMeta.priorityFee, feeLimit: incomingMeta.feeLimit, claimAmount: incomingMeta.claimAmount, diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_mining_rule.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_mining_rule.test.ts index 8c16d90d9c00..73cc569a6111 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_mining_rule.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_mining_rule.test.ts @@ -3,7 +3,7 @@ import { BlockHeader } from '@aztec/stdlib/tx'; import { jest } from '@jest/globals'; -import { type 
TxMetaData, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, stubTxMetaData } from '../tx_metadata.js'; import type { EvictionContext, PoolOperations } from './interfaces.js'; import { EvictionEvent } from './interfaces.js'; import { InvalidTxsAfterMiningRule } from './invalid_txs_after_mining_rule.js'; @@ -24,23 +24,7 @@ describe('InvalidTxsAfterMiningRule', () => { nullifiers?: string[]; expirationTimestamp?: bigint; } = {}, - ): TxMetaData => { - const nullifiers = opts.nullifiers ?? [`0x${txHash.slice(2)}null1`]; - const expirationTimestamp = opts.expirationTimestamp ?? DEFAULT_EXPIRATION_TIMESTAMP; - return { - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee: 100n, - feePayer: '0xfeepayer', - claimAmount: 0n, - feeLimit: 100n, - nullifiers, - expirationTimestamp, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData({ expirationTimestamp }), - }; - }; + ) => stubTxMetaData(txHash, { expirationTimestamp: DEFAULT_EXPIRATION_TIMESTAMP, ...opts }); // Create mock pool operations const createPoolOps = (pendingTxs: TxMetaData[]): PoolOperations => { @@ -122,8 +106,8 @@ describe('InvalidTxsAfterMiningRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); // Only tx1 has duplicate nullifier - expect(deleteTxsMock).toHaveBeenCalledWith(['0x1111'], 'InvalidTxsAfterMining'); + expect(result.txsEvicted).toEqual([tx1.txHash]); // Only tx1 has duplicate nullifier + expect(deleteTxsMock).toHaveBeenCalledWith([tx1.txHash], 'InvalidTxsAfterMining'); }); it('evicts transactions with expired timestamps', async () => { @@ -142,8 +126,8 @@ describe('InvalidTxsAfterMiningRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); // Only tx1 is expired - expect(deleteTxsMock).toHaveBeenCalledWith(['0x1111'], 'InvalidTxsAfterMining'); + 
expect(result.txsEvicted).toEqual([tx1.txHash]); // Only tx1 is expired + expect(deleteTxsMock).toHaveBeenCalledWith([tx1.txHash], 'InvalidTxsAfterMining'); }); it('evicts transactions with timestamp equal to block timestamp', async () => { @@ -162,8 +146,8 @@ describe('InvalidTxsAfterMiningRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); // tx1 has timestamp <= block timestamp - expect(deleteTxsMock).toHaveBeenCalledWith(['0x1111'], 'InvalidTxsAfterMining'); + expect(result.txsEvicted).toEqual([tx1.txHash]); // tx1 has timestamp <= block timestamp + expect(deleteTxsMock).toHaveBeenCalledWith([tx1.txHash], 'InvalidTxsAfterMining'); }); it('handles transactions with both duplicate nullifiers and expired timestamps', async () => { @@ -182,8 +166,8 @@ describe('InvalidTxsAfterMiningRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); - expect(deleteTxsMock).toHaveBeenCalledWith(['0x1111'], 'InvalidTxsAfterMining'); + expect(result.txsEvicted).toEqual([tx1.txHash]); + expect(deleteTxsMock).toHaveBeenCalledWith([tx1.txHash], 'InvalidTxsAfterMining'); }); it('handles empty pending transactions list', async () => { @@ -222,7 +206,7 @@ describe('InvalidTxsAfterMiningRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toEqual(['0x1111']); + expect(result.txsEvicted).toEqual([tx1.txHash]); }); it('evicts all matching transactions when multiple share nullifiers with mined block', async () => { @@ -242,9 +226,9 @@ describe('InvalidTxsAfterMiningRule', () => { const result = await rule.evict(context, pool); expect(result.success).toBe(true); - expect(result.txsEvicted).toContain('0x1111'); - expect(result.txsEvicted).toContain('0x2222'); - expect(result.txsEvicted).not.toContain('0x3333'); + 
expect(result.txsEvicted).toContain(tx1.txHash); + expect(result.txsEvicted).toContain(tx2.txHash); + expect(result.txsEvicted).not.toContain(tx3.txHash); }); it('handles error from deleteTxs operation', async () => { diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_reorg_rule.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_reorg_rule.test.ts index c7ab0ab474a6..c8ccb70890bc 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_reorg_rule.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/invalid_txs_after_reorg_rule.test.ts @@ -7,7 +7,7 @@ import { BlockHeader } from '@aztec/stdlib/tx'; import { jest } from '@jest/globals'; import { type MockProxy, mock } from 'jest-mock-extended'; -import { type TxMetaData, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, stubTxMetaData } from '../tx_metadata.js'; import type { EvictionContext, PoolOperations } from './interfaces.js'; import { EvictionEvent } from './interfaces.js'; import { InvalidTxsAfterReorgRule } from './invalid_txs_after_reorg_rule.js'; @@ -21,19 +21,8 @@ describe('InvalidTxsAfterReorgRule', () => { let deleteTxsMock: jest.MockedFunction; // Helper to create TxMetaData for testing - const createMeta = (txHash: string, anchorBlockHeaderHash: string): TxMetaData => ({ - txHash, - anchorBlockHeaderHash, - priorityFee: 100n, - feePayer: '0xfeepayer', - claimAmount: 0n, - feeLimit: 100n, - nullifiers: [`0x${txHash.slice(2)}null1`], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const createMeta = (txHash: string, anchorBlockHeaderHash: string) => + stubTxMetaData(txHash, { anchorBlockHeaderHash }); // Create mock pool operations const createPoolOps = (pendingTxs: TxMetaData[]): PoolOperations => { @@ -134,8 +123,8 @@ describe('InvalidTxsAfterReorgRule', () => { expect(result.success).toBe(true); // Both txs 
reference pruned blocks (default mock returns undefined) - expect(result.txsEvicted).toContain('0x1111'); - expect(result.txsEvicted).toContain('0x2222'); + expect(result.txsEvicted).toContain(tx1.txHash); + expect(result.txsEvicted).toContain(tx2.txHash); // Ensure syncImmediate is called before accessing the world state snapshot expect(worldState.syncImmediate).toHaveBeenCalledWith(); }); @@ -201,9 +190,9 @@ describe('InvalidTxsAfterReorgRule', () => { expect(result.success).toBe(true); expect(result.txsEvicted).toHaveLength(3); - expect(result.txsEvicted).toContain('0x1111'); - expect(result.txsEvicted).toContain('0x2222'); - expect(result.txsEvicted).toContain('0x3333'); + expect(result.txsEvicted).toContain(tx1.txHash); + expect(result.txsEvicted).toContain(tx2.txHash); + expect(result.txsEvicted).toContain(tx3.txHash); // Only one unique block hash to look up expect(db.findLeafIndices).toHaveBeenCalledTimes(1); const calledHashes = db.findLeafIndices.mock.calls[0][1] as Fr[]; @@ -232,9 +221,9 @@ describe('InvalidTxsAfterReorgRule', () => { expect(result.success).toBe(true); expect(result.txsEvicted).toHaveLength(1); - expect(result.txsEvicted).toContain('0x2222'); - expect(result.txsEvicted).not.toContain('0x1111'); - expect(result.txsEvicted).not.toContain('0x3333'); + expect(result.txsEvicted).toContain(tx2.txHash); + expect(result.txsEvicted).not.toContain(tx1.txHash); + expect(result.txsEvicted).not.toContain(tx3.txHash); }); it('handles mix of shared and unique block hashes with some valid and some pruned', async () => { @@ -263,11 +252,11 @@ describe('InvalidTxsAfterReorgRule', () => { expect(result.success).toBe(true); expect(result.txsEvicted).toHaveLength(3); - expect(result.txsEvicted).toContain('0x3333'); - expect(result.txsEvicted).toContain('0x4444'); - expect(result.txsEvicted).toContain('0x5555'); - expect(result.txsEvicted).not.toContain('0x1111'); - expect(result.txsEvicted).not.toContain('0x2222'); + 
expect(result.txsEvicted).toContain(tx3.txHash); + expect(result.txsEvicted).toContain(tx4.txHash); + expect(result.txsEvicted).toContain(tx5.txHash); + expect(result.txsEvicted).not.toContain(tx1.txHash); + expect(result.txsEvicted).not.toContain(tx2.txHash); expect(db.findLeafIndices).toHaveBeenCalledTimes(1); const calledHashes = db.findLeafIndices.mock.calls[0][1] as Fr[]; expect(calledHashes).toHaveLength(3); diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/low_priority_pre_add_rule.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/low_priority_pre_add_rule.test.ts index b7173ba8bdbe..fd66a0df4aee 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/low_priority_pre_add_rule.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/low_priority_pre_add_rule.test.ts @@ -1,4 +1,4 @@ -import { type TxMetaData, comparePriority, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, comparePriority, stubTxMetaData } from '../tx_metadata.js'; import { type PreAddContext, type PreAddPoolAccess, TxPoolRejectionCode } from './interfaces.js'; import { LowPriorityPreAddRule } from './low_priority_pre_add_rule.js'; @@ -6,19 +6,7 @@ describe('LowPriorityPreAddRule', () => { let rule: LowPriorityPreAddRule; // Helper to create TxMetaData for testing - const createMeta = (txHash: string, priorityFee: bigint): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee, - feePayer: '0xfeepayer', - claimAmount: 0n, - feeLimit: 100n, - nullifiers: [`0x${txHash.slice(2)}null1`], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const createMeta = (txHash: string, priorityFee: bigint) => stubTxMetaData(txHash, { priorityFee }); // Mock pool access with configurable behavior const createPoolAccess = (pendingCount: number, lowestPriorityTx?: TxMetaData): PreAddPoolAccess => ({ @@ -88,7 +76,7 @@ describe('LowPriorityPreAddRule', () => { 
const result = await rule.check(incomingMeta, poolAccess); expect(result.shouldIgnore).toBe(false); - expect(result.txHashesToEvict).toContain('0x2222'); + expect(result.txHashesToEvict).toContain(lowestPriorityMeta.txHash); expect(result.txHashesToEvict).toHaveLength(1); }); @@ -199,12 +187,12 @@ describe('LowPriorityPreAddRule', () => { // Without feeOnly const result1 = await rule.check(incomingMeta, poolAccess); expect(result1.shouldIgnore).toBe(false); - expect(result1.txHashesToEvict).toContain('0x2222'); + expect(result1.txHashesToEvict).toContain(lowestPriorityMeta.txHash); // With feeOnly const result2 = await rule.check(incomingMeta, poolAccess, { feeComparisonOnly: true }); expect(result2.shouldIgnore).toBe(false); - expect(result2.txHashesToEvict).toContain('0x2222'); + expect(result2.txHashesToEvict).toContain(lowestPriorityMeta.txHash); }); it('lower fee is always ignored regardless of feeOnly flag', async () => { diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/nullifier_conflict_rule.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/nullifier_conflict_rule.test.ts index f5f08c1ece36..5108966f9047 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/nullifier_conflict_rule.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/eviction/nullifier_conflict_rule.test.ts @@ -1,4 +1,4 @@ -import { type TxMetaData, stubTxMetaValidationData } from '../tx_metadata.js'; +import { type TxMetaData, stubTxMetaData } from '../tx_metadata.js'; import type { PreAddPoolAccess } from './interfaces.js'; import { NullifierConflictRule } from './nullifier_conflict_rule.js'; @@ -7,23 +7,8 @@ describe('NullifierConflictRule', () => { let rule: NullifierConflictRule; // Helper to create TxMetaData for testing - const createMeta = ( - txHash: string, - priorityFee: bigint, - nullifiers: string[] = [`0x${txHash.slice(2)}null1`], - ): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x1234', - priorityFee, - feePayer: '0xfeepayer', - 
claimAmount: 0n, - feeLimit: 1000n, - nullifiers, - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const createMeta = (txHash: string, priorityFee: bigint, nullifiers?: string[]) => + stubTxMetaData(txHash, { priorityFee, feeLimit: 1000n, ...(nullifiers ? { nullifiers } : {}) }); // Mock pool access with configurable behavior const createPoolAccess = ( diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/index.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/index.ts index 391e7edccab0..cee49474dcb3 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/index.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/index.ts @@ -7,6 +7,6 @@ export { type PoolReadAccess, DEFAULT_TX_POOL_V2_CONFIG, } from './interfaces.js'; -export { type TxMetaData, type TxState, buildTxMetaData, comparePriority } from './tx_metadata.js'; +export { type TxMetaData, type TxState, buildTxMetaData, comparePriority, stubTxMetaData } from './tx_metadata.js'; export { TxArchive } from './archive/index.js'; export { DeletedPool } from './deleted_pool.js'; diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.test.ts index a2e6172c2922..d139138d5489 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.test.ts @@ -1,13 +1,7 @@ import { mockTx } from '@aztec/stdlib/testing'; import { TxPoolRejectionCode } from './eviction/interfaces.js'; -import { - type TxMetaData, - buildTxMetaData, - checkNullifierConflict, - comparePriority, - stubTxMetaValidationData, -} from './tx_metadata.js'; +import { buildTxMetaData, checkNullifierConflict, comparePriority, stubTxMetaData } from './tx_metadata.js'; describe('TxMetaData', () => { describe('buildTxMetaData', () => { @@ -16,6 +10,7 @@ describe('TxMetaData', () => { const meta = await buildTxMetaData(tx); 
expect(meta.txHash).toBe(tx.getTxHash().toString()); + expect(meta.txHashBigInt).toBe(tx.getTxHash().toBigInt()); expect(meta.anchorBlockHeaderHash).toBe((await tx.data.constants.anchorBlockHeader.hash()).toString()); expect(meta.feePayer).toBe(tx.data.feePayer.toString()); expect(meta.expirationTimestamp).toBe(tx.data.expirationTimestamp); @@ -68,19 +63,7 @@ describe('TxMetaData', () => { }); describe('comparePriority', () => { - const makeMeta = (fee: bigint, txHash = '0x1234'): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x5678', - priorityFee: fee, - feePayer: '0xabcd', - claimAmount: 0n, - feeLimit: 1000n, - nullifiers: [], - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const makeMeta = (fee: bigint, txHash = '0x1234') => stubTxMetaData(txHash, { priorityFee: fee, nullifiers: [] }); it('returns negative when first has lower priority fee', () => { expect(comparePriority(makeMeta(100n), makeMeta(200n))).toBe(-1); @@ -102,19 +85,8 @@ describe('TxMetaData', () => { }); describe('checkNullifierConflict', () => { - const makeMeta = (txHash: string, priorityFee: bigint, nullifiers: string[]): TxMetaData => ({ - txHash, - anchorBlockHeaderHash: '0x5678', - priorityFee, - feePayer: '0xabcd', - claimAmount: 0n, - feeLimit: 1000n, - nullifiers, - expirationTimestamp: 0n, - receivedAt: 0, - estimatedSizeBytes: 0, - data: stubTxMetaValidationData(), - }); + const makeMeta = (txHash: string, priorityFee: bigint, nullifiers: string[]) => + stubTxMetaData(txHash, { priorityFee, nullifiers }); it('returns no conflict when nullifiers do not overlap', () => { const incoming = makeMeta('0x1111', 100n, ['0xnull1', '0xnull2']); diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.ts index 45a452c6086d..316f551bcc6c 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.ts +++ 
b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_metadata.ts @@ -3,7 +3,7 @@ import { Fr } from '@aztec/foundation/curves/bn254'; import { ProtocolContractAddress } from '@aztec/protocol-contracts'; import { BlockHash, type L2BlockId } from '@aztec/stdlib/block'; import { Gas } from '@aztec/stdlib/gas'; -import type { Tx } from '@aztec/stdlib/tx'; +import { type Tx, TxHash } from '@aztec/stdlib/tx'; import { getFeePayerBalanceDelta } from '../../msg_validators/tx_validator/fee_payer_balance.js'; import { getTxPriorityFee } from '../tx_pool/priority.js'; @@ -40,6 +40,9 @@ export type TxMetaData = { /** The transaction hash as hex string */ readonly txHash: string; + /** The transaction hash as bigint (for efficient Fr conversion in comparisons) */ + readonly txHashBigInt: bigint; + /** Block ID (number and hash) in which the transaction was mined (undefined if not mined) */ minedL2BlockId?: L2BlockId; @@ -83,7 +86,9 @@ export type TxState = 'pending' | 'protected' | 'mined' | 'deleted'; * Fr values are captured in closures for zero-cost re-validation. 
*/ export async function buildTxMetaData(tx: Tx): Promise { - const txHash = tx.getTxHash().toString(); + const txHashObj = tx.getTxHash(); + const txHash = txHashObj.toString(); + const txHashBigInt = txHashObj.toBigInt(); const nullifierFrs = tx.data.getNonEmptyNullifiers(); const nullifiers = nullifierFrs.map(n => n.toString()); const anchorBlockHeaderHashFr = await tx.data.constants.anchorBlockHeader.hash(); @@ -99,6 +104,7 @@ export async function buildTxMetaData(tx: Tx): Promise { return { txHash, + txHashBigInt, anchorBlockHeaderHash, priorityFee, feePayer, @@ -134,11 +140,11 @@ const HEX_STRING_BYTES = 98; const BIGINT_BYTES = 32; const FR_BYTES = 80; // Fixed cost: object shell + txHash + anchorBlockHeaderHash + feePayer (3 hex strings) -// + priorityFee + claimAmount + feeLimit + includeByTimestamp (4 bigints) +// + txHashBigInt + priorityFee + claimAmount + feeLimit + includeByTimestamp (5 bigints) // + receivedAt (number, 8 bytes) + estimatedSizeBytes (number, 8 bytes) // + data closure object (~OBJECT_OVERHEAD + anchorBlockHeaderHashFr Fr + anchorBlockNumber number) const FIXED_METADATA_BYTES = - OBJECT_OVERHEAD + 3 * HEX_STRING_BYTES + 4 * BIGINT_BYTES + 8 + 8 + OBJECT_OVERHEAD + FR_BYTES + 8; + OBJECT_OVERHEAD + 3 * HEX_STRING_BYTES + 5 * BIGINT_BYTES + 8 + 8 + OBJECT_OVERHEAD + FR_BYTES + 8; /** Estimates the in-memory size of a TxMetaData object based on the number of nullifiers. */ function estimateTxMetaDataSize(nullifierCount: number): number { @@ -146,8 +152,13 @@ function estimateTxMetaDataSize(nullifierCount: number): number { return FIXED_METADATA_BYTES + nullifierCount * (HEX_STRING_BYTES + FR_BYTES); } +/** Converts a txHash bigint back to the canonical 0x-prefixed 64-char hex string. */ +export function txHashFromBigInt(value: bigint): string { + return TxHash.fromBigInt(value).toString(); +} + /** Minimal fields required for priority comparison. 
*/ -type PriorityComparable = Pick; +type PriorityComparable = Pick; /** * Compares two priority fees in ascending order. @@ -162,10 +173,8 @@ export function compareFee(a: bigint, b: bigint): number { * Uses field element comparison for deterministic ordering. * Returns negative if a < b, positive if a > b, 0 if equal. */ -export function compareTxHash(a: string, b: string): number { - const fieldA = Fr.fromHexString(a); - const fieldB = Fr.fromHexString(b); - return fieldA.cmp(fieldB); +export function compareTxHash(a: bigint, b: bigint): number { + return Fr.cmpAsBigInt(a, b); } /** @@ -178,7 +187,7 @@ export function comparePriority(a: PriorityComparable, b: PriorityComparable): n if (feeComparison !== 0) { return feeComparison; } - return compareTxHash(a.txHash, b.txHash); + return compareTxHash(a.txHashBigInt, b.txHashBigInt); } /** @@ -253,3 +262,36 @@ export function stubTxMetaValidationData(overrides: { expirationTimestamp?: bigi }, }; } + +/** Creates a stub TxMetaData for tests. All fields have sensible defaults and can be overridden. */ +export function stubTxMetaData( + txHash: string, + overrides: { + priorityFee?: bigint; + feePayer?: string; + claimAmount?: bigint; + feeLimit?: bigint; + nullifiers?: string[]; + expirationTimestamp?: bigint; + anchorBlockHeaderHash?: string; + } = {}, +): TxMetaData { + const txHashBigInt = Fr.fromHexString(txHash).toBigInt(); + // Normalize to canonical zero-padded hex so txHashFromBigInt(txHashBigInt) === normalizedTxHash + const normalizedTxHash = txHashFromBigInt(txHashBigInt); + const expirationTimestamp = overrides.expirationTimestamp ?? 0n; + return { + txHash: normalizedTxHash, + txHashBigInt, + anchorBlockHeaderHash: overrides.anchorBlockHeaderHash ?? '0x1234', + priorityFee: overrides.priorityFee ?? 100n, + feePayer: overrides.feePayer ?? '0xfeepayer', + claimAmount: overrides.claimAmount ?? 0n, + feeLimit: overrides.feeLimit ?? 100n, + nullifiers: overrides.nullifiers ?? 
[`0x${normalizedTxHash.slice(2)}null1`], + expirationTimestamp, + receivedAt: 0, + estimatedSizeBytes: 0, + data: stubTxMetaValidationData({ expirationTimestamp }), + }; +} diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_indices.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_indices.ts index a9a368dce37c..42dd87db5cbf 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_indices.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_indices.ts @@ -1,7 +1,7 @@ import { SlotNumber } from '@aztec/foundation/branded-types'; import type { L2BlockId } from '@aztec/stdlib/block'; -import { type TxMetaData, type TxState, compareFee, compareTxHash } from './tx_metadata.js'; +import { type TxMetaData, type TxState, compareFee, compareTxHash, txHashFromBigInt } from './tx_metadata.js'; /** * Manages in-memory indices for the transaction pool. @@ -22,8 +22,8 @@ export class TxPoolIndices { #nullifierToTxHash: Map = new Map(); /** Fee payer to txHashes index (pending txs only) */ #feePayerToTxHashes: Map> = new Map(); - /** Pending txHashes grouped by priority fee */ - #pendingByPriority: Map> = new Map(); + /** Pending txHash bigints grouped by priority fee */ + #pendingByPriority: Map> = new Map(); /** Protected transactions: txHash -> slotNumber */ #protectedTransactions: Map = new Map(); @@ -73,17 +73,17 @@ export class TxPoolIndices { * @param order - 'desc' for highest priority first, 'asc' for lowest priority first */ *iteratePendingByPriority(order: 'asc' | 'desc', filter?: (hash: string) => boolean): Generator { - // Use compareFee from tx_metadata, swap args for descending order const feeCompareFn = order === 'desc' ? (a: bigint, b: bigint) => compareFee(b, a) : compareFee; - const hashCompareFn = order === 'desc' ? (a: string, b: string) => compareTxHash(b, a) : compareTxHash; + const hashCompareFn = + order === 'desc' ? 
(a: bigint, b: bigint) => compareTxHash(b, a) : (a: bigint, b: bigint) => compareTxHash(a, b); const sortedFees = [...this.#pendingByPriority.keys()].sort(feeCompareFn); for (const fee of sortedFees) { const hashesAtFee = this.#pendingByPriority.get(fee)!; - // Use compareTxHash from tx_metadata, swap args for descending order const sortedHashes = [...hashesAtFee].sort(hashCompareFn); - for (const hash of sortedHashes) { + for (const hashBigInt of sortedHashes) { + const hash = txHashFromBigInt(hashBigInt); if (filter === undefined || filter(hash)) { yield hash; } @@ -265,8 +265,8 @@ export class TxPoolIndices { getPendingTxs(): TxMetaData[] { const result: TxMetaData[] = []; for (const hashSet of this.#pendingByPriority.values()) { - for (const txHash of hashSet) { - const meta = this.#metadata.get(txHash); + for (const txHashBigInt of hashSet) { + const meta = this.#metadata.get(txHashFromBigInt(txHashBigInt)); if (meta) { result.push(meta); } @@ -414,7 +414,7 @@ export class TxPoolIndices { prioritySet = new Set(); this.#pendingByPriority.set(meta.priorityFee, prioritySet); } - prioritySet.add(meta.txHash); + prioritySet.add(meta.txHashBigInt); } #removeFromPendingIndices(meta: TxMetaData): void { @@ -435,7 +435,7 @@ export class TxPoolIndices { // Remove from priority map const hashSet = this.#pendingByPriority.get(meta.priorityFee); if (hashSet) { - hashSet.delete(meta.txHash); + hashSet.delete(meta.txHashBigInt); if (hashSet.size === 0) { this.#pendingByPriority.delete(meta.priorityFee); } diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2.test.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2.test.ts index aa0b9af8affa..f778f6bb3f14 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2.test.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2.test.ts @@ -5269,4 +5269,121 @@ describe('TxPoolV2', () => { expect(await pool.getTxStatus(tx.getTxHash())).toBeUndefined(); }); }); + + describe('persistence consistency', () 
=> { + it('pool state is consistent across restart when getTxEffect throws for a later tx in batch', async () => { + const testStore = await openTmpStore('p2p-comeback-gettxeffect'); + const testArchiveStore = await openTmpStore('archive-comeback-gettxeffect'); + + try { + const pool1 = new AztecKVTxPoolV2(testStore, testArchiveStore, { + l2BlockSource: mockL2BlockSource, + worldStateSynchronizer: mockWorldState, + createTxValidator: () => Promise.resolve(alwaysValidValidator), + }); + await pool1.start(); + + // Add tx1 (fee=5) with a nullifier + const tx1 = await mockPublicTx(1, 5); + await pool1.addPendingTxs([tx1]); + expect(await pool1.getTxStatus(tx1.getTxHash())).toBe('pending'); + + // Create tx2 (same nullifier as tx1, higher fee — will evict tx1) and tx3 (different nullifiers) + const tx2 = await mockPublicTx(2, 10); + setNullifier(tx2, 0, getNullifier(tx1, 0)); + const tx3 = await mockPublicTx(3, 1); + + // Mock getTxEffect to throw for tx3 (simulates L2BlockSource I/O failure) + const tx3HashStr = tx3.getTxHash().toString(); + mockL2BlockSource.getTxEffect.mockImplementation((txHash: TxHash) => { + if (txHash.toString() === tx3HashStr) { + throw new Error('Simulated L2BlockSource failure'); + } + return Promise.resolve(undefined); + }); + + // Batch fails because tx3's getMinedBlockId throws + await expect(pool1.addPendingTxs([tx2, tx3])).rejects.toThrow('Simulated L2BlockSource failure'); + + const statusBeforeRestart = await pool1.getTxStatus(tx1.getTxHash()); + + await pool1.stop(); + mockL2BlockSource.getTxEffect.mockResolvedValue(undefined); + + const pool2 = new AztecKVTxPoolV2(testStore, testArchiveStore, { + l2BlockSource: mockL2BlockSource, + worldStateSynchronizer: mockWorldState, + createTxValidator: () => Promise.resolve(alwaysValidValidator), + }); + await pool2.start(); + + const statusAfterRestart = await pool2.getTxStatus(tx1.getTxHash()); + expect(statusAfterRestart).toBe(statusBeforeRestart); + + await pool2.stop(); + } finally { + 
mockL2BlockSource.getTxEffect.mockResolvedValue(undefined); + await testStore.delete(); + await testArchiveStore.delete(); + } + }); + + it('pool state is consistent across restart when validateMeta throws for a later tx in batch', async () => { + const testStore = await openTmpStore('p2p-comeback-validatemeta'); + const testArchiveStore = await openTmpStore('archive-comeback-validatemeta'); + + try { + // Create a validator that throws (not rejects) for tx3 + let tx3HashStr = ''; + const throwingValidator: TxValidator = { + validateTx: (meta: TxMetaData) => { + if (meta.txHash === tx3HashStr) { + throw new Error('Simulated validator crash'); + } + return Promise.resolve({ result: 'valid' }); + }, + }; + + const pool1 = new AztecKVTxPoolV2(testStore, testArchiveStore, { + l2BlockSource: mockL2BlockSource, + worldStateSynchronizer: mockWorldState, + createTxValidator: () => Promise.resolve(throwingValidator), + }); + await pool1.start(); + + // Add tx1 (fee=5) with a nullifier + const tx1 = await mockPublicTx(1, 5); + await pool1.addPendingTxs([tx1]); + expect(await pool1.getTxStatus(tx1.getTxHash())).toBe('pending'); + + // Create tx2 (same nullifier as tx1, higher fee — will evict tx1) and tx3 (different nullifiers) + const tx2 = await mockPublicTx(2, 10); + setNullifier(tx2, 0, getNullifier(tx1, 0)); + const tx3 = await mockPublicTx(3, 1); + tx3HashStr = tx3.getTxHash().toString(); + + // Batch fails because tx3's validateMeta throws + await expect(pool1.addPendingTxs([tx2, tx3])).rejects.toThrow('Simulated validator crash'); + + const statusBeforeRestart = await pool1.getTxStatus(tx1.getTxHash()); + + await pool1.stop(); + + const pool2 = new AztecKVTxPoolV2(testStore, testArchiveStore, { + l2BlockSource: mockL2BlockSource, + worldStateSynchronizer: mockWorldState, + createTxValidator: () => Promise.resolve(alwaysValidValidator), + }); + await pool2.start(); + + const statusAfterRestart = await pool2.getTxStatus(tx1.getTxHash()); + 
expect(statusAfterRestart).toBe(statusBeforeRestart); + + await pool2.stop(); + } finally { + await testStore.delete(); + await testArchiveStore.delete(); + } + }); + }); }); diff --git a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2_impl.ts b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2_impl.ts index 94e4f224dc94..88f6e887b9a8 100644 --- a/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2_impl.ts +++ b/yarn-project/p2p/src/mem_pools/tx_pool_v2/tx_pool_v2_impl.ts @@ -187,6 +187,30 @@ export class TxPoolV2Impl { const errors = new Map(); const acceptedPending = new Set(); + // Phase 1: Pre-compute all throwable I/O outside the transaction. + // If any pre-computation throws, the entire call fails before mutations happen. + const precomputed = new Map(); + + const validator = await this.#createTxValidator(); + + for (const tx of txs) { + const txHash = tx.getTxHash(); + const txHashStr = txHash.toString(); + + const meta = await buildTxMetaData(tx); + const minedBlockId = await this.#getMinedBlockId(txHash); + + // Validate non-mined txs (mined and pre-protected txs bypass validation inside the transaction) + let isValid = true; + if (!minedBlockId) { + isValid = await this.#validateMeta(meta, validator); + } + + precomputed.set(txHashStr, { meta, minedBlockId, isValid }); + } + + // Phase 2: Apply mutations inside the transaction using only pre-computed results, + // in-memory reads, and buffered DB writes. Nothing here can throw an unhandled exception. const poolAccess = this.#createPreAddPoolAccess(); const preAddContext: PreAddContext | undefined = opts.feeComparisonOnly !== undefined ? 
{ feeComparisonOnly: opts.feeComparisonOnly } : undefined; @@ -202,22 +226,25 @@ export class TxPoolV2Impl { continue; } - // Check mined status first (applies to all paths) - const minedBlockId = await this.#getMinedBlockId(txHash); + const { meta, minedBlockId, isValid } = precomputed.get(txHashStr)!; const preProtectedSlot = this.#indices.getProtectionSlot(txHashStr); if (minedBlockId) { // Already mined - add directly (protection already set if pre-protected) - await this.#addTx(tx, { mined: minedBlockId }, opts); + await this.#addTx(tx, { mined: minedBlockId }, opts, meta); accepted.push(txHash); } else if (preProtectedSlot !== undefined) { // Pre-protected and not mined - add as protected (bypass validation) - await this.#addTx(tx, { protected: preProtectedSlot }, opts); + await this.#addTx(tx, { protected: preProtectedSlot }, opts, meta); accepted.push(txHash); + } else if (!isValid) { + // Failed pre-computed validation + rejected.push(txHash); } else { - // Regular pending tx - validate and run pre-add rules + // Regular pending tx - run pre-add rules using pre-computed metadata const result = await this.#tryAddRegularPendingTx( tx, + meta, opts, poolAccess, acceptedPending, @@ -227,8 +254,6 @@ export class TxPoolV2Impl { ); if (result.status === 'accepted') { acceptedPending.add(txHashStr); - } else if (result.status === 'rejected') { - rejected.push(txHash); } else { ignored.push(txHash); } @@ -259,27 +284,21 @@ export class TxPoolV2Impl { return { accepted, ignored, rejected, ...(errors.size > 0 ? { errors } : {}) }; } - /** Validates and adds a regular pending tx. Returns status. */ + /** Adds a validated pending tx, running pre-add rules and evicting conflicts. 
*/ async #tryAddRegularPendingTx( tx: Tx, + precomputedMeta: TxMetaData, opts: { source?: string }, poolAccess: PreAddPoolAccess, acceptedPending: Set, ignored: TxHash[], errors: Map, preAddContext?: PreAddContext, - ): Promise<{ status: 'accepted' | 'ignored' | 'rejected' }> { - const txHash = tx.getTxHash(); - const txHashStr = txHash.toString(); - - // Build metadata and validate using metadata - const meta = await buildTxMetaData(tx); - if (!(await this.#validateMeta(meta))) { - return { status: 'rejected' }; - } + ): Promise<{ status: 'accepted' | 'ignored' }> { + const txHashStr = tx.getTxHash().toString(); // Run pre-add rules - const preAddResult = await this.#evictionManager.runPreAddRules(meta, poolAccess, preAddContext); + const preAddResult = await this.#evictionManager.runPreAddRules(precomputedMeta, poolAccess, preAddContext); if (preAddResult.shouldIgnore) { this.#log.debug(`Ignoring tx ${txHashStr}: ${preAddResult.reason?.message ?? 'unknown reason'}`); @@ -323,7 +342,7 @@ export class TxPoolV2Impl { } // Add the transaction - await this.#addTx(tx, 'pending', opts); + await this.#addTx(tx, 'pending', opts, precomputedMeta); return { status: 'accepted' }; } @@ -765,9 +784,10 @@ export class TxPoolV2Impl { tx: Tx, state: 'pending' | { protected: SlotNumber } | { mined: L2BlockId }, opts: { source?: string } = {}, + precomputedMeta?: TxMetaData, ): Promise { const txHashStr = tx.getTxHash().toString(); - const meta = await buildTxMetaData(tx); + const meta = precomputedMeta ?? 
(await buildTxMetaData(tx)); meta.receivedAt = this.#dateProvider.now(); await this.#txsDB.set(txHashStr, tx.toBuffer()); diff --git a/yarn-project/p2p/src/services/encoding.bench.test.ts b/yarn-project/p2p/src/services/encoding.bench.test.ts new file mode 100644 index 000000000000..06da50451304 --- /dev/null +++ b/yarn-project/p2p/src/services/encoding.bench.test.ts @@ -0,0 +1,129 @@ +import { asyncPool } from '@aztec/foundation/async-pool'; +import { randomBytes } from '@aztec/foundation/crypto/random'; +import { sha256 } from '@aztec/foundation/crypto/sha256'; +import { MAX_L2_BLOCK_SIZE_KB, MAX_MESSAGE_SIZE_KB, MAX_TX_SIZE_KB } from '@aztec/stdlib/p2p'; + +import { createHash } from 'node:crypto'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { type RecordableHistogram, createHistogram } from 'node:perf_hooks'; + +const HASH_COUNT = 20; +const TOPIC = '/aztec/tx/0.1.0'; + +const MESSAGE_SIZES_KB = [1, 64, MAX_TX_SIZE_KB, MAX_L2_BLOCK_SIZE_KB, MAX_MESSAGE_SIZE_KB] as const; + +type SizeKb = (typeof MESSAGE_SIZES_KB)[number]; + +const CONCURRENCY_LEVELS = [1, 4] as const; +type CaseKey = `${SizeKb}-${(typeof CONCURRENCY_LEVELS)[number]}`; + +const NS_PER_MS = 1e6; + +const CASES = MESSAGE_SIZES_KB.flatMap(s => CONCURRENCY_LEVELS.map(c => [s, c] as const)); + +describe('P2P Message ID: Benchmarks', () => { + let hashJsHistograms: Record; + let nodeCryptoHistograms: Record; + let subtleHistograms: Record; + + let messageData: Record; + + beforeAll(() => { + const allKeys = CASES.map(([s, c]) => `${s}-${c}` as CaseKey); + hashJsHistograms = Object.fromEntries(allKeys.map(k => [k, { h: createHistogram(), total: 0 }])) as any; + nodeCryptoHistograms = Object.fromEntries(allKeys.map(k => [k, { h: createHistogram(), total: 0 }])) as any; + subtleHistograms = Object.fromEntries(allKeys.map(k => [k, { h: createHistogram(), total: 0 }])) as any; + + messageData = Object.fromEntries(MESSAGE_SIZES_KB.map(sizeKb => [sizeKb, randomBytes(sizeKb * 
1024)])) as any; + }); + + afterAll(async () => { + const implementations = [ + { key: 'hash.js', label: 'hashJs.sha256 x' + HASH_COUNT, histograms: hashJsHistograms }, + { key: 'node-crypto', label: 'crypto.createHash x' + HASH_COUNT, histograms: nodeCryptoHistograms }, + { key: 'web-crypto', label: 'globalThis.crypto.subtle.digest x' + HASH_COUNT, histograms: subtleHistograms }, + ]; + + const data: { name: string; value: number; unit: string }[] = []; + for (const [sizeKb, concurrency] of CASES) { + const key: CaseKey = `${sizeKb}-${concurrency}`; + for (const impl of implementations) { + const { h, total } = impl.histograms[key]; + data.push({ name: `MsgId/${impl.key}/x${concurrency}/${sizeKb}kb/avg`, value: h.mean, unit: 'ms' }); + data.push({ name: `MsgId/${impl.key}/x${concurrency}/${sizeKb}kb/p50`, value: h.percentile(50), unit: 'ms' }); + data.push({ name: `MsgId/${impl.key}/x${concurrency}/${sizeKb}kb/p99`, value: h.percentile(99), unit: 'ms' }); + data.push({ name: `MsgId/${impl.key}/x${concurrency}/${sizeKb}kb/sum`, value: total, unit: 'ms' }); + } + } + + if (process.env.BENCH_OUTPUT) { + await fs.mkdir(path.dirname(process.env.BENCH_OUTPUT), { recursive: true }); + await fs.writeFile(process.env.BENCH_OUTPUT, JSON.stringify(data, null, 2)); + } else if (process.env.BENCH_OUTPUT_MD) { + await fs.mkdir(path.dirname(process.env.BENCH_OUTPUT_MD), { recursive: true }); + await using f = await fs.open(process.env.BENCH_OUTPUT_MD, 'w'); + await f.write('| Function | CONCURRENCY | Size (KB) | Avg (ms) | P50 (ms) | P99 (ms) | TOTAL (ms) |\n'); + await f.write('|----------|---|-----------|----------|----------|----------|------------|\n'); + for (const [sizeKb, concurrency] of CASES) { + const key: CaseKey = `${sizeKb}-${concurrency}`; + for (const impl of implementations) { + const { h, total } = impl.histograms[key]; + await f.write( + `| ${impl.label} | ${concurrency} | ${sizeKb} | ${h.mean} | ${h.percentile(50)} | ${h.percentile(99)} | ${total} |\n`, + ); + 
} + } + } + }); + + it.each(CASES)('hash.js sha256: %d KB x%d', async (sizeKb, concurrency) => { + const data = messageData[sizeKb as SizeKb]; + const key: CaseKey = `${sizeKb}-${concurrency}`; + const res = hashJsHistograms[key]; + + const testStart = process.hrtime.bigint(); + await asyncPool(concurrency, Array(HASH_COUNT), () => { + const start = process.hrtime.bigint(); + sha256(Buffer.concat([Buffer.from(TOPIC), data])).subarray(0, 20); + const elapsed = Number(process.hrtime.bigint() - start) / NS_PER_MS; + res.h.record(Math.trunc(Math.max(1, elapsed))); + return Promise.resolve(); + }); + res.total = Number(process.hrtime.bigint() - testStart) / NS_PER_MS; + }); + + it.each(CASES)('node:crypto createHash: %d KB x%d', async (sizeKb, concurrency) => { + const data = messageData[sizeKb as SizeKb]; + const key: CaseKey = `${sizeKb}-${concurrency}`; + const res = nodeCryptoHistograms[key]; + + const testStart = process.hrtime.bigint(); + await asyncPool(concurrency, Array(HASH_COUNT), () => { + const start = process.hrtime.bigint(); + createHash('sha256').update(TOPIC).update(data).digest().subarray(0, 20); + const elapsed = Number(process.hrtime.bigint() - start) / NS_PER_MS; + res.h.record(Math.trunc(Math.max(1, elapsed))); + return Promise.resolve(); + }); + res.total = Number(process.hrtime.bigint() - testStart) / NS_PER_MS; + }); + + it.each(CASES)('crypto.subtle.digest parallel: %d KB x%d', async (sizeKb, concurrency) => { + const data = messageData[sizeKb as SizeKb]; + const concat = Buffer.concat([Buffer.from(TOPIC), data]); + const key: CaseKey = `${sizeKb}-${concurrency}`; + const res = subtleHistograms[key]; + + const testStart = process.hrtime.bigint(); + + await asyncPool(concurrency, Array(HASH_COUNT), async () => { + const start = process.hrtime.bigint(); + await crypto.subtle.digest('SHA-256', concat).then(buf => Buffer.from(buf).subarray(0, 20)); + const elapsed = Number(process.hrtime.bigint() - start) / NS_PER_MS; + 
res.h.record(Math.trunc(Math.max(1, elapsed))); + }); + + res.total = Number(process.hrtime.bigint() - testStart) / NS_PER_MS; + }); +}); diff --git a/yarn-project/p2p/src/services/encoding.ts b/yarn-project/p2p/src/services/encoding.ts index 9a4d610c4fa5..0aea0032d158 100644 --- a/yarn-project/p2p/src/services/encoding.ts +++ b/yarn-project/p2p/src/services/encoding.ts @@ -1,11 +1,11 @@ // Taken from lodestar: https://github.com/ChainSafe/lodestar -import { sha256 } from '@aztec/foundation/crypto/sha256'; import { createLogger } from '@aztec/foundation/log'; import { MAX_TX_SIZE_KB, TopicType, getTopicFromString } from '@aztec/stdlib/p2p'; import type { RPC } from '@chainsafe/libp2p-gossipsub/message'; import type { DataTransform } from '@chainsafe/libp2p-gossipsub/types'; import type { Message } from '@libp2p/interface'; +import { webcrypto } from 'node:crypto'; import { compressSync, uncompressSync } from 'snappy'; import xxhashFactory from 'xxhash-wasm'; @@ -44,11 +44,10 @@ export function msgIdToStrFn(msgId: Uint8Array): string { * @param message - The libp2p message * @returns The message identifier */ -export function getMsgIdFn(message: Message) { - const { topic } = message; - - const vec = [Buffer.from(topic), message.data]; - return sha256(Buffer.concat(vec)).subarray(0, 20); +export async function getMsgIdFn({ topic, data }: Message): Promise { + const buffer = Buffer.concat([Buffer.from(topic), data]); + const hash = await webcrypto.subtle.digest('SHA-256', buffer); + return Buffer.from(hash.slice(0, 20)); } const DefaultMaxSizesKb: Record = { diff --git a/yarn-project/p2p/src/services/libp2p/libp2p_service.test.ts b/yarn-project/p2p/src/services/libp2p/libp2p_service.test.ts index d9bf686d8f70..a0e812112a30 100644 --- a/yarn-project/p2p/src/services/libp2p/libp2p_service.test.ts +++ b/yarn-project/p2p/src/services/libp2p/libp2p_service.test.ts @@ -9,7 +9,7 @@ import { openTmpStore } from '@aztec/kv-store/lmdb'; import { L2Block, type L2BlockSource } 
from '@aztec/stdlib/block'; import type { ContractDataSource } from '@aztec/stdlib/contract'; import type { ClientProtocolCircuitVerifier } from '@aztec/stdlib/interfaces/server'; -import { BlockProposal, P2PClientType, PeerErrorSeverity } from '@aztec/stdlib/p2p'; +import { BlockProposal, PeerErrorSeverity } from '@aztec/stdlib/p2p'; import { makeBlockHeader, makeBlockProposal, @@ -1118,7 +1118,6 @@ class TestLibP2PService extends LibP2PService { }); super( - P2PClientType.Full, mockConfig, node, mockPeerDiscoveryService, diff --git a/yarn-project/p2p/src/services/libp2p/libp2p_service.ts b/yarn-project/p2p/src/services/libp2p/libp2p_service.ts index 7215f2bc449e..0122f322ebc0 100644 --- a/yarn-project/p2p/src/services/libp2p/libp2p_service.ts +++ b/yarn-project/p2p/src/services/libp2p/libp2p_service.ts @@ -16,13 +16,12 @@ import { CheckpointProposal, type CheckpointProposalCore, type Gossipable, - P2PClientType, P2PMessage, type ValidationResult as P2PValidationResult, PeerErrorSeverity, TopicType, createTopicString, - getTopicsForClientAndConfig, + getTopicsForConfig, metricsTopicStrToLabels, } from '@aztec/stdlib/p2p'; import { MerkleTreeId } from '@aztec/stdlib/trees'; @@ -135,7 +134,7 @@ type ReceivedMessageValidationResult = /** * Lib P2P implementation of the P2PService interface. */ -export class LibP2PService extends WithTracer implements P2PService { +export class LibP2PService extends WithTracer implements P2PService { private discoveryRunningPromise?: RunningPromise; private msgIdSeenValidators: Record = {} as Record; @@ -182,7 +181,6 @@ export class LibP2PService extends protected logger: Logger; constructor( - private clientType: T, private config: P2PConfig, protected node: PubSubLibp2p, private peerDiscoveryService: PeerDiscoveryService, @@ -262,8 +260,7 @@ export class LibP2PService extends * @param txPool - The transaction pool to be accessed by the service. * @returns The new service. 
*/ - public static async new( - clientType: T, + public static async new( config: P2PConfig, peerId: PeerId, deps: { @@ -475,7 +472,6 @@ export class LibP2PService extends peerManager.shouldDisableP2PGossip(peerId) ? -Infinity : peerManager.getPeerScore(peerId); return new LibP2PService( - clientType, config, node, peerDiscoveryService, @@ -549,7 +545,7 @@ export class LibP2PService extends await this.node.start(); // Subscribe to standard GossipSub topics by default - for (const topic of getTopicsForClientAndConfig(this.clientType, this.config.disableTransactions)) { + for (const topic of getTopicsForConfig(this.config.disableTransactions)) { this.subscribeToTopic(this.topicStrings[topic]); } @@ -818,9 +814,7 @@ export class LibP2PService extends if (msg.topic === this.topicStrings[TopicType.tx]) { await this.handleGossipedTx(p2pMessage.payload, msgId, source); } else if (msg.topic === this.topicStrings[TopicType.checkpoint_attestation]) { - if (this.clientType === P2PClientType.Full) { - await this.processCheckpointAttestationFromPeer(p2pMessage.payload, msgId, source); - } + await this.processCheckpointAttestationFromPeer(p2pMessage.payload, msgId, source); } else if (msg.topic === this.topicStrings[TopicType.block_proposal]) { await this.processBlockFromPeer(p2pMessage.payload, msgId, source); } else if (msg.topic === this.topicStrings[TopicType.checkpoint_proposal]) { diff --git a/yarn-project/p2p/src/services/reqresp/batch-tx-requester/batch_tx_requester.test.ts b/yarn-project/p2p/src/services/reqresp/batch-tx-requester/batch_tx_requester.test.ts index 9cd2997d3a2a..bc7234438f72 100644 --- a/yarn-project/p2p/src/services/reqresp/batch-tx-requester/batch_tx_requester.test.ts +++ b/yarn-project/p2p/src/services/reqresp/batch-tx-requester/batch_tx_requester.test.ts @@ -607,8 +607,11 @@ describe('BatchTxRequester', () => { expect(peerCollection.getBadPeers()).toContain(peers[0].toString()); expect(peerCollection.getBadPeers()).not.toContain(peers[1].toString()); 
- // Verify bad peer is excluded from queries - peer0 should be in bad peers - expect(peerCollection.nextDumbPeerToQuery()).toBeUndefined(); + // Verify bad peer is excluded from dumb queries. + // The good peer can still be temporarily in-flight when run() returns, so we only assert + // that peer0 is never sampled from the currently available dumb peers. + const dumbPeersToQuery = sampleAllPeers(peerCollection.nextDumbPeerToQuery.bind(peerCollection)) ?? []; + expect(dumbPeersToQuery).not.toContain(peers[0].toString()); }); it('should recover bad peer after successful response', async () => { diff --git a/yarn-project/p2p/src/services/reqresp/reqresp.ts b/yarn-project/p2p/src/services/reqresp/reqresp.ts index 7d544395230d..0f9fec565566 100644 --- a/yarn-project/p2p/src/services/reqresp/reqresp.ts +++ b/yarn-project/p2p/src/services/reqresp/reqresp.ts @@ -627,7 +627,9 @@ export class ReqResp implements ReqRespInterface { // and that this stream should be dropped const isMessageToNotWarn = err instanceof Error && - ['stream reset', 'Cannot push value onto an ended pushable'].some(msg => err.message.includes(msg)); + ['stream reset', 'Cannot push value onto an ended pushable', 'read ECONNRESET'].some(msg => + err.message.includes(msg), + ); const level = isMessageToNotWarn ? 
'debug' : 'warn'; this.logger[level]('Unknown stream error while handling the stream, aborting', { protocol, diff --git a/yarn-project/p2p/src/test-helpers/make-test-p2p-clients.ts b/yarn-project/p2p/src/test-helpers/make-test-p2p-clients.ts index b0c69cb7ebe5..1bb554f79b06 100644 --- a/yarn-project/p2p/src/test-helpers/make-test-p2p-clients.ts +++ b/yarn-project/p2p/src/test-helpers/make-test-p2p-clients.ts @@ -7,7 +7,6 @@ import { sleep } from '@aztec/foundation/sleep'; import type { DataStoreConfig } from '@aztec/kv-store/config'; import { openTmpStore } from '@aztec/kv-store/lmdb-v2'; import type { WorldStateSynchronizer } from '@aztec/stdlib/interfaces/server'; -import { P2PClientType } from '@aztec/stdlib/p2p'; import { createP2PClient } from '../client/index.js'; import type { P2PClient } from '../client/p2p_client.js'; @@ -98,7 +97,6 @@ export async function makeTestP2PClient( const kvStore = await openTmpStore('test'); const client = await createP2PClient( - P2PClientType.Full, config, l2BlockSource, proofVerifier, diff --git a/yarn-project/p2p/src/test-helpers/mock-pubsub.ts b/yarn-project/p2p/src/test-helpers/mock-pubsub.ts index 03d50870945d..cf48654e0aff 100644 --- a/yarn-project/p2p/src/test-helpers/mock-pubsub.ts +++ b/yarn-project/p2p/src/test-helpers/mock-pubsub.ts @@ -4,7 +4,6 @@ import type { AztecAsyncKVStore } from '@aztec/kv-store'; import type { L2BlockSource } from '@aztec/stdlib/block'; import type { ContractDataSource } from '@aztec/stdlib/contract'; import type { ClientProtocolCircuitVerifier, WorldStateSynchronizer } from '@aztec/stdlib/interfaces/server'; -import { P2PClientType } from '@aztec/stdlib/p2p'; import type { TelemetryClient } from '@aztec/telemetry-client'; import type { GossipsubEvents, GossipsubMessage } from '@chainsafe/libp2p-gossipsub'; @@ -42,11 +41,10 @@ type GossipSubService = PubSubLibp2p['services']['pubsub']; * Given a mock gossip sub network, returns a factory function that creates an instance LibP2PService 
connected to it. * Designed to be used in tests in P2PClientDeps.p2pServiceFactory. */ -export function getMockPubSubP2PServiceFactory( +export function getMockPubSubP2PServiceFactory( network: MockGossipSubNetwork, -): (...args: Parameters<(typeof LibP2PService)['new']>) => Promise> { +): (...args: Parameters<(typeof LibP2PService)['new']>) => Promise { return ( - clientType: P2PClientType, config: P2PConfig, peerId: PeerId, deps: { @@ -66,8 +64,7 @@ export function getMockPubSubP2PServiceFactory( const peerManager = new DummyPeerManager(peerId, network); const reqresp: ReqRespInterface = new MockReqResp(peerId, network); const peerDiscoveryService = new DummyPeerDiscoveryService(); - const service = new LibP2PService( - clientType as T, + const service = new LibP2PService( config, libp2p, peerDiscoveryService, diff --git a/yarn-project/p2p/src/test-helpers/reqresp-nodes.ts b/yarn-project/p2p/src/test-helpers/reqresp-nodes.ts index a3c0fe5443b9..f0e6f04232e2 100644 --- a/yarn-project/p2p/src/test-helpers/reqresp-nodes.ts +++ b/yarn-project/p2p/src/test-helpers/reqresp-nodes.ts @@ -12,7 +12,6 @@ import type { IVCProofVerificationResult, WorldStateSynchronizer, } from '@aztec/stdlib/interfaces/server'; -import type { P2PClientType } from '@aztec/stdlib/p2p'; import type { Tx } from '@aztec/stdlib/tx'; import { compressComponentVersions } from '@aztec/stdlib/versioning'; import { type TelemetryClient, getTelemetryClient } from '@aztec/telemetry-client'; @@ -107,8 +106,7 @@ export async function createLibp2pNode( * * */ -export async function createTestLibP2PService( - clientType: T, +export async function createTestLibP2PService( boostrapAddrs: string[] = [], archiver: L2BlockSource & ContractDataSource, worldStateSynchronizer: WorldStateSynchronizer, @@ -159,8 +157,7 @@ export async function createTestLibP2PService( p2pNode.services.pubsub.score.params.appSpecificScore = (peerId: string) => peerManager.shouldDisableP2PGossip(peerId) ? 
-Infinity : peerManager.getPeerScore(peerId); - return new LibP2PService( - clientType, + return new LibP2PService( config, p2pNode as PubSubLibp2p, discoveryService, diff --git a/yarn-project/p2p/src/testbench/p2p_client_testbench_worker.ts b/yarn-project/p2p/src/testbench/p2p_client_testbench_worker.ts index a44fa2808dea..dc2fa88beb89 100644 --- a/yarn-project/p2p/src/testbench/p2p_client_testbench_worker.ts +++ b/yarn-project/p2p/src/testbench/p2p_client_testbench_worker.ts @@ -19,7 +19,7 @@ import { protocolContractsHash } from '@aztec/protocol-contracts'; import type { L2BlockSource } from '@aztec/stdlib/block'; import type { ContractDataSource } from '@aztec/stdlib/contract'; import type { ClientProtocolCircuitVerifier, WorldStateSynchronizer } from '@aztec/stdlib/interfaces/server'; -import { type BlockProposal, P2PClientType, P2PMessage } from '@aztec/stdlib/p2p'; +import { type BlockProposal, P2PMessage } from '@aztec/stdlib/p2p'; import { ChonkProof } from '@aztec/stdlib/proofs'; import { makeAztecAddress, makeBlockHeader, makeBlockProposal, mockTx } from '@aztec/stdlib/testing'; import { Tx, TxHash, type TxValidationResult } from '@aztec/stdlib/tx'; @@ -86,12 +86,11 @@ export interface BenchReadyMessage { } const txCache = new Map(); -class TestLibP2PService extends LibP2PService { +class TestLibP2PService extends LibP2PService { private disableTxValidation: boolean; private gossipMessageCount = 0; constructor( - clientType: T, config: P2PConfig, node: PubSubLibp2p, peerDiscoveryService: PeerDiscoveryService, @@ -107,7 +106,6 @@ class TestLibP2PService extends Li disableTxValidation = true, ) { super( - clientType, config, node, peerDiscoveryService, @@ -365,7 +363,6 @@ process.on('message', async msg => { }; const client = await createP2PClient( - P2PClientType.Full, config as P2PConfig & DataStoreConfig, l2BlockSource, proofVerifier as ClientProtocolCircuitVerifier, @@ -378,7 +375,6 @@ process.on('message', async msg => { ); const testService = new 
TestLibP2PService( - P2PClientType.Full, config, (client as any).p2pService.node, (client as any).p2pService.peerDiscoveryService, diff --git a/yarn-project/sequencer-client/src/sequencer/timetable.test.ts b/yarn-project/sequencer-client/src/sequencer/timetable.test.ts index c247cfddf3b1..9fc63d1e7be1 100644 --- a/yarn-project/sequencer-client/src/sequencer/timetable.test.ts +++ b/yarn-project/sequencer-client/src/sequencer/timetable.test.ts @@ -1,3 +1,4 @@ +import { createLogger } from '@aztec/foundation/log'; import { MIN_EXECUTION_TIME } from '@aztec/stdlib/timetable'; import { SequencerTimetable } from './timetable.js'; @@ -5,6 +6,7 @@ import { SequencerState } from './utils.js'; describe('sequencer-timetable', () => { let timetable: SequencerTimetable; + const logger = createLogger('sequencer-timetable-test'); const ETHEREUM_SLOT_DURATION = 12; const AZTEC_SLOT_DURATION = 36; @@ -287,16 +289,18 @@ describe('sequencer-timetable', () => { describe('maxNumberOfBlocks calculation', () => { it.each([ - { aztecSlot: 36, blockDuration: 8000 }, - { aztecSlot: 72, blockDuration: 8000 }, - { aztecSlot: 120, blockDuration: 10000 }, + { aztecSlot: 36, blockDuration: 8000, publishTime: L1_PUBLISHING_TIME }, + { aztecSlot: 72, blockDuration: 8000, publishTime: L1_PUBLISHING_TIME }, + { aztecSlot: 120, blockDuration: 10000, publishTime: L1_PUBLISHING_TIME }, + { aztecSlot: 72, blockDuration: 6000, publishTime: 36 }, + { aztecSlot: 72, blockDuration: 6000, publishTime: 24 }, ])( - 'should calculate max blocks with aztecSlot=$aztecSlot blockDuration=$blockDuration)', - ({ aztecSlot, blockDuration }) => { + 'should calculate max blocks with aztecSlot=$aztecSlot blockDuration=$blockDuration publishTime=$publishTime)', + ({ aztecSlot, blockDuration, publishTime }) => { const tt = new SequencerTimetable({ ethereumSlotDuration: ETHEREUM_SLOT_DURATION, aztecSlotDuration: aztecSlot, - l1PublishingTime: L1_PUBLISHING_TIME, + l1PublishingTime: publishTime, blockDurationMs: 
blockDuration, enforce: ENFORCE_TIMETABLE, }); @@ -311,6 +315,9 @@ describe('sequencer-timetable', () => { const result2 = tt.canStartNextBlock(20); expect(result2.canStart).toBe(true); } + logger.info( + `AztecSlot: ${aztecSlot}, BlockDuration: ${blockDuration}, PublishTime: ${publishTime}, MaxBlocks: ${tt.maxNumberOfBlocks}\n\n`, + ); }, ); diff --git a/yarn-project/sequencer-client/src/sequencer/timetable.ts b/yarn-project/sequencer-client/src/sequencer/timetable.ts index 505979f95af7..86b88a1ba99f 100644 --- a/yarn-project/sequencer-client/src/sequencer/timetable.ts +++ b/yarn-project/sequencer-client/src/sequencer/timetable.ts @@ -132,7 +132,7 @@ export class SequencerTimetable { const initializeDeadline = this.aztecSlotDuration - minWorkToDo; this.initializeDeadline = initializeDeadline; - this.log.verbose( + this.log.info( `Sequencer timetable initialized with ${this.maxNumberOfBlocks} blocks per slot (${this.enforce ? 'enforced' : 'not enforced'})`, { ethereumSlotDuration: this.ethereumSlotDuration, diff --git a/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.test.ts b/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.test.ts index 9d6e5beeebac..1ba698b4b307 100644 --- a/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.test.ts +++ b/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.test.ts @@ -755,6 +755,60 @@ describe('L2BlockStream', () => { ]); }); + describe('startingBlock with stale checkpoint state', () => { + // When a node restarts with startingBlock set and has local blocks but no checkpoint + // state (e.g. checkpoint tracking is new, or checkpoint state was reset), Loop 1 + // should not spam checkpoint events for all historical checkpoints. + + it('skips historical checkpoint events before startingBlock on restart with stale checkpoint state', async () => { + // node has blocks 1-15 locally (proposed=15) but no checkpoint state. + // Checkpoint 5 covers blocks 13-15 (the last checkpoint). 
+ setRemoteTipsMultiBlock(15, 15); + localData.proposed.number = BlockNumber(15); + // localData.checkpointed starts at 0 - simulating stale/missing checkpoint state + + blockStream = new TestL2BlockStream(blockSource, localData, handler, undefined, { + batchSize: 10, + startingBlock: 13, // start from checkpoint 5 (blocks 13-15) + }); + + await blockStream.work(); + + // Should only emit checkpoint 5 (the one containing startingBlock=13), not all 5 checkpoints + expect(handler.events).toEqual([expectCheckpointed(5)]); + // Verify we don't spam checkpoints 1-4 + const checkpointEvents = handler.events.filter(e => e.type === 'chain-checkpointed'); + expect(checkpointEvents).toHaveLength(1); + }); + + it('without startingBlock emits all historical checkpoints for already-local blocks', async () => { + // Same scenario without startingBlock: should emit all 5 checkpoints (correct catch-up behavior) + setRemoteTipsMultiBlock(15, 15); + localData.proposed.number = BlockNumber(15); + // localData.checkpointed starts at 0 + + await blockStream.work(); + + // All 5 checkpoints should be emitted since they're all for already-local blocks + const checkpointEvents = handler.events.filter(e => e.type === 'chain-checkpointed'); + expect(checkpointEvents).toHaveLength(5); + }); + + it('does not call getCheckpointedBlocks(0) when startingBlock is 0', async () => { + // getCheckpointedBlocks rejects block 0 + setRemoteTipsMultiBlock(15, 15); + blockStream = new TestL2BlockStream(blockSource, localData, handler, undefined, { + batchSize: 10, + startingBlock: 0, + }); + + await blockStream.work(); + + const calls = blockSource.getCheckpointedBlocks.mock.calls; + expect(calls.every(([blockNum]) => blockNum >= 1)).toBe(true); + }); + }); + describe('checkpoint prefetching', () => { it('prefetches multiple checkpoints in a single RPC call', async () => { // Set up: 9 blocks in 3 checkpoints diff --git a/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.ts 
b/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.ts index dae14e6fb962..9aac6c45b143 100644 --- a/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.ts +++ b/yarn-project/stdlib/src/block/l2_block_stream/l2_block_stream.ts @@ -109,6 +109,27 @@ export class L2BlockStream { let nextBlockNumber = latestBlockNumber + 1; let nextCheckpointToEmit = CheckpointNumber(localTips.checkpointed.checkpoint.number + 1); + + // When startingBlock is set, also skip ahead for checkpoints. + if ( + this.opts.startingBlock !== undefined && + this.opts.startingBlock >= 1 && + nextCheckpointToEmit <= sourceTips.checkpointed.checkpoint.number + ) { + const startingBlockCheckpoints = await this.l2BlockSource.getCheckpointedBlocks( + BlockNumber(this.opts.startingBlock), + 1, + ); + if (startingBlockCheckpoints.length > 0) { + nextCheckpointToEmit = CheckpointNumber( + Math.max(nextCheckpointToEmit, startingBlockCheckpoints[0].checkpointNumber), + ); + } else { + // startingBlock is past all checkpointed blocks; skip Loop 1 entirely. + nextCheckpointToEmit = CheckpointNumber(sourceTips.checkpointed.checkpoint.number + 1); + } + } + if (this.opts.skipFinalized) { // When skipping finalized blocks we need to provide reliable reorg detection while fetching as few blocks as // possible. Finalized blocks cannot be reorged by definition, so we can skip most of them. 
We do need the very diff --git a/yarn-project/stdlib/src/interfaces/p2p.ts b/yarn-project/stdlib/src/interfaces/p2p.ts index fe0b5517bcff..301cbb4f9fee 100644 --- a/yarn-project/stdlib/src/interfaces/p2p.ts +++ b/yarn-project/stdlib/src/interfaces/p2p.ts @@ -3,7 +3,6 @@ import type { SlotNumber } from '@aztec/foundation/branded-types'; import { z } from 'zod'; import { CheckpointAttestation } from '../p2p/checkpoint_attestation.js'; -import type { P2PClientType } from '../p2p/client_type.js'; import { type ApiSchemaFor, optional, schemas } from '../schemas/index.js'; import { Tx } from '../tx/tx.js'; import { TxHash } from '../tx/tx_hash.js'; @@ -27,7 +26,7 @@ const PeerInfoSchema = z.discriminatedUnion('status', [ ]); /** Exposed API to the P2P module. */ -export interface P2PApiWithoutAttestations { +export interface P2PApi { /** * Returns all pending transactions in the transaction pool. * @param limit - The number of items to returns @@ -48,9 +47,7 @@ export interface P2PApiWithoutAttestations { * Returns info for all connected, dialing, and cached peers. */ getPeers(includePending?: boolean): Promise; -} -export interface P2PApiWithAttestations extends P2PApiWithoutAttestations { /** * Queries the Attestation pool for checkpoint attestations for the given slot * @@ -61,19 +58,11 @@ export interface P2PApiWithAttestations extends P2PApiWithoutAttestations { getCheckpointAttestationsForSlot(slot: SlotNumber, proposalId?: string): Promise; } -export interface P2PClient extends P2PApiWithAttestations { +export interface P2PClient extends P2PApi { /** Manually adds checkpoint attestations to the p2p client attestation pool. */ addOwnCheckpointAttestations(attestations: CheckpointAttestation[]): Promise; } -export type P2PApi = T extends P2PClientType.Full - ? P2PApiWithAttestations - : P2PApiWithoutAttestations; - -export type P2PApiFull = T extends P2PClientType.Full - ? 
P2PApiWithAttestations & P2PClient - : P2PApiWithoutAttestations; - export const P2PApiSchema: ApiSchemaFor = { getCheckpointAttestationsForSlot: z .function() diff --git a/yarn-project/stdlib/src/p2p/client_type.ts b/yarn-project/stdlib/src/p2p/client_type.ts index 75d1fea547c1..e69de29bb2d1 100644 --- a/yarn-project/stdlib/src/p2p/client_type.ts +++ b/yarn-project/stdlib/src/p2p/client_type.ts @@ -1,6 +0,0 @@ -export enum P2PClientType { - // Full p2p clients will subscribe to all gossip topics - Full, - // Prove p2p clients will only subscribe to transaction and proving topics - Prover, -} diff --git a/yarn-project/stdlib/src/p2p/index.ts b/yarn-project/stdlib/src/p2p/index.ts index c057784472ac..590f7b4e60f6 100644 --- a/yarn-project/stdlib/src/p2p/index.ts +++ b/yarn-project/stdlib/src/p2p/index.ts @@ -8,7 +8,6 @@ export * from './interface.js'; export * from './signature_utils.js'; export * from './signed_txs.js'; export * from './topic_type.js'; -export * from './client_type.js'; export * from './message_validator.js'; export * from './peer_error.js'; export * from './constants.js'; diff --git a/yarn-project/stdlib/src/p2p/topic_type.ts b/yarn-project/stdlib/src/p2p/topic_type.ts index 949ec2c6173b..107298f8dc38 100644 --- a/yarn-project/stdlib/src/p2p/topic_type.ts +++ b/yarn-project/stdlib/src/p2p/topic_type.ts @@ -1,5 +1,3 @@ -import { P2PClientType } from './client_type.js'; - /** * Creates the topic channel identifier string from a given topic type */ @@ -27,19 +25,14 @@ export enum TopicType { checkpoint_attestation = 'checkpoint_attestation', } -export function getTopicTypeForClientType(clientType: P2PClientType) { - if (clientType === P2PClientType.Full) { - return [TopicType.tx, TopicType.block_proposal, TopicType.checkpoint_proposal, TopicType.checkpoint_attestation]; - } else if (clientType === P2PClientType.Prover) { - return [TopicType.tx, TopicType.block_proposal, TopicType.checkpoint_proposal]; - } else { - const _: never = clientType; - 
return [TopicType.tx]; - } -} - -export function getTopicsForClientAndConfig(clientType: P2PClientType, disableTransactions: boolean) { - const topics = getTopicTypeForClientType(clientType); +/** Returns all gossip topics, optionally filtering out transactions. */ +export function getTopicsForConfig(disableTransactions: boolean) { + const topics = [ + TopicType.tx, + TopicType.block_proposal, + TopicType.checkpoint_proposal, + TopicType.checkpoint_attestation, + ]; if (disableTransactions) { return topics.filter(topic => topic !== TopicType.tx); } diff --git a/yarn-project/stdlib/src/p2p/topics.test.ts b/yarn-project/stdlib/src/p2p/topics.test.ts index 79c395f0801b..78556986802a 100644 --- a/yarn-project/stdlib/src/p2p/topics.test.ts +++ b/yarn-project/stdlib/src/p2p/topics.test.ts @@ -1,16 +1,13 @@ -import { P2PClientType } from './client_type.js'; -import { TopicType, getTopicFromString, getTopicsForClientAndConfig } from './topic_type.js'; +import { TopicType, getTopicFromString, getTopicsForConfig } from './topic_type.js'; describe('Gossip topic retrieval', () => { it.each([ - [P2PClientType.Full, ['tx', 'block_proposal', 'checkpoint_proposal', 'checkpoint_attestation'], true], - [P2PClientType.Prover, ['tx', 'block_proposal', 'checkpoint_proposal'], true], - [P2PClientType.Full, ['block_proposal', 'checkpoint_proposal', 'checkpoint_attestation'], false], - [P2PClientType.Prover, ['block_proposal', 'checkpoint_proposal'], false], + [['tx', 'block_proposal', 'checkpoint_proposal', 'checkpoint_attestation'], true], + [['block_proposal', 'checkpoint_proposal', 'checkpoint_attestation'], false], ])( - 'Node type %s subscribes to topics %s with transactions enabled: %s', - (clientType: P2PClientType, expectedTopics: string[], transactionsEnabled: boolean) => { - expect(getTopicsForClientAndConfig(clientType, !transactionsEnabled)).toEqual(expectedTopics); + 'subscribes to topics %s with transactions enabled: %s', + (expectedTopics: string[], transactionsEnabled: 
boolean) => { + expect(getTopicsForConfig(!transactionsEnabled)).toEqual(expectedTopics); }, ); });