diff --git a/ci3/ci-metrics/app.py b/ci3/ci-metrics/app.py index c62875e7d19a..e2925d9ae3b0 100644 --- a/ci3/ci-metrics/app.py +++ b/ci3/ci-metrics/app.py @@ -6,6 +6,7 @@ import os import re import redis +import time import threading from pathlib import Path @@ -37,14 +38,30 @@ def verify_password(username, password): def _init(): - """Initialize SQLite and start background threads.""" + """Initialize SQLite, warm caches, and start background threads.""" try: db.get_db() metrics.start_test_listener(r) + metrics.start_phase_listener(r) metrics.start_ci_run_sync(r) + github_data.start_merge_queue_poller() + github_data.start_pr_dirs_worker() print("[ci-metrics] Background threads started") except Exception as e: print(f"[ci-metrics] Warning: startup failed: {e}") + # Warm billing caches so first request isn't slow + try: + from billing.gcp import _ensure_cached as _warm_gcp + _warm_gcp() + print("[ci-metrics] GCP billing cache warmed") + except Exception as e: + print(f"[ci-metrics] GCP billing warmup failed: {e}") + try: + from billing.aws import _ensure_cached as _warm_aws + _warm_aws() + print("[ci-metrics] AWS costs cache warmed") + except Exception as e: + print(f"[ci-metrics] AWS costs warmup failed: {e}") threading.Thread(target=_init, daemon=True, name='metrics-init').start() @@ -101,6 +118,74 @@ def _json(data): return Response(json.dumps(data), mimetype='application/json') +_TEN_DAYS = 10 * 24 * 3600 + + +def _cache_ttl(date_to: str) -> int: + """Return 10-day TTL for historical ranges (date_to < today), else 5 min.""" + try: + if datetime.strptime(date_to, '%Y-%m-%d').date() < datetime.now().date(): + return _TEN_DAYS + except ValueError: + pass + return 300 + + +# ---- Author mapping: git display name → GitHub username ---- + +_author_map = {} +_author_map_ts = 0 + + +def _get_author_map() -> dict: + """Build git display name → GitHub username mapping from ci_runs + pr_authors.""" + global _author_map, _author_map_ts + now = time.time() + if now - 
_author_map_ts < 3600 and _author_map: + return _author_map + rows = db.query(''' + SELECT cr.author as git_name, pa.author as github_user, COUNT(*) as c + FROM ci_runs cr + JOIN pr_authors pa ON cr.pr_number = pa.pr_number + WHERE cr.author IS NOT NULL AND cr.author != '' + AND pa.author IS NOT NULL AND pa.author != '' + GROUP BY cr.author, pa.author + ''') + name_to_gh = {} + for row in rows: + gn = row['git_name'] + gh = row['github_user'] + if gn not in name_to_gh: + name_to_gh[gn] = {} + name_to_gh[gn][gh] = name_to_gh[gn].get(gh, 0) + row['c'] + result = {} + for gn, gh_counts in name_to_gh.items(): + best = max(gh_counts, key=gh_counts.get) + result[gn] = best + result[best] = best # identity mapping for usernames used as commit_author + _author_map = result + _author_map_ts = now + return result + + +def _normalize_authors(authors_str: str) -> str: + """Normalize comma-separated git names to deduplicated GitHub usernames.""" + if not authors_str: + return '' + amap = _get_author_map() + seen = set() + result = [] + for name in authors_str.split(','): + name = name.strip() + if not name: + continue + gh = amap.get(name, name) + if gh not in seen: + seen.add(gh) + result.append(gh) + return ','.join(result) + + # ---- Namespace billing ---- @app.route('/namespace-billing') @@ -166,7 +251,7 @@ def api_ci_runs(): ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) if date_from else None ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) if date_to else None - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) if status_filter: runs = [run for run in runs if run.get('status') == status_filter] @@ -185,7 +270,7 @@ def api_ci_runs(): @auth.login_required def api_ci_stats(): ts_from = int((datetime.now() - timedelta(days=7)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from) + runs = metrics.get_ci_runs(ts_from) total = len(runs) passed = sum(1 for run 
in runs if run.get('status') == 'PASSED') @@ -233,6 +318,7 @@ def api_costs_overview(): buckets[key]['aws_total'] += entry.get('aws_total', 0) buckets[key]['gcp_total'] += entry.get('gcp_total', 0) result['by_date'] = sorted(buckets.values(), key=lambda x: x['date']) + result['period'] = {'from': date_from, 'to': date_to} return _json(result) @@ -287,7 +373,7 @@ def api_costs_attribution(): ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) runs_with_cost = [run for run in runs if run.get('cost_usd') is not None] # Enrich merge queue runs with PR author from GitHub @@ -311,6 +397,9 @@ def api_costs_attribution(): prn = info['pr_number'] if prn and int(prn) in pr_authors: author = pr_authors[int(prn)]['author'] + # Attribute nightly / release runs to a special 'release' actor + if info['type'] in ('nightly', 'releases'): + author = 'release' inst_type = run.get('instance_type', 'unknown') vcpus = run.get('instance_vcpus') @@ -383,14 +472,17 @@ def api_costs_attribution(): instances.sort(key=lambda x: -(x['cost_usd'] or 0)) all_types = sorted(by_type.keys()) + # Pre-compute runs-per-date to avoid O(dates × instances) + runs_per_date = {} + for inst in instances: + runs_per_date[inst['date']] = runs_per_date.get(inst['date'], 0) + 1 by_date_list = [] for date in sorted(by_date_type): - entry = {'date': date, 'total': 0, 'runs': 0} + entry = {'date': date, 'total': 0, 'runs': runs_per_date.get(date, 0)} for rt in all_types: entry[rt] = round(by_date_type[date].get(rt, 0), 2) entry['total'] += by_date_type[date].get(rt, 0) entry['total'] = round(entry['total'], 2) - entry['runs'] = sum(1 for inst in instances if inst['date'] == date) by_date_list.append(entry) by_date_list = _aggregate_dates(by_date_list, granularity, @@ -405,6 +497,7 @@ def 
api_costs_attribution(): 'by_date': by_date_list, 'run_types': all_types, 'instances': instances[:500], + 'period': {'from': date_from, 'to': date_to}, 'totals': {'aws': round(total_aws, 2), 'gcp': round(gcp_total, 2), 'gcp_unattributed': round(gcp_total, 2), 'combined': round(total_aws + gcp_total, 2)}, @@ -421,7 +514,7 @@ def api_costs_runners(): ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) runs_with_cost = [run for run in runs if run.get('cost_usd') is not None] if dashboard: runs_with_cost = [run for run in runs_with_cost if run.get('dashboard') == dashboard] @@ -475,6 +568,7 @@ def api_costs_runners(): 'by_date': by_date, 'by_instance_type': by_instance, 'by_dashboard': by_dashboard, + 'period': {'from': date_from, 'to': date_to}, 'summary': { 'total_cost': round(total_cost, 2), 'spot_pct': round(100.0 * spot_cost / max(total_cost, 0.01), 1), @@ -493,13 +587,18 @@ def api_ci_performance(): date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) dashboard = request.args.get('dashboard', '') granularity = request.args.get('granularity', 'daily') + _ck = f'perf:{date_from}:{date_to}:{dashboard}:{granularity}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = metrics.get_ci_runs(ts_from, ts_to) runs = [run for run in runs if run.get('status') in ('PASSED', 'FAILED')] if dashboard: runs = [run for run in runs if run.get('dashboard') == dashboard] + _t1 = time.perf_counter() by_date_map = {} for run in runs: @@ -519,6 +618,7 @@ def api_ci_performance(): by_date = [] for date in 
sorted(by_date_map): d = by_date_map[date] + durs = sorted(d['durations']) by_date.append({ 'date': date, 'total': d['total'], @@ -526,75 +626,107 @@ def api_ci_performance(): 'failed': d['failed'], 'pass_rate': round(100.0 * d['passed'] / max(d['total'], 1), 1), 'failure_rate': round(100.0 * d['failed'] / max(d['total'], 1), 1), - 'avg_duration_mins': round(sum(d['durations']) / len(d['durations']), 1) if d['durations'] else None, + 'avg_duration_mins': round(sum(durs) / len(durs), 1) if durs else None, + 'p50_duration_mins': round(durs[len(durs) // 2], 1) if durs else None, + 'p95_duration_mins': round(durs[int(len(durs) * 0.95)], 1) if durs else None, + 'max_duration_mins': round(max(durs), 1) if durs else None, }) + _t2 = time.perf_counter() + # Merge test outcome counts from test_daily_stats before aggregation + ds_conditions = ['date >= ?', 'date <= ?'] + ds_params = [date_from, date_to] + if dashboard: + ds_conditions.append('dashboard = ?') + ds_params.append(dashboard) + ds_where = 'WHERE ' + ' AND '.join(ds_conditions) + + daily_test_counts = db.query(f''' + SELECT date, SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked + FROM test_daily_stats {ds_where} + GROUP BY date + ''', ds_params) + daily_test_map = {r['date']: r for r in daily_test_counts} + for d in by_date: + tc = daily_test_map.get(d['date'], {}) + d['flake_count'] = tc.get('flaked', 0) or 0 + d['test_failure_count'] = tc.get('failed', 0) or 0 + d['test_success_count'] = tc.get('passed', 0) or 0 + by_date = _aggregate_dates(by_date, granularity, - sum_fields=['total', 'passed', 'failed'], - avg_fields=['avg_duration_mins']) + sum_fields=['total', 'passed', 'failed', + 'flake_count', 'test_failure_count', 'test_success_count'], + avg_fields=['avg_duration_mins', 'p50_duration_mins', + 'p95_duration_mins', 'max_duration_mins']) for d in by_date: d['pass_rate'] = round(100.0 * d['passed'] / max(d['total'], 1), 1) d['failure_rate'] = round(100.0 * d['failed'] / max(d['total'], 1), 
1) - # Daily flake/failure counts from test_events - if dashboard: - flake_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'flaked' AND dashboard = ? - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (dashboard, date_from, date_to + 'T23:59:59')) - fail_test_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'failed' AND dashboard = ? - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (dashboard, date_from, date_to + 'T23:59:59')) - else: - flake_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'flaked' - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (date_from, date_to + 'T23:59:59')) - fail_test_daily = db.query(''' - SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count - FROM test_events WHERE status = 'failed' - AND timestamp >= ? AND timestamp < ? - GROUP BY substr(timestamp, 1, 10) - ''', (date_from, date_to + 'T23:59:59')) - flake_daily_map = {r['date']: r['count'] for r in flake_daily} - fail_test_daily_map = {r['date']: r['count'] for r in fail_test_daily} - for d in by_date: - d['flake_count'] = flake_daily_map.get(d['date'], 0) - d['test_failure_count'] = fail_test_daily_map.get(d['date'], 0) + # Duration by dashboard (pipeline) — from pre-aggregated ci_run_daily_stats + dbd_rows = db.query(''' + SELECT date, dashboard, run_count, passed, failed, + sum_duration, min_duration, max_duration, p50_duration, p95_duration + FROM ci_run_daily_stats + WHERE date >= ? AND date <= ? 
+ ORDER BY date + ''', (date_from, date_to)) + + dbd_map = {} # {dashboard: [{date, avg_duration_mins, ...}]} + for r in dbd_rows: + dbd_map.setdefault(r['dashboard'], []).append({ + 'date': r['date'], + 'avg_duration_mins': round(r['sum_duration'] / max(r['run_count'], 1), 1), + 'total_duration_mins': round(r['sum_duration'], 1), + 'p50_duration_mins': r['p50_duration'], + 'p95_duration_mins': r['p95_duration'], + 'count': r['run_count'], + }) + + duration_by_dashboard = {} + for db_name, entries in dbd_map.items(): + duration_by_dashboard[db_name] = _aggregate_dates( + entries, granularity, + sum_fields=['count', 'total_duration_mins'], + avg_fields=['avg_duration_mins', 'p50_duration_mins', 'p95_duration_mins']) - # Top flakes/failures + _t3 = time.perf_counter() + # Top flakes/failures (with affected authors — filter out empty/NULL) + _author_concat = "GROUP_CONCAT(DISTINCT CASE WHEN commit_author IS NOT NULL AND commit_author != '' THEN commit_author END)" if dashboard: - top_flakes = db.query(''' - SELECT test_cmd, COUNT(*) as count, ref_name + top_flakes = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='flaked' AND dashboard = ? AND timestamp >= ? AND timestamp <= ? - GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (dashboard, date_from, date_to + 'T23:59:59')) - top_failures = db.query(''' - SELECT test_cmd, COUNT(*) as count + top_failures = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='failed' AND dashboard = ? AND timestamp >= ? AND timestamp <= ? 
- GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (dashboard, date_from, date_to + 'T23:59:59')) else: - top_flakes = db.query(''' - SELECT test_cmd, COUNT(*) as count, ref_name + top_flakes = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='flaked' AND timestamp >= ? AND timestamp <= ? - GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (date_from, date_to + 'T23:59:59')) - top_failures = db.query(''' - SELECT test_cmd, COUNT(*) as count + top_failures = db.query(f''' + SELECT test_cmd, COUNT(*) as count, dashboard, + {_author_concat} as authors FROM test_events WHERE status='failed' AND timestamp >= ? AND timestamp <= ? - GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + GROUP BY test_cmd ORDER BY count DESC LIMIT 20 ''', (date_from, date_to + 'T23:59:59')) + # Normalize git display names → GitHub usernames + for row in top_flakes: + row['authors'] = _normalize_authors(row.get('authors', '')) + for row in top_failures: + row['authors'] = _normalize_authors(row.get('authors', '')) + # Summary total = len(runs) passed = sum(1 for run in runs if run.get('status') == 'PASSED') @@ -606,38 +738,24 @@ def api_ci_performance(): if complete and ts: durations.append((complete - ts) / 60000.0) - if dashboard: - flake_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='flaked' AND dashboard = ? - AND timestamp >= ? AND timestamp <= ? - ''', (dashboard, date_from, date_to + 'T23:59:59')) - total_tests = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status IN ('failed','flaked') AND dashboard = ? - AND timestamp >= ? AND timestamp <= ? - ''', (dashboard, date_from, date_to + 'T23:59:59')) - total_failures_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='failed' AND dashboard = ? - AND timestamp >= ? AND timestamp <= ? 
- ''', (dashboard, date_from, date_to + 'T23:59:59')) - else: - flake_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='flaked' AND timestamp >= ? AND timestamp <= ? - ''', (date_from, date_to + 'T23:59:59')) - total_tests = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status IN ('failed','flaked') AND timestamp >= ? AND timestamp <= ? - ''', (date_from, date_to + 'T23:59:59')) - total_failures_count = db.query(''' - SELECT COUNT(*) as c FROM test_events WHERE status='failed' AND timestamp >= ? AND timestamp <= ? - ''', (date_from, date_to + 'T23:59:59')) - - fc = flake_count[0]['c'] if flake_count else 0 - tc = total_tests[0]['c'] if total_tests else 0 - tfc = total_failures_count[0]['c'] if total_failures_count else 0 - - return _json({ + # Test outcome summary from test_daily_stats + ds_summary = db.query(f''' + SELECT SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked + FROM test_daily_stats {ds_where} + ''', ds_params) + ds_s = ds_summary[0] if ds_summary else {} + fc = ds_s.get('flaked', 0) or 0 + tfc = ds_s.get('failed', 0) or 0 + tpc = ds_s.get('passed', 0) or 0 + tc = fc + tfc + tpc + + _t4 = time.perf_counter() + _result = { 'by_date': by_date, + 'duration_by_dashboard': duration_by_dashboard, 'top_flakes': top_flakes, 'top_failures': top_failures, + 'period': {'from': date_from, 'to': date_to}, 'summary': { 'total_runs': total, 'pass_rate': round(100.0 * passed / max(total, 1), 1), @@ -646,8 +764,12 @@ def api_ci_performance(): 'flake_rate': round(100.0 * fc / max(tc, 1), 1) if tc else 0, 'total_flakes': fc, 'total_test_failures': tfc, + 'total_test_successes': tpc, }, - }) + } + print(f"[perf] ci_performance {date_from}..{date_to} | get_ci_runs={_t1-_t0:.3f}s db_queries={_t2-_t1:.3f}s agg={_t3-_t2:.3f}s top_flakes={_t4-_t3:.3f}s total={_t4-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) # ---- GitHub integration ---- @@ -682,10 +804,19 @@ def 
api_pr_metrics(): date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) author = request.args.get('author', '') + _ck = f'pr_metrics:{date_from}:{date_to}:{author}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) - ci_runs = metrics.get_ci_runs(r, ts_from, ts_to) - return _json(github_data.get_pr_metrics(date_from, date_to, author, ci_runs)) + ci_runs = metrics.get_ci_runs(ts_from, ts_to) + _t1 = time.perf_counter() + _result = github_data.get_pr_metrics(date_from, date_to, author, ci_runs) + _t2 = time.perf_counter() + print(f"[perf] pr_metrics {date_from}..{date_to} | get_ci_runs={_t1-_t0:.3f}s get_pr_metrics={_t2-_t1:.3f}s total={_t2-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) @app.route('/api/merge-queue/stats') @@ -693,7 +824,32 @@ def api_pr_metrics(): def api_merge_queue_stats(): date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) - return _json(github_data.get_merge_queue_stats(date_from, date_to)) + _ck = f'mq_stats:{date_from}:{date_to}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() + _result = github_data.get_merge_queue_stats(date_from, date_to) + _t1 = time.perf_counter() + print(f"[perf] merge_queue_stats {date_from}..{date_to} | get_merge_queue_stats={_t1-_t0:.3f}s total={_t1-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) + + +@app.route('/api/test-history/<test_hash>') +@auth.login_required +def api_test_history(test_hash): + """Test event history by hash — SQLite backing for Redis history_<hash> 
lists.""" + branch = request.args.get('branch', '') + limit = min(int(request.args.get('limit', 1000)), 5000) + rows = metrics.get_test_history(test_hash, branch, limit) + return _json(rows) + + +@app.route('/api/ci/runs/pr/<int:pr_number>') +@auth.login_required +def api_ci_runs_for_pr(pr_number): + limit = min(int(request.args.get('limit', 100)), 500) + return _json(metrics.get_ci_runs_for_pr(pr_number, limit)) @app.route('/api/ci/flakes-by-command') @@ -702,8 +858,38 @@ def api_flakes_by_command(): date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) dashboard = request.args.get('dashboard', '') + _ck = f'flakes:{date_from}:{date_to}:{dashboard}' + _t0 = time.perf_counter() metrics.sync_failed_tests_to_sqlite(r) - return _json(metrics.get_flakes_by_command(date_from, date_to, dashboard)) + _t1 = time.perf_counter() + if cached := db.cache_get(_ck): + return _json(cached) + _result = metrics.get_flakes_by_command(date_from, date_to, dashboard) + _t2 = time.perf_counter() + print(f"[perf] flakes_by_command {date_from}..{date_to} | sync={_t1-_t0:.3f}s get_flakes={_t2-_t1:.3f}s total={_t2-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) + + +# ---- CI Phase timing ---- + +@app.route('/api/ci/phases') +@auth.login_required +def api_ci_phases(): + """CI phase timing breakdown: avg time per phase, by date, and per run.""" + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + dashboard = request.args.get('dashboard', '') + run_id = request.args.get('run_id', '') + _ck = f'phases:{date_from}:{date_to}:{dashboard}:{run_id}' + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() + _result = metrics.get_phases(date_from, date_to, dashboard, run_id) + _t1 = time.perf_counter() + 
print(f"[perf] ci_phases {date_from}..{date_to} | get_phases={_t1-_t0:.3f}s total={_t1-_t0:.3f}s", flush=True) + db.cache_set(_ck, _result, _cache_ttl(date_to)) + return _json(_result) # ---- Test timings ---- @@ -717,96 +903,157 @@ def api_test_timings(): dashboard = request.args.get('dashboard', '') status = request.args.get('status', '') # filter to specific status test_cmd = request.args.get('test_cmd', '') # filter to specific test - - conditions = ['duration_secs IS NOT NULL', 'duration_secs > 0', - 'timestamp >= ?', "timestamp < ? || 'T23:59:59'"] - params = [date_from, date_to] - + _ck = f'timings:{date_from}:{date_to}:{dashboard}:{status}:{test_cmd}' + _ttl = _cache_ttl(date_to) + if cached := db.cache_get(_ck): + return _json(cached) + _t0 = time.perf_counter() + + # Base WHERE for test_daily_stats + ds_conds = ['date >= ?', 'date <= ?'] + ds_params = [date_from, date_to] if dashboard: - conditions.append('dashboard = ?') - params.append(dashboard) - if status: - conditions.append('status = ?') - params.append(status) + ds_conds.append('dashboard = ?') + ds_params.append(dashboard) if test_cmd: - conditions.append('test_cmd = ?') - params.append(test_cmd) - - where = 'WHERE ' + ' AND '.join(conditions) - - # Per-test stats - by_test = db.query(f''' - SELECT test_cmd, - COUNT(*) as count, - ROUND(AVG(duration_secs), 1) as avg_secs, - ROUND(MIN(duration_secs), 1) as min_secs, - ROUND(MAX(duration_secs), 1) as max_secs, - SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked, - dashboard - FROM test_events {where} - GROUP BY test_cmd - ORDER BY count DESC - LIMIT 200 - ''', params) - - # Add pass rate - for row in by_test: - total = row['passed'] + row['failed'] + row['flaked'] - row['pass_rate'] = round(100.0 * row['passed'] / max(total, 1), 1) - row['total_time_secs'] = round(row['avg_secs'] * row['count'], 0) - - # Daily 
time series (aggregate across all tests or filtered test) - by_date = db.query(f''' - SELECT substr(timestamp, 1, 10) as date, - COUNT(*) as count, - ROUND(AVG(duration_secs), 1) as avg_secs, - ROUND(MAX(duration_secs), 1) as max_secs, - SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked - FROM test_events {where} - GROUP BY substr(timestamp, 1, 10) - ORDER BY date - ''', params) - - # Summary - summary_rows = db.query(f''' - SELECT COUNT(*) as count, - ROUND(AVG(duration_secs), 1) as avg_secs, - ROUND(MAX(duration_secs), 1) as max_secs, - SUM(duration_secs) as total_secs, - SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked - FROM test_events {where} - ''', params) - s = summary_rows[0] if summary_rows else {} - - # Slowest individual test runs + ds_conds.append('test_cmd = ?') + ds_params.append(test_cmd) + ds_where = 'WHERE ' + ' AND '.join(ds_conds) + + if not status: + # Fast path: push GROUP BY into SQL — returns N_tests + N_dates rows, not N_tests*N_dates rows + by_test_rows = db.query(f''' + SELECT test_cmd, MAX(dashboard) as dashboard, + SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked, + SUM(total_secs) as total_secs, SUM(count_timed) as count_timed, + MIN(min_secs) as min_secs, MAX(max_secs) as max_secs + FROM test_daily_stats {ds_where} + GROUP BY test_cmd + ORDER BY SUM(passed)+SUM(failed)+SUM(flaked) DESC LIMIT 500 + ''', ds_params) + _t1 = time.perf_counter() + + by_date_rows = db.query(f''' + SELECT date, + SUM(passed) as passed, SUM(failed) as failed, SUM(flaked) as flaked, + SUM(total_secs) as total_secs, SUM(count_timed) as count_timed + FROM test_daily_stats {ds_where} + GROUP BY date ORDER BY date + ''', ds_params) + _t2 = time.perf_counter() + + by_test 
= [] + for t in by_test_rows: + count = (t['passed'] or 0) + (t['failed'] or 0) + (t['flaked'] or 0) + avg_secs = round(t['total_secs'] / t['count_timed'], 1) if t['count_timed'] else None + by_test.append({ + 'test_cmd': t['test_cmd'], 'dashboard': t['dashboard'], 'count': count, + 'passed': t['passed'] or 0, 'failed': t['failed'] or 0, 'flaked': t['flaked'] or 0, + 'pass_rate': round(100.0 * (t['passed'] or 0) / max(count, 1), 1), + 'avg_secs': avg_secs, 'min_secs': t['min_secs'], 'max_secs': t['max_secs'], + 'total_time_secs': round(t['total_secs'] or 0, 0), + }) + + by_date = [] + for d in by_date_rows: + count = (d['passed'] or 0) + (d['failed'] or 0) + (d['flaked'] or 0) + avg_secs = round(d['total_secs'] / d['count_timed'], 1) if d['count_timed'] else None + by_date.append({ + 'date': d['date'], 'passed': d['passed'] or 0, + 'failed': d['failed'] or 0, 'flaked': d['flaked'] or 0, + 'count': count, 'avg_secs': avg_secs, + }) + + total_passed = sum(d['passed'] for d in by_date) + total_failed = sum(d['failed'] for d in by_date) + total_flaked = sum(d['flaked'] for d in by_date) + total_secs_all = sum(d['total_secs'] or 0 for d in by_date_rows) + count_timed_all = sum(d['count_timed'] or 0 for d in by_date_rows) + else: + # Slow fallback: status filter requires scanning test_events + te_conds = ['duration_secs IS NOT NULL', 'duration_secs > 0', + 'timestamp >= ?', "timestamp < ? 
|| 'T23:59:59'"] + te_params = [date_from, date_to] + if dashboard: + te_conds.append('dashboard = ?') + te_params.append(dashboard) + te_conds.append('status = ?') + te_params.append(status) + if test_cmd: + te_conds.append('test_cmd = ?') + te_params.append(test_cmd) + te_where = 'WHERE ' + ' AND '.join(te_conds) + + raw = db.query(f''' + SELECT test_cmd, dashboard, + COUNT(*) as count, + ROUND(AVG(duration_secs),1) as avg_secs, + ROUND(MIN(duration_secs),1) as min_secs, + ROUND(MAX(duration_secs),1) as max_secs, + SUM(duration_secs) as total_secs, + substr(timestamp,1,10) as date + FROM test_events {te_where} + GROUP BY test_cmd + ORDER BY count DESC LIMIT 200 + ''', te_params) + _t1 = time.perf_counter() + by_test = [dict(r, pass_rate=0, passed=0, failed=r['count'] if status=='failed' else 0, + flaked=r['count'] if status=='flaked' else 0, + total_time_secs=round(r['total_secs'] or 0, 0)) for r in raw] + + by_date_raw = db.query(f''' + SELECT substr(timestamp,1,10) as date, COUNT(*) as count + FROM test_events {te_where} + GROUP BY substr(timestamp,1,10) ORDER BY date + ''', te_params) + by_date = [{'date': r['date'], 'count': r['count'], 'passed': 0, + 'failed': r['count'] if status=='failed' else 0, + 'flaked': r['count'] if status=='flaked' else 0} for r in by_date_raw] + + total_passed = 0 + total_failed = sum(r['count'] for r in by_date) if status == 'failed' else 0 + total_flaked = sum(r['count'] for r in by_date) if status == 'flaked' else 0 + total_secs_all = sum(r.get('total_secs') or 0 for r in raw) + count_timed_all = sum(r['count'] for r in raw) + _t2 = time.perf_counter() + + # Slowest individual runs — uses idx_test_events_duration index + sl_conds = ['duration_secs IS NOT NULL', 'duration_secs > 0', + 'timestamp >= ?', "timestamp <= ? 
|| 'T23:59:59'"] + sl_params = [date_from, date_to] + if dashboard: + sl_conds.append('dashboard = ?') + sl_params.append(dashboard) + if test_cmd: + sl_conds.append('test_cmd = ?') + sl_params.append(test_cmd) + sl_where = 'WHERE ' + ' AND '.join(sl_conds) slowest = db.query(f''' SELECT test_cmd, status, duration_secs, dashboard, - substr(timestamp, 1, 10) as date, commit_author, log_url - FROM test_events {where} - ORDER BY duration_secs DESC - LIMIT 50 - ''', params) - - return _json({ + substr(timestamp,1,10) as date, commit_author, log_url + FROM test_events {sl_where} + ORDER BY duration_secs DESC LIMIT 50 + ''', sl_params) + _t3 = time.perf_counter() + + print(f"[perf] test_timings {date_from}..{date_to} | by_test={_t1-_t0:.3f}s by_date={_t2-_t1:.3f}s slowest={_t3-_t2:.3f}s total={_t3-_t0:.3f}s", flush=True) + _result = { 'by_test': by_test, 'by_date': by_date, 'slowest': slowest, + 'period': {'from': date_from, 'to': date_to}, 'summary': { - 'total_runs': s.get('count', 0), - 'avg_duration_secs': s.get('avg_secs'), - 'max_duration_secs': s.get('max_secs'), - 'total_compute_secs': round(s.get('total_secs', 0) or 0, 0), - 'passed': s.get('passed', 0), - 'failed': s.get('failed', 0), - 'flaked': s.get('flaked', 0), + 'total_runs': total_passed + total_failed + total_flaked, + 'avg_duration_secs': round(total_secs_all / count_timed_all, 1) if count_timed_all > 0 else None, + 'max_duration_secs': slowest[0]['duration_secs'] if slowest else None, + 'total_compute_secs': round(total_secs_all, 0), + 'passed': total_passed, + 'failed': total_failed, + 'flaked': total_flaked, }, - }) + } + db.cache_set(_ck, _result, _ttl) + return _json(_result) # ---- Dashboard views ---- @@ -844,5 +1091,59 @@ def test_timings(): return "Dashboard not found", 404 +@app.route('/ci-health-report') +@auth.login_required +def ci_health_report(): + path = Path(__file__).parent / 'views' / 'ci-health-report.html' + if path.exists(): + return path.read_text() + return "Report not found", 
404 + + +@app.route('/commits') +@auth.login_required +def commits_page(): + path = Path(__file__).parent / 'views' / 'commits.html' + return path.read_text() + + +@app.route('/api/commits') +@auth.login_required +def api_commits(): + branch = request.args.get('branch', 'next') + page = max(1, int(request.args.get('page', 1))) + per_page = min(int(request.args.get('per_page', 50)), 100) + return _json(github_data.get_recent_commits(branch, page, per_page)) + + +@app.route('/flake-prs') +@auth.login_required +def flake_prs(): + path = Path(__file__).parent / 'views' / 'flake-prs.html' + if path.exists(): + return path.read_text() + return "Page not found", 404 + + +@app.route('/api/flake-prs') +@auth.login_required +def api_flake_prs(): + rows = db.query(''' + SELECT pa.pr_number, pa.author, pa.title, pa.branch, + pa.additions, pa.deletions, pa.fetched_at, + MIN(cr.timestamp_ms) as first_seen_ms + FROM pr_authors pa + LEFT JOIN ci_runs cr ON cr.pr_number = pa.pr_number + WHERE ( + pa.title LIKE '%flake%' OR pa.title LIKE '%deflake%' + OR pa.branch LIKE '%flake%' OR pa.branch LIKE '%deflake%' + ) + GROUP BY pa.pr_number + ORDER BY pa.pr_number DESC + LIMIT 200 + ''') + return _json([dict(r) for r in rows]) + + if __name__ == '__main__': app.run(host='0.0.0.0', port=8081) diff --git a/ci3/ci-metrics/billing/aws.py b/ci3/ci-metrics/billing/aws.py index 481393d74ec3..4dc9061b37df 100644 --- a/ci3/ci-metrics/billing/aws.py +++ b/ci3/ci-metrics/billing/aws.py @@ -54,6 +54,8 @@ # Messaging 'Amazon Simple Notification Service': 'sns', 'Amazon Simple Queue Service': 'sqs', + # Savings Plans / Reserved Instances + 'Savings Plans for AWS Compute usage': 'savings_plans', # Other 'Tax': 'tax', 'AWS Support (Business)': 'support', @@ -63,6 +65,16 @@ import re +# One-time contract payments: annual Savings Plan upfronts and monthly Reserved Instance charges. +# These appear as large single-day spikes but are not operational spend. 
+_ONE_TIME_CATEGORIES = frozenset({ + 'savings_plan_1yr_annual', + 'savings_plan_3yr_annual', + 'savings_plan_1yr_annual_partial', + 'savings_plan_3yr_annual_partial', + 'reserved_instance_monthly', +}) + _cache = {'rows': [], 'ts': 0} _cache_lock = threading.Lock() _detail_cache = {'rows': [], 'ts': 0} @@ -152,7 +164,10 @@ def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]: TimePeriod={'Start': date_from, 'End': date_to}, Granularity='DAILY', Metrics=['UnblendedCost'], - GroupBy=[{'Type': 'DIMENSION', 'Key': 'SERVICE'}], + GroupBy=[ + {'Type': 'DIMENSION', 'Key': 'SERVICE'}, + {'Type': 'DIMENSION', 'Key': 'USAGE_TYPE'}, + ], ) if next_token: kwargs['NextPageToken'] = next_token @@ -163,12 +178,26 @@ def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]: date = result['TimePeriod']['Start'] for group in result['Groups']: service = group['Keys'][0] + usage_type = group['Keys'][1] if len(group['Keys']) > 1 else '' amount = float(group['Metrics']['UnblendedCost']['Amount']) if amount == 0: continue category = SERVICE_CATEGORY_MAP.get(service, 'other') + # Savings plans: ComputeSP:1yrAllUpfront, ComputeSP:3yrNoUpfront, etc. 
+ if category == 'savings_plans': + m = re.match(r'ComputeSP:(\d+yr)(\w+)', usage_type) + if m: + term = m.group(1) + payment = m.group(2) + if payment == 'NoUpfront': + category = f'savings_plan_{term}_monthly' + elif 'Upfront' in payment: + category = f'savings_plan_{term}_annual' + # EC2 reserved instances: HeavyUsage: billed monthly on 1st + elif category == 'ec2' and 'HeavyUsage:' in usage_type: + category = 'reserved_instance_monthly' if category == 'other': - print(f"[rk_aws_costs] unmapped service: {service!r} (${amount:.2f})") + print(f"[rk_aws_costs] unmapped service: {service!r} / {usage_type!r} (${amount:.2f})") rows.append({ 'date': date, 'service': service, @@ -322,26 +351,32 @@ def get_costs_overview(date_from: str, date_to: str) -> dict: for r in aws_rows: d = r['date'] if d not in by_date: - by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0} + by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0, 'aws_one_time': 0} cat = r['category'] by_date[d]['aws'][cat] = by_date[d]['aws'].get(cat, 0) + r['amount_usd'] by_date[d]['aws_total'] += r['amount_usd'] + if cat in _ONE_TIME_CATEGORIES: + by_date[d]['aws_one_time'] += r['amount_usd'] for d, cats in gcp_by_date.items(): if d not in by_date: - by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0} + by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0, 'aws_one_time': 0} by_date[d]['gcp'] = cats by_date[d]['gcp_total'] = sum(cats.values()) sorted_dates = sorted(by_date.values(), key=lambda x: x['date']) aws_total = sum(d['aws_total'] for d in sorted_dates) + aws_one_time = sum(d['aws_one_time'] for d in sorted_dates) gcp_total = sum(d['gcp_total'] for d in sorted_dates) return { 'by_date': sorted_dates, 'totals': { 'aws': round(aws_total, 2), + 'aws_operational': round(aws_total - aws_one_time, 2), + 'aws_one_time': round(aws_one_time, 2), 'gcp': round(gcp_total, 2), 'combined': round(aws_total + 
gcp_total, 2), + 'combined_operational': round(aws_total - aws_one_time + gcp_total, 2), } } diff --git a/ci3/ci-metrics/db.py b/ci3/ci-metrics/db.py index 93e970fe3a56..e19380902825 100644 --- a/ci3/ci-metrics/db.py +++ b/ci3/ci-metrics/db.py @@ -3,11 +3,14 @@ Stores test events (from Redis pub/sub) and merge queue daily stats (backfilled from GitHub API). """ +import json import os import sqlite3 import threading +import time -_DB_PATH = os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db') +_DB_PATH = os.getenv('METRICS_DB_PATH', + os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db')) _local = threading.local() SCHEMA = """ @@ -34,6 +37,7 @@ CREATE INDEX IF NOT EXISTS idx_test_events_ts ON test_events(timestamp); CREATE INDEX IF NOT EXISTS idx_test_events_cmd ON test_events(test_cmd); CREATE INDEX IF NOT EXISTS idx_test_events_dashboard ON test_events(dashboard); +CREATE INDEX IF NOT EXISTS idx_test_events_status_ts ON test_events(status, timestamp); CREATE TABLE IF NOT EXISTS merge_queue_daily ( date TEXT PRIMARY KEY, @@ -64,6 +68,84 @@ CREATE INDEX IF NOT EXISTS idx_ci_runs_ts ON ci_runs(timestamp_ms); CREATE INDEX IF NOT EXISTS idx_ci_runs_name ON ci_runs(name); CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard); + +CREATE TABLE IF NOT EXISTS test_daily_stats ( + date TEXT NOT NULL, + test_cmd TEXT NOT NULL, + dashboard TEXT NOT NULL DEFAULT '', + passed INTEGER NOT NULL DEFAULT 0, + failed INTEGER NOT NULL DEFAULT 0, + flaked INTEGER NOT NULL DEFAULT 0, + total_secs REAL NOT NULL DEFAULT 0, + count_timed INTEGER NOT NULL DEFAULT 0, + min_secs REAL, + max_secs REAL, + PRIMARY KEY (date, test_cmd, dashboard) +); +CREATE INDEX IF NOT EXISTS idx_tds_date ON test_daily_stats(date); +CREATE INDEX IF NOT EXISTS idx_tds_dashboard ON test_daily_stats(dashboard); + +CREATE TABLE IF NOT EXISTS merge_queue_snapshots ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL, + depth INTEGER NOT NULL, + 
entries_json TEXT +); +CREATE INDEX IF NOT EXISTS idx_mqs_ts ON merge_queue_snapshots(timestamp); + +CREATE TABLE IF NOT EXISTS ci_run_daily_stats ( + date TEXT NOT NULL, + dashboard TEXT NOT NULL, + run_count INTEGER NOT NULL DEFAULT 0, + passed INTEGER NOT NULL DEFAULT 0, + failed INTEGER NOT NULL DEFAULT 0, + sum_duration REAL NOT NULL DEFAULT 0, + min_duration REAL, + max_duration REAL, + p50_duration REAL, + p95_duration REAL, + PRIMARY KEY (date, dashboard) +); +CREATE INDEX IF NOT EXISTS idx_crds_date ON ci_run_daily_stats(date); + +CREATE TABLE IF NOT EXISTS ci_phases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + phase TEXT NOT NULL, + duration_secs REAL NOT NULL, + exit_code INTEGER, + run_id TEXT, + job_id TEXT, + dashboard TEXT NOT NULL DEFAULT '', + ref_name TEXT, + commit_hash TEXT, + timestamp TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_ci_phases_run ON ci_phases(run_id); +CREATE INDEX IF NOT EXISTS idx_ci_phases_ts ON ci_phases(timestamp); +CREATE INDEX IF NOT EXISTS idx_ci_phases_phase ON ci_phases(phase); + +CREATE TABLE IF NOT EXISTS pr_authors ( + pr_number INTEGER PRIMARY KEY, + author TEXT NOT NULL, + title TEXT NOT NULL DEFAULT '', + branch TEXT NOT NULL DEFAULT '', + additions INTEGER DEFAULT 0, + deletions INTEGER DEFAULT 0, + fetched_at TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS api_cache ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + created_at REAL NOT NULL, + ttl_secs INTEGER NOT NULL DEFAULT 300 +); + +CREATE TABLE IF NOT EXISTS pr_cache ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at REAL NOT NULL +); """ @@ -73,6 +155,16 @@ "ALTER TABLE ci_runs ADD COLUMN job_id TEXT DEFAULT ''", "ALTER TABLE ci_runs ADD COLUMN arch TEXT DEFAULT ''", "CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard)", + "ALTER TABLE test_events ADD COLUMN test_hash TEXT", + "CREATE INDEX IF NOT EXISTS idx_test_events_hash ON test_events(test_hash)", + "ALTER TABLE merge_queue_daily ADD COLUMN avg_depth REAL", + "ALTER 
TABLE merge_queue_daily ADD COLUMN peak_depth INTEGER", + "CREATE INDEX IF NOT EXISTS idx_test_events_duration_ts ON test_events(timestamp) WHERE duration_secs IS NOT NULL AND duration_secs > 0", + "ALTER TABLE test_daily_stats ADD COLUMN total_secs REAL NOT NULL DEFAULT 0", + "ALTER TABLE test_daily_stats ADD COLUMN count_timed INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE test_daily_stats ADD COLUMN min_secs REAL", + "ALTER TABLE test_daily_stats ADD COLUMN max_secs REAL", + "CREATE INDEX IF NOT EXISTS idx_test_events_duration ON test_events(duration_secs DESC) WHERE duration_secs IS NOT NULL AND duration_secs > 0", ] @@ -105,3 +197,31 @@ def execute(sql: str, params=()): conn = get_db() conn.execute(sql, params) conn.commit() + + +def cache_get(key: str): + """Return cached value (parsed JSON) if not expired, else None.""" + rows = query('SELECT value, created_at, ttl_secs FROM api_cache WHERE key = ?', (key,)) + if rows and time.time() - rows[0]['created_at'] < rows[0]['ttl_secs']: + return json.loads(rows[0]['value']) + return None + + +def cache_set(key: str, data, ttl_secs: int = 300) -> None: + """Store data as JSON in the cache with a TTL.""" + execute( + 'INSERT OR REPLACE INTO api_cache (key, value, created_at, ttl_secs) VALUES (?, ?, ?, ?)', + (key, json.dumps(data, default=str), time.time(), ttl_secs), + ) + + +def cache_invalidate_prefix(prefix: str) -> None: + """Delete all cache entries whose key starts with prefix.""" + execute('DELETE FROM api_cache WHERE key LIKE ?', (prefix + '%',)) + + +def cache_cleanup() -> None: + """Remove expired entries.""" + execute( + "DELETE FROM api_cache WHERE created_at + ttl_secs < unixepoch('now')" + ) diff --git a/ci3/ci-metrics/ec2_pricing.py b/ci3/ci-metrics/ec2_pricing.py index ace55ea4f40a..96e0561d0d70 100644 --- a/ci3/ci-metrics/ec2_pricing.py +++ b/ci3/ci-metrics/ec2_pricing.py @@ -16,12 +16,20 @@ # ---- Hardcoded fallback rates (us-east-2, USD/hr) ---- _HARDCODED_RATES = { - ('m6a.48xlarge', True): 8.31, # 
spot - ('m6a.48xlarge', False): 16.56, # on-demand - ('m6a.32xlarge', True): 5.54, - ('m6a.32xlarge', False): 11.04, + ('m6a.xlarge', True): 0.07, # spot + ('m6a.xlarge', False): 0.1728, # on-demand + ('m6a.4xlarge', True): 0.28, + ('m6a.4xlarge', False): 0.6912, + ('m6a.8xlarge', True): 0.55, + ('m6a.8xlarge', False): 1.3824, ('m6a.16xlarge', True): 2.77, ('m6a.16xlarge', False): 5.52, + ('m6a.24xlarge', True): 1.66, + ('m6a.24xlarge', False): 4.1472, + ('m6a.32xlarge', True): 5.54, + ('m6a.32xlarge', False): 11.04, + ('m6a.48xlarge', True): 8.31, + ('m6a.48xlarge', False): 16.56, ('m7a.48xlarge', True): 8.31, ('m7a.48xlarge', False): 16.56, ('m7a.16xlarge', True): 2.77, @@ -145,8 +153,19 @@ def _fetch_all_spot(instance_types: list[str]) -> dict[str, float]: # ---- Cache refresh ---- def _get_known_instance_types() -> list[str]: - """Return the set of instance types we need pricing for.""" - return sorted({itype for itype, _ in _HARDCODED_RATES}) + """Return the set of instance types we need pricing for (hardcoded + from DB).""" + types = {itype for itype, _ in _HARDCODED_RATES} + try: + import db + conn = db.get_db() + rows = conn.execute( + "SELECT DISTINCT instance_type FROM ci_runs " + "WHERE instance_type IS NOT NULL AND instance_type != '' AND instance_type != 'unknown'" + ).fetchall() + types.update(r['instance_type'] for r in rows) + except Exception: + pass + return sorted(types) def _refresh_cache(): diff --git a/ci3/ci-metrics/github_data.py b/ci3/ci-metrics/github_data.py index 8824d187cb81..9c36a708025d 100644 --- a/ci3/ci-metrics/github_data.py +++ b/ci3/ci-metrics/github_data.py @@ -1,15 +1,20 @@ """GitHub API polling with in-memory cache. -Fetches PR lifecycle, deployment runs, branch lag, and merge queue stats via `gh` CLI. +Fetches PR lifecycle, deployment runs, branch lag, and merge queue stats via +the GitHub REST API (using requests + GH_TOKEN env var). Most data cached in memory with TTL. Merge queue stats persisted to SQLite daily. 
""" import json -import subprocess +import os +import requests import threading import time from datetime import datetime, timedelta, timezone +import db as _db + REPO = 'AztecProtocol/aztec-packages' +_GH_API = 'https://api.github.com' BRANCH_PAIRS = [ ('next', 'staging-public'), @@ -25,41 +30,126 @@ _CACHE_TTL = 3600 # 1 hour _pr_cache = {'data': [], 'ts': 0} +_commits_cache: dict = {} # keyed by branch +_commits_lock = threading.Lock() _deploy_cache = {'data': [], 'ts': 0} _lag_cache = {'data': [], 'ts': 0} -_pr_author_cache = {} # {pr_number: {'author': str, 'title': str, 'branch': str}} _pr_lock = threading.Lock() _deploy_lock = threading.Lock() _lag_lock = threading.Lock() -def _gh(args: list[str]) -> str | None: +def _gh_headers() -> dict: + token = os.environ.get('GH_TOKEN') or os.environ.get('GITHUB_TOKEN', '') + h = {'Accept': 'application/vnd.github+json', 'X-GitHub-Api-Version': '2022-11-28'} + if token: + h['Authorization'] = f'Bearer {token}' + return h + + +def _github_get(path: str, paginate: bool = False) -> list | dict | None: + """GET from GitHub REST API. Returns parsed JSON (list or dict). 
+ If paginate=True, follows Link: next headers and merges array results.""" + url = f'{_GH_API}/{path}' if not path.startswith('http') else path + headers = _gh_headers() try: - result = subprocess.run( - ['gh'] + args, - capture_output=True, text=True, timeout=30 - ) - if result.returncode == 0: - return result.stdout.strip() - except (FileNotFoundError, subprocess.TimeoutExpired) as e: - print(f"[rk_github] gh error: {e}") - return None + if not paginate: + resp = requests.get(url, headers=headers, timeout=30) + if resp.status_code != 200: + print(f"[rk_github] API {resp.status_code}: {url}") + return None + return resp.json() + # Paginated: collect all pages + all_items = [] + while url: + resp = requests.get(url, headers=headers, timeout=30) + if resp.status_code != 200: + print(f"[rk_github] API {resp.status_code}: {url}") + break + data = resp.json() + if isinstance(data, list): + all_items.extend(data) + elif isinstance(data, dict): + # For endpoints like /actions/workflows/.../runs that wrap in an object + all_items.append(data) + # Follow Link: ; rel="next" + link = resp.headers.get('Link', '') + url = None + for part in link.split(','): + if 'rel="next"' in part: + url = part.split('<')[1].split('>')[0] + return all_items + except Exception as e: + print(f"[rk_github] API error: {e}") + return None + + +def _github_graphql(query: str, variables: dict = None) -> dict | None: + """Execute a GitHub GraphQL query.""" + headers = _gh_headers() + try: + resp = requests.post(f'{_GH_API}/graphql', headers=headers, + json={'query': query, 'variables': variables or {}}, + timeout=30) + if resp.status_code != 200: + print(f"[rk_github] GraphQL {resp.status_code}") + return None + data = resp.json() + if 'errors' in data: + print(f"[rk_github] GraphQL errors: {data['errors']}") + return data.get('data') + except Exception as e: + print(f"[rk_github] GraphQL error: {e}") + return None # ---- PR lifecycle ---- +_PR_GQL = ''' +query($owner: String!, $repo: String!, 
$cursor: String) { + repository(owner: $owner, name: $repo) { + pullRequests(states: MERGED, first: 100, after: $cursor, orderBy: {field: UPDATED_AT, direction: DESC}) { + pageInfo { hasNextPage endCursor } + nodes { + number + author { login } + title + createdAt + mergedAt + closedAt + baseRefName + headRefName + additions + deletions + changedFiles + isDraft + reviewDecision + labels(first: 20) { nodes { name } } + } + } + } +}''' + + def _fetch_and_process_prs() -> list[dict]: - out = _gh([ - 'pr', 'list', '--repo', REPO, '--state', 'merged', - '--limit', '500', - '--json', 'number,author,title,createdAt,mergedAt,closedAt,baseRefName,' - 'headRefName,additions,deletions,changedFiles,isDraft,reviewDecision,labels' - ]) - if not out: - return [] - try: - prs = json.loads(out) - except json.JSONDecodeError: + owner, repo = REPO.split('/') + prs = [] + cursor = None + for _ in range(5): # max 5 pages = 500 PRs + data = _github_graphql(_PR_GQL, {'owner': owner, 'repo': repo, 'cursor': cursor}) + if not data: + break + pr_data = data.get('repository', {}).get('pullRequests', {}) + nodes = pr_data.get('nodes', []) + for node in nodes: + node['author'] = (node.get('author') or {}).get('login', 'unknown') + node['labels'] = [l['name'] for l in (node.get('labels') or {}).get('nodes', [])] + prs.extend(nodes) + page_info = pr_data.get('pageInfo', {}) + if not page_info.get('hasNextPage'): + break + cursor = page_info.get('endCursor') + if not prs: return [] for pr in prs: @@ -87,9 +177,20 @@ def _fetch_and_process_prs() -> list[dict]: def _ensure_prs(): + import db as _db now = time.time() if _pr_cache['data'] and now - _pr_cache['ts'] < _CACHE_TTL: return + # Try SQLite cache before hitting GitHub + if not _pr_cache['data']: + try: + rows = _db.query("SELECT value, updated_at FROM pr_cache WHERE key = 'prs'") + if rows and now - rows[0]['updated_at'] < _CACHE_TTL: + _pr_cache['data'] = json.loads(rows[0]['value']) + _pr_cache['ts'] = rows[0]['updated_at'] + return + 
except Exception: + pass if not _pr_lock.acquire(blocking=False): return try: @@ -97,6 +198,13 @@ def _ensure_prs(): if prs: _pr_cache['data'] = prs _pr_cache['ts'] = now + try: + _db.execute( + "INSERT OR REPLACE INTO pr_cache (key, value, updated_at) VALUES ('prs', ?, ?)", + (json.dumps(prs, default=str), now), + ) + except Exception: + pass finally: _pr_lock.release() @@ -106,20 +214,14 @@ def _ensure_prs(): def _fetch_all_deploys() -> list[dict]: all_runs = [] for workflow in DEPLOY_WORKFLOWS: - out = _gh([ - 'run', 'list', '--repo', REPO, - '--workflow', workflow, '--limit', '50', - '--json', 'databaseId,status,conclusion,createdAt,updatedAt,headBranch,name' - ]) - if not out: - continue - try: - runs = json.loads(out) - except json.JSONDecodeError: + data = _github_get( + f'repos/{REPO}/actions/workflows/{workflow}/runs?per_page=50&status=completed') + if not data: continue + runs = data.get('workflow_runs', []) for run in runs: - started = run.get('createdAt', '') - completed = run.get('updatedAt') + started = run.get('created_at', '') + completed = run.get('updated_at') duration = None if started and completed: try: @@ -129,9 +231,9 @@ def _fetch_all_deploys() -> list[dict]: except (ValueError, TypeError): pass all_runs.append({ - 'run_id': str(run.get('databaseId', '')), + 'run_id': str(run.get('id', '')), 'workflow_name': workflow.replace('.yml', ''), - 'ref_name': run.get('headBranch', ''), + 'ref_name': run.get('head_branch', ''), 'status': run.get('conclusion', run.get('status', 'unknown')), 'started_at': started, 'completed_at': completed, @@ -162,26 +264,22 @@ def _fetch_branch_lag() -> list[dict]: results = [] today = datetime.now(timezone.utc).date().isoformat() for source, target in BRANCH_PAIRS: - out = _gh([ - 'api', f'repos/{REPO}/compare/{target}...{source}', - '--jq', '.ahead_by' - ]) - if not out: + data = _github_get(f'repos/{REPO}/compare/{target}...{source}') + if not data: continue try: - commits_behind = int(out) + commits_behind = 
int(data.get('ahead_by', 0)) except (ValueError, TypeError): continue days_behind = None - out2 = _gh([ - 'api', f'repos/{REPO}/compare/{target}...{source}', - '--jq', '.commits[0].commit.committer.date' - ]) - if out2: + commits = data.get('commits', []) + if commits: try: - oldest = datetime.fromisoformat(out2.replace('Z', '+00:00')) - days_behind = round((datetime.now(timezone.utc) - oldest).total_seconds() / 86400, 1) + oldest_date = commits[0].get('commit', {}).get('committer', {}).get('date', '') + if oldest_date: + oldest = datetime.fromisoformat(oldest_date.replace('Z', '+00:00')) + days_behind = round((datetime.now(timezone.utc) - oldest).total_seconds() / 86400, 1) except (ValueError, TypeError): pass @@ -291,71 +389,106 @@ def get_branch_lag(date_from: str, date_to: str) -> dict: return {'pairs': pairs} +def _cache_pr_author(pr_number: int, info: dict): + """Write PR author info to SQLite cache.""" + _db.execute(''' + INSERT OR REPLACE INTO pr_authors (pr_number, author, title, branch, additions, deletions, fetched_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (pr_number, info['author'], info.get('title', ''), info.get('branch', ''), + info.get('additions', 0), info.get('deletions', 0), + datetime.now(timezone.utc).isoformat())) + + +def _get_cached_pr_author(pr_number: int) -> dict | None: + """Read PR author info from SQLite cache.""" + rows = _db.query('SELECT * FROM pr_authors WHERE pr_number = ?', (pr_number,)) + if rows: + r = rows[0] + return {'author': r['author'], 'title': r['title'], 'branch': r['branch'], + 'additions': r['additions'], 'deletions': r['deletions']} + return None + + def get_pr_author(pr_number) -> dict | None: - """Look up PR author/title by number. Results are cached permanently (PR data doesn't change).""" + """Look up PR author/title by number. 
Results cached in SQLite.""" pr_number = int(pr_number) if pr_number else None if not pr_number: return None - if pr_number in _pr_author_cache: - return _pr_author_cache[pr_number] - # Check merged PR cache first (already fetched) + # Check SQLite cache + cached = _get_cached_pr_author(pr_number) + if cached: + return cached + + # Check merged PR cache (already fetched in-memory) for pr in _pr_cache.get('data', []): if pr.get('number') == pr_number: info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), 'branch': pr.get('headRefName', ''), 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} - _pr_author_cache[pr_number] = info + _cache_pr_author(pr_number, info) return info - # Fetch from GitHub API - out = _gh(['pr', 'view', str(pr_number), '--repo', REPO, - '--json', 'author,title,headRefName,additions,deletions']) - if out: + # Fetch from GitHub REST API + data = _github_get(f'repos/{REPO}/pulls/{pr_number}') + if data: try: - data = json.loads(out) - author = data.get('author', {}) - if isinstance(author, dict): - author = author.get('login', 'unknown') + author = (data.get('user') or {}).get('login', 'unknown') info = {'author': author, 'title': data.get('title', ''), - 'branch': data.get('headRefName', ''), + 'branch': (data.get('head') or {}).get('ref', ''), 'additions': data.get('additions', 0), 'deletions': data.get('deletions', 0)} - _pr_author_cache[pr_number] = info + _cache_pr_author(pr_number, info) return info - except (json.JSONDecodeError, KeyError): + except (KeyError, TypeError): pass return None def batch_get_pr_authors(pr_numbers: set) -> dict: - """Fetch authors for multiple PR numbers, using cache. Returns {pr_number: info}.""" + """Fetch authors for multiple PR numbers, using SQLite cache. 
Returns {pr_number: info}.""" result = {} - to_fetch = [] - for prn in pr_numbers: - if not prn: - continue - prn = int(prn) - if prn in _pr_author_cache: - result[prn] = _pr_author_cache[prn] - else: - to_fetch.append(prn) - - # Check merged PR cache first - for pr in _pr_cache.get('data', []): - num = pr.get('number') - if num in to_fetch: - info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), - 'branch': pr.get('headRefName', ''), - 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} - _pr_author_cache[num] = info - result[num] = info - to_fetch.remove(num) - - # Fetch remaining individually (with a cap to avoid API abuse) - for prn in to_fetch[:50]: - info = get_pr_author(prn) - if info: - result[prn] = info + # Batch fetch from SQLite cache in a single query + clean = [int(prn) for prn in pr_numbers if prn] + if not clean: + return result + placeholders = ','.join('?' * len(clean)) + cached_rows = _db.query( + f'SELECT * FROM pr_authors WHERE pr_number IN ({placeholders})', clean) + cached_set = set() + for r in cached_rows: + prn = r['pr_number'] + result[prn] = {'author': r['author'], 'title': r['title'], 'branch': r['branch'], + 'additions': r['additions'], 'deletions': r['deletions']} + cached_set.add(prn) + to_fetch = [prn for prn in clean if prn not in cached_set] + + # Check merged PR cache (in-memory) + if to_fetch: + to_fetch_set = set(to_fetch) + for pr in _pr_cache.get('data', []): + num = pr.get('number') + if num in to_fetch_set: + info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), + 'branch': pr.get('headRefName', ''), + 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} + _cache_pr_author(num, info) + result[num] = info + to_fetch_set.discard(num) + to_fetch = list(to_fetch_set) + + # Fetch remaining concurrently (with a cap to avoid API abuse) + if to_fetch: + from concurrent.futures import ThreadPoolExecutor, as_completed + with 
ThreadPoolExecutor(max_workers=10) as pool: + futures = {pool.submit(get_pr_author, prn): prn for prn in to_fetch[:50]} + for fut in as_completed(futures): + prn = futures[fut] + try: + info = fut.result() + if info: + result[prn] = info + except Exception: + pass return result @@ -495,33 +628,29 @@ def _median(vals): def _fetch_merge_queue_runs(date_str: str) -> dict: """Fetch merge_group workflow runs for a single date. Returns daily summary.""" - out = _gh([ - 'api', '--paginate', + pages = _github_get( f'repos/{REPO}/actions/workflows/{CI3_WORKFLOW}/runs' f'?event=merge_group&created={date_str}&per_page=100', - '--jq', '.workflow_runs[] | [.conclusion, .status] | @tsv', - ]) + paginate=True) summary = {'date': date_str, 'total': 0, 'success': 0, 'failure': 0, 'cancelled': 0, 'in_progress': 0} - if not out: + if not pages: return summary - for line in out.strip().split('\n'): - if not line.strip(): - continue - parts = line.split('\t') - conclusion = parts[0] if parts[0] else '' - status = parts[1] if len(parts) > 1 else '' - summary['total'] += 1 - if conclusion == 'success': - summary['success'] += 1 - elif conclusion == 'failure': - summary['failure'] += 1 - elif conclusion == 'cancelled': - summary['cancelled'] += 1 - elif status in ('in_progress', 'queued', 'waiting'): - summary['in_progress'] += 1 - else: - summary['failure'] += 1 # treat unknown conclusions as failures + for page in pages: + for run in (page.get('workflow_runs') or []) if isinstance(page, dict) else []: + conclusion = run.get('conclusion') or '' + status = run.get('status') or '' + summary['total'] += 1 + if conclusion == 'success': + summary['success'] += 1 + elif conclusion == 'failure': + summary['failure'] += 1 + elif conclusion == 'cancelled': + summary['cancelled'] += 1 + elif status in ('in_progress', 'queued', 'waiting'): + summary['in_progress'] += 1 + else: + summary['failure'] += 1 # treat unknown conclusions as failures return summary @@ -597,13 +726,14 @@ def 
_backfill_merge_queue(): def refresh_merge_queue_today(): - """Refresh today's (and yesterday's) merge queue stats. Called periodically.""" + """Refresh recent merge queue stats. Re-fetches the last 7 days to fix any + zero rows written during transient API failures.""" import db conn = db.get_db() - today = datetime.now(timezone.utc).date().isoformat() - yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date().isoformat() + today = datetime.now(timezone.utc).date() - for ds in [yesterday, today]: + for i in range(7): + ds = (today - timedelta(days=i)).isoformat() summary = _fetch_merge_queue_runs(ds) conn.execute( 'INSERT OR REPLACE INTO merge_queue_daily (date, total, success, failure, cancelled, in_progress) ' @@ -613,6 +743,80 @@ def refresh_merge_queue_today(): conn.commit() +_MQ_DEPTH_GQL = ''' +query($owner: String!, $repo: String!, $branch: String!) { + repository(owner: $owner, name: $repo) { + mergeQueue(branch: $branch) { + entries(first: 100) { + totalCount + nodes { position state enqueuedAt pullRequest { number title author { login } } } + } + } + } +}''' + +_MQ_BRANCH = 'next' + + +def poll_merge_queue_depth(): + """Snapshot the current merge queue depth into SQLite.""" + import db + owner, repo = REPO.split('/') + data = _github_graphql(_MQ_DEPTH_GQL, + {'owner': owner, 'repo': repo, 'branch': _MQ_BRANCH}) + if not data: + return + mq = (data.get('repository') or {}).get('mergeQueue') + if mq is None: + return + entries = mq.get('entries', {}) + depth = entries.get('totalCount', 0) + nodes = entries.get('nodes', []) + entries_json = json.dumps([{ + 'position': n.get('position'), + 'state': n.get('state'), + 'pr': (n.get('pullRequest') or {}).get('number'), + 'author': ((n.get('pullRequest') or {}).get('author') or {}).get('login'), + } for n in nodes]) if nodes else None + + now = datetime.now(timezone.utc).isoformat() + db.execute('INSERT INTO merge_queue_snapshots (timestamp, depth, entries_json) VALUES (?, ?, ?)', + (now, depth, 
entries_json)) + + +def _aggregate_depth_stats(): + """Aggregate merge_queue_snapshots into avg/peak depth on merge_queue_daily.""" + import db + conn = db.get_db() + rows = conn.execute(''' + SELECT substr(timestamp, 1, 10) as date, + ROUND(AVG(depth), 1) as avg_depth, + MAX(depth) as peak_depth + FROM merge_queue_snapshots + GROUP BY substr(timestamp, 1, 10) + ''').fetchall() + for row in rows: + conn.execute(''' + UPDATE merge_queue_daily SET avg_depth = ?, peak_depth = ? + WHERE date = ? + ''', (row['avg_depth'], row['peak_depth'], row['date'])) + conn.commit() + + +def start_merge_queue_poller(): + """Start background thread that polls merge queue depth every 5 minutes.""" + def loop(): + while True: + try: + poll_merge_queue_depth() + except Exception as e: + print(f"[rk_github] queue depth poll error: {e}") + time.sleep(300) # 5 minutes + t = threading.Thread(target=loop, daemon=True, name='mq-depth-poller') + t.start() + return t + + _mq_backfill_lock = threading.Lock() _mq_last_refresh = 0 _MQ_REFRESH_TTL = 3600 # refresh today's data every hour @@ -629,6 +833,7 @@ def ensure_merge_queue_data(): try: _backfill_merge_queue() refresh_merge_queue_today() + _aggregate_depth_stats() _mq_last_refresh = now finally: _mq_backfill_lock.release() @@ -646,7 +851,7 @@ def get_merge_queue_stats(date_from: str, date_to: str) -> dict: threading.Thread(target=ensure_merge_queue_data, daemon=True).start() rows = db.query( - 'SELECT date, total, success, failure, cancelled, in_progress ' + 'SELECT date, total, success, failure, cancelled, in_progress, avg_depth, peak_depth ' 'FROM merge_queue_daily WHERE date >= ? AND date <= ? 
ORDER BY date', (date_from, date_to)) @@ -664,3 +869,144 @@ def get_merge_queue_stats(date_from: str, date_to: str) -> dict: 'days': len([r for r in rows if r['total'] > 0]), }, } + + +import re as _re + +_COMMIT_TYPE_RE = _re.compile( + r'^(fix|feat|chore|refactor|docs|style|test|perf|ci|build|revert)(\([^)]+\))?(!)?: ' +) +_PR_NUM_RE = _re.compile(r'\(#(\d+)\)\s*$') +_MERGE_TRAIN_RE = _re.compile(r'merge-train/([^\s]+)') + + +def _parse_commit(raw: dict) -> dict: + """Normalise a GitHub REST commit object into a compact dict.""" + sha = raw.get('sha', '') + msg = raw.get('commit', {}).get('message', '') or '' + subject = msg.split('\n')[0] + c_author = raw.get('commit', {}).get('author', {}) or {} + # Prefer committer login if available (shows GitHub username not git display name) + login = (raw.get('author') or {}).get('login', '') + author = login or c_author.get('name', '') + date = c_author.get('date', '') # ISO-8601 + + # Parse conventional commit type + scope + m = _COMMIT_TYPE_RE.match(subject) + commit_type = m.group(1) if m else 'other' + breaking = bool(m and m.group(3)) + scope_raw = m.group(2) if m else '' + scope = scope_raw[1:-1] if scope_raw else '' # strip parens + + # Extract PR number from "(#NNNNN)" at end of subject + pr_m = _PR_NUM_RE.search(subject) + pr_number = int(pr_m.group(1)) if pr_m else None + clean_subject = _PR_NUM_RE.sub('', subject).rstrip() + + # Detect merge-train commits + mt_m = _MERGE_TRAIN_RE.search(subject) + merge_train = mt_m.group(1) if mt_m else None + is_merge = len(raw.get('parents', [])) > 1 + + return { + 'sha': sha, + 'subject': clean_subject, + 'type': commit_type, + 'scope': scope, + 'breaking': breaking, + 'pr': pr_number, + 'author': author, + 'date': date, + 'merge_train': merge_train, + 'is_merge': is_merge, + 'dirs': None, # populated by caller if Redis cache available + } + + +_pr_dirs_cache: dict = {} # {pr_number: [dirs]} in-memory cache (long TTL) +_pr_dirs_lock = threading.Lock() +_pr_dirs_fetch_queue: 
set = set() +_pr_dirs_worker_started = False + + +def _compute_pr_dirs(pr_number: int) -> list[str]: + """Fetch changed files for a PR and return 2-level path buckets.""" + data = _github_get(f'repos/{REPO}/pulls/{pr_number}/files?per_page=100') + if not data or not isinstance(data, list): + return [] + dirs: set[str] = set() + for f in data: + filename = f.get('filename', '') + if not filename: + continue + parts = filename.split('/') + top = parts[0] + dirs.add(top) + # For yarn-project, include 2nd level for sub-project drill-down + if top == 'yarn-project' and len(parts) > 1: + dirs.add(f'yarn-project/{parts[1]}') + return sorted(dirs) + + +def _pr_dirs_worker(): + """Background worker: drains the fetch queue, caches results.""" + while True: + time.sleep(2) + with _pr_dirs_lock: + if not _pr_dirs_fetch_queue: + continue + pr_number = _pr_dirs_fetch_queue.pop() + try: + dirs = _compute_pr_dirs(pr_number) + with _pr_dirs_lock: + _pr_dirs_cache[pr_number] = dirs + except Exception as e: + print(f'[github_data] pr_dirs fetch error for #{pr_number}: {e}') + + +def start_pr_dirs_worker(): + """Start the background PR dirs fetcher (call once at startup).""" + global _pr_dirs_worker_started + if _pr_dirs_worker_started: + return + _pr_dirs_worker_started = True + t = threading.Thread(target=_pr_dirs_worker, daemon=True, name='pr-dirs-fetcher') + t.start() + + +def get_pr_dirs(pr_number: int) -> list[str] | None: + """Return cached dirs for a PR, or None if not yet fetched (queues async fetch).""" + with _pr_dirs_lock: + if pr_number in _pr_dirs_cache: + return _pr_dirs_cache[pr_number] + _pr_dirs_fetch_queue.add(pr_number) + return None + + +def get_recent_commits(branch: str = 'next', page: int = 1, per_page: int = 50) -> list[dict]: + """Fetch a page of commits from GitHub API with 5-minute in-memory cache.""" + per_page = min(per_page, 100) + cache_key = f'{branch}:{page}:{per_page}' + now = time.time() + with _commits_lock: + cached = _commits_cache.get(cache_key) 
+ if cached and now - cached['ts'] < 300: + return cached['data'] + + data = _github_get( + f'repos/{REPO}/commits?sha={branch}&per_page={per_page}&page={page}' + ) + if not data or not isinstance(data, list): + result = [] + else: + result = [_parse_commit(raw) for raw in data] + + with _commits_lock: + _commits_cache[cache_key] = {'data': result, 'ts': now} + + # Enrich with cached dirs (non-blocking) + for c in result: + if c.get('pr'): + c['dirs'] = get_pr_dirs(c['pr']) + + return result diff --git a/ci3/ci-metrics/metrics.py b/ci3/ci-metrics/metrics.py index 5c0d1610e06b..8582ef0ad90c 100644 --- a/ci3/ci-metrics/metrics.py +++ b/ci3/ci-metrics/metrics.py @@ -1,9 +1,11 @@ -"""CI metrics: direct Redis reads + test event listener. +"""CI metrics: SQLite source of truth + Redis ingestion + test event listener. -Reads CI run data directly from Redis sorted sets on each request. +CI runs are ingested from Redis (written by log_ci_run on CI instances) and +stored in SQLite. All reads go through SQLite so enriched fields (instance_type +from CloudTrail, recalculated costs) are preserved. Test events stored in SQLite since they only arrive via pub/sub. -CI runs periodically synced from Redis to SQLite for flake correlation. """ +import hashlib import json import re import time @@ -21,6 +23,18 @@ _URL_PR_RE = re.compile(r'/pull/(\d+)') +def hash_str_orig(s: str) -> str: + """Replicate bash's `echo "$s" | git hash-object --stdin | cut -c1-16`. + + git hash-object computes SHA-1 of "blob \\0" where content + includes the trailing newline from echo. Length is byte length, not + Unicode code points. 
+ """ + content = (s + "\n").encode('utf-8') + blob = f"blob {len(content)}\0".encode('utf-8') + content + return hashlib.sha1(blob).hexdigest()[:16] + + def compute_run_cost(data: dict) -> float | None: complete = data.get('complete') ts = data.get('timestamp') @@ -31,7 +45,9 @@ def compute_run_cost(data: dict) -> float | None: is_spot = bool(data.get('spot')) rate = ec2_pricing.get_instance_rate(instance_type, is_spot) if not rate: - vcpus = data.get('instance_vcpus', 192) + vcpus = data.get('instance_vcpus') + if not vcpus: + return None # unknown instance type and no vCPU data rate = vcpus * ec2_pricing.get_fallback_vcpu_rate(is_spot) return round(hours * rate, 4) @@ -116,31 +132,37 @@ def _get_ci_runs_from_sqlite(date_from_ms=None, date_to_ms=None): return runs -def get_ci_runs(redis_conn, date_from_ms=None, date_to_ms=None): - """Read CI runs from Redis, backfilled with SQLite for data that Redis has flushed.""" - redis_runs = _get_ci_runs_from_redis(redis_conn, date_from_ms, date_to_ms) - - # Find the earliest timestamp in Redis to know what SQLite needs to fill - redis_keys = set() - redis_min_ts = float('inf') - for run in redis_runs: - ts = run.get('timestamp', 0) - redis_keys.add((run.get('dashboard', ''), ts, run.get('name', ''))) - if ts < redis_min_ts: - redis_min_ts = ts - - # If requesting data older than what Redis has, backfill from SQLite - sqlite_runs = [] - need_sqlite = (date_from_ms is not None and date_from_ms < redis_min_ts) or not redis_runs - if need_sqlite: - sqlite_to = int(redis_min_ts) if redis_runs else date_to_ms - sqlite_runs = _get_ci_runs_from_sqlite(date_from_ms, sqlite_to) - # Deduplicate: only include SQLite runs not already in Redis - sqlite_runs = [r for r in sqlite_runs - if (r.get('dashboard', ''), r.get('timestamp', 0), r.get('name', '')) - not in redis_keys] - - return sqlite_runs + redis_runs +def get_ci_runs(date_from_ms=None, date_to_ms=None): + """Read CI runs from SQLite (the source of truth). 
+ + Redis is only an ingestion pipe — sync_ci_runs_to_sqlite() copies data in. + All reads go through SQLite so enriched fields (instance_type from CloudTrail, + recalculated costs) are always reflected. + """ + return _get_ci_runs_from_sqlite(date_from_ms, date_to_ms) + + +def get_ci_runs_for_pr(pr_number: int, limit: int = 100) -> list: + """Return CI runs for a specific PR, most recent first.""" + rows = db.query( + 'SELECT * FROM ci_runs WHERE pr_number = ? ORDER BY timestamp_ms DESC LIMIT ?', + (pr_number, limit) + ) + return [{ + 'dashboard': row['dashboard'], + 'name': row['name'], + 'timestamp': row['timestamp_ms'], + 'complete': row['complete_ms'], + 'status': row['status'], + 'author': row['author'], + 'pr_number': row['pr_number'], + 'instance_type': row['instance_type'], + 'instance_vcpus': row.get('instance_vcpus'), + 'spot': bool(row['spot']), + 'cost_usd': row['cost_usd'], + 'job_id': row.get('job_id', ''), + 'arch': row.get('arch', ''), + } for row in rows] def _ts_to_date(ts_ms): @@ -149,6 +171,32 @@ def _ts_to_date(ts_ms): # ---- Test event handling (only thing needing SQLite) ---- +def _upsert_daily_stats(status: str, test_cmd: str, dashboard: str, timestamp: str, duration_secs=None): + """Increment the daily counter for a test status.""" + date = timestamp[:10] # 'YYYY-MM-DD' + col = status if status in ('passed', 'failed', 'flaked') else None + if not col: + return + d = duration_secs if duration_secs and duration_secs > 0 else None + if d: + db.execute(f''' + INSERT INTO test_daily_stats (date, test_cmd, dashboard, {col}, total_secs, count_timed, min_secs, max_secs) + VALUES (?, ?, ?, 1, ?, 1, ?, ?) 
+ ON CONFLICT(date, test_cmd, dashboard) DO UPDATE SET + {col} = {col} + 1, + total_secs = total_secs + excluded.total_secs, + count_timed = count_timed + 1, + min_secs = CASE WHEN min_secs IS NULL OR excluded.min_secs < min_secs THEN excluded.min_secs ELSE min_secs END, + max_secs = CASE WHEN max_secs IS NULL OR excluded.max_secs > max_secs THEN excluded.max_secs ELSE max_secs END + ''', (date, test_cmd, dashboard, d, d, d)) + else: + db.execute(f''' + INSERT INTO test_daily_stats (date, test_cmd, dashboard, {col}) + VALUES (?, ?, ?, 1) + ON CONFLICT(date, test_cmd, dashboard) DO UPDATE SET {col} = {col} + 1 + ''', (date, test_cmd, dashboard)) + + def _handle_test_event(channel: str, data: dict): status = channel.split(':')[-1] # Handle field name mismatches: run_test_cmd publishes 'cmd' for failed/flaked @@ -157,12 +205,19 @@ def _handle_test_event(channel: str, data: dict): log_url = data.get('log_url') or data.get('log_key') if log_url and not log_url.startswith('http'): log_url = f'http://ci.aztec-labs.com/{log_url}' + dashboard = data.get('dashboard', '') + timestamp = data.get('timestamp', datetime.now(timezone.utc).isoformat()) + test_hash = hash_str_orig(test_cmd) if test_cmd else None + + # Always update daily stats (lightweight aggregate) + _upsert_daily_stats(status, test_cmd, dashboard, timestamp, data.get('duration_secs')) + db.execute(''' INSERT INTO test_events (status, test_cmd, log_url, ref_name, commit_hash, commit_author, commit_msg, exit_code, duration_secs, is_scenario, owners, - flake_group_id, dashboard, timestamp) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + flake_group_id, dashboard, timestamp, test_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
''', ( status, test_cmd, @@ -176,14 +231,15 @@ def _handle_test_event(channel: str, data: dict): 1 if data.get('is_scenario_test') else 0, json.dumps(data['owners']) if data.get('owners') else None, data.get('flake_group_id'), - data.get('dashboard', ''), - data.get('timestamp', datetime.now(timezone.utc).isoformat()), + dashboard, + timestamp, + test_hash, )) def start_test_listener(redis_conn): """Subscribe to test event channels only. Reconnects on failure.""" - channels = [b'ci:test:started', b'ci:test:passed', b'ci:test:failed', b'ci:test:flaked'] + channels = [b'ci:test:passed', b'ci:test:failed', b'ci:test:flaked'] def listener(): backoff = 1 @@ -215,6 +271,163 @@ def listener(): return t +# ---- CI Phase timing listener ---- + +def _handle_phase_event(data: dict): + """Insert a CI phase timing event into SQLite.""" + db.execute(''' + INSERT INTO ci_phases + (phase, duration_secs, exit_code, run_id, job_id, dashboard, + ref_name, commit_hash, timestamp) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ''', ( + data.get('phase', ''), + data.get('duration_secs', 0), + data.get('exit_code'), + data.get('run_id', ''), + data.get('job_id', ''), + data.get('dashboard', ''), + data.get('ref_name', ''), + data.get('commit_hash', ''), + datetime.now(timezone.utc).isoformat(), + )) + + +def start_phase_listener(redis_conn): + """Subscribe to ci:phase:complete and store in ci_phases table.""" + def listener(): + backoff = 1 + while True: + try: + pubsub = redis_conn.pubsub() + pubsub.subscribe(b'ci:phase:complete') + backoff = 1 + for message in pubsub.listen(): + if message['type'] != 'message': + continue + try: + payload = message['data'] + if isinstance(payload, bytes): + payload = payload.decode() + _handle_phase_event(json.loads(payload)) + except Exception as e: + print(f"[rk_metrics] Error parsing phase event: {e}") + except Exception as e: + print(f"[rk_metrics] Phase listener error (reconnecting in {backoff}s): {e}") + time.sleep(backoff) + backoff = min(backoff * 2, 60) + + t = threading.Thread(target=listener, daemon=True, name='phase-listener') + t.start() + return t + + +def get_phases(date_from: str, date_to: str, dashboard: str = '', + run_id: str = '') -> dict: + """Query CI phase timing data for the API.""" + conditions = ['timestamp >= ?', 'timestamp < ?'] + params: list = [date_from, date_to + 'T23:59:59'] + if dashboard: + conditions.append('dashboard = ?') + params.append(dashboard) + if run_id: + conditions.append('run_id = ?') + params.append(run_id) + where = 'WHERE ' + ' AND '.join(conditions) + + # Aggregate by phase name + by_phase = db.query(f''' + SELECT phase, + COUNT(*) as count, + ROUND(AVG(duration_secs), 1) as avg_secs, + ROUND(MIN(duration_secs), 1) as min_secs, + ROUND(MAX(duration_secs), 1) as max_secs, + ROUND(SUM(duration_secs), 0) as total_secs + FROM ci_phases {where} + GROUP BY phase + ORDER BY total_secs DESC + ''', params) + + # Aggregate by date: avg duration per phase per day + date_rows = db.query(f''' + SELECT 
substr(timestamp, 1, 10) as date, phase, + ROUND(AVG(duration_secs), 1) as avg_secs, + COUNT(*) as count + FROM ci_phases {where} + GROUP BY date, phase + ORDER BY date + ''', params) + by_date: dict[str, dict] = {} + for row in date_rows: + d = row['date'] + if d not in by_date: + by_date[d] = {'date': d, 'phases': {}} + by_date[d]['phases'][row['phase']] = row['avg_secs'] + + # Recent individual runs with their phases + recent_runs = db.query(f''' + SELECT run_id, job_id, dashboard, ref_name, commit_hash, + phase, duration_secs, exit_code, timestamp + FROM ci_phases {where} + ORDER BY timestamp DESC + LIMIT 500 + ''', params) + runs_map: dict[str, dict] = {} + for row in recent_runs: + rid = row['run_id'] or row['timestamp'] + if rid not in runs_map: + runs_map[rid] = { + 'run_id': row['run_id'], 'job_id': row['job_id'], + 'dashboard': row['dashboard'], 'ref_name': row['ref_name'], + 'commit_hash': row['commit_hash'], 'phases': [], + } + runs_map[rid]['phases'].append({ + 'phase': row['phase'], + 'duration_secs': row['duration_secs'], + 'exit_code': row['exit_code'], + }) + + # Aggregate by dashboard: P95 duration per phase per pipeline. + # Step 1: sum durations within each (dashboard, phase, run_id) — multiple machines + # running the same phase in one run are summed, not counted separately. + # Step 2: compute P95 across run_ids in Python. 
+ per_run_rows = db.query(f''' + SELECT dashboard, phase, run_id, + ROUND(SUM(duration_secs), 3) as run_total + FROM ci_phases {where} + AND dashboard != '' + AND run_id != '' + GROUP BY dashboard, phase, run_id + ''', params) + + import math + from collections import defaultdict + run_totals: dict[tuple, list] = defaultdict(list) + for row in per_run_rows: + run_totals[(row['dashboard'], row['phase'])].append(row['run_total']) + + by_dashboard: dict[str, dict] = {} + for (dash, phase), totals in sorted(run_totals.items()): + totals_s = sorted(totals) + n = len(totals_s) + p95_idx = min(math.ceil(0.95 * n) - 1, n - 1) + p95 = round(totals_s[p95_idx], 1) + if dash not in by_dashboard: + by_dashboard[dash] = {'dashboard': dash, 'phases': {}, 'total_secs': 0, 'count': 0} + by_dashboard[dash]['phases'][phase] = p95 + by_dashboard[dash]['total_secs'] += sum(totals_s) + by_dashboard[dash]['count'] = max(by_dashboard[dash]['count'], n) + for d in by_dashboard.values(): + d['total_secs'] = round(d['total_secs'], 1) + + return { + 'by_phase': by_phase, + 'by_date': list(by_date.values()), + 'by_dashboard': list(by_dashboard.values()), + 'recent_runs': list(runs_map.values())[:50], + } + + # ---- Sync failed_tests_{section} lists from Redis into SQLite ---- _ANSI_STRIP = re.compile(r'\x1b\[[^m]*m|\x1b\]8;;[^\x07]*\x07') @@ -326,18 +539,18 @@ def sync_failed_tests_to_sqlite(redis_conn): _failed_tests_sync_ts = now conn = db.get_db() - # Track existing entries to avoid duplicates: log_url for entries that have one, - # (test_cmd, timestamp, dashboard) composite key for entries without log_url + # Track existing failed/flaked entries to avoid duplicates (this sync only + # processes failed/flaked from Redis lists, so no need to scan passed rows). 
existing_urls = {row['log_url'] for row in conn.execute( - "SELECT DISTINCT log_url FROM test_events WHERE log_url IS NOT NULL" + "SELECT DISTINCT log_url FROM test_events WHERE log_url IS NOT NULL AND status IN ('failed', 'flaked')" ).fetchall()} existing_keys = {(row['test_cmd'], row['timestamp'], row['dashboard']) for row in conn.execute( - "SELECT test_cmd, timestamp, dashboard FROM test_events WHERE log_url IS NULL" + "SELECT test_cmd, timestamp, dashboard FROM test_events WHERE log_url IS NULL AND status IN ('failed', 'flaked')" ).fetchall()} total = 0 - for section in SECTIONS: - key = f'failed_tests_{section}' + for section in SECTIONS + ['']: + key = f'failed_tests_{section}' if section else 'failed_tests' try: entries = redis_conn.lrange(key, 0, -1) except Exception as e: @@ -363,21 +576,27 @@ def sync_failed_tests_to_sqlite(redis_conn): INSERT INTO test_events (status, test_cmd, log_url, ref_name, commit_author, commit_msg, duration_secs, flake_group_id, dashboard, - timestamp) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + timestamp, test_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
''', ( parsed['status'], parsed['test_cmd'], parsed['log_url'], parsed['ref_name'], parsed['commit_author'], parsed['commit_msg'], parsed['duration_secs'], parsed['flake_group_id'], parsed['dashboard'], parsed['timestamp'], + hash_str_orig(parsed['test_cmd']) if parsed['test_cmd'] else None, )) + _upsert_daily_stats( + parsed['status'], parsed['test_cmd'], + parsed['dashboard'], parsed['timestamp']) total += 1 except Exception as e: print(f"[rk_metrics] Error inserting test event: {e}") conn.commit() if total: print(f"[rk_metrics] Synced {total} test events from Redis lists") + db.cache_invalidate_prefix('flakes:') + db.cache_invalidate_prefix('timings:') # ---- Seed loading ---- @@ -437,15 +656,16 @@ def _load_seed_data(): events = data['test_events'] for ev in events: try: + te_cmd = ev.get('test_cmd', '') conn.execute(''' INSERT OR IGNORE INTO test_events (status, test_cmd, log_url, ref_name, commit_hash, commit_author, commit_msg, exit_code, duration_secs, is_scenario, owners, - flake_group_id, dashboard, timestamp) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + flake_group_id, dashboard, timestamp, test_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( ev.get('status', ''), - ev.get('test_cmd', ''), + te_cmd, ev.get('log_url'), ev.get('ref_name', ''), ev.get('commit_hash'), @@ -458,6 +678,7 @@ def _load_seed_data(): ev.get('flake_group_id'), ev.get('dashboard', ''), ev.get('timestamp', ''), + hash_str_orig(te_cmd) if te_cmd else None, )) except Exception: continue @@ -472,14 +693,19 @@ def _load_seed_data(): def sync_ci_runs_to_sqlite(redis_conn): - """Sync all CI runs from Redis into SQLite for persistence.""" + """Ingest CI runs from Redis into SQLite. + + Redis is the ingestion pipe (log_ci_run writes there from CI instances). + SQLite is the source of truth. Fields enriched post-ingestion (instance_type, + cost_usd from CloudTrail resolution) are preserved — only overwritten if + Redis has a non-empty value. 
+ """ global _ci_sync_ts now = time.time() if now - _ci_sync_ts < _CI_SYNC_TTL: return _ci_sync_ts = now - # Sync everything Redis has (not just 30 days) runs = _get_ci_runs_from_redis(redis_conn) now_iso = datetime.now(timezone.utc).isoformat() @@ -488,11 +714,32 @@ def sync_ci_runs_to_sqlite(redis_conn): for run in runs: try: conn.execute(''' - INSERT OR REPLACE INTO ci_runs + INSERT INTO ci_runs (dashboard, name, timestamp_ms, complete_ms, status, author, pr_number, instance_type, instance_vcpus, spot, cost_usd, job_id, arch, synced_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(dashboard, timestamp_ms, name) DO UPDATE SET + complete_ms = excluded.complete_ms, + status = excluded.status, + author = excluded.author, + pr_number = excluded.pr_number, + instance_vcpus = excluded.instance_vcpus, + spot = excluded.spot, + job_id = excluded.job_id, + arch = excluded.arch, + synced_at = excluded.synced_at, + -- Preserve enriched fields: only overwrite if Redis has real data + instance_type = CASE + WHEN excluded.instance_type IS NOT NULL AND excluded.instance_type != '' + THEN excluded.instance_type + ELSE ci_runs.instance_type + END, + cost_usd = CASE + WHEN excluded.instance_type IS NOT NULL AND excluded.instance_type != '' + THEN excluded.cost_usd + ELSE ci_runs.cost_usd + END ''', ( run.get('dashboard', ''), run.get('name', ''), @@ -514,17 +761,372 @@ def sync_ci_runs_to_sqlite(redis_conn): print(f"[rk_metrics] Error syncing run: {e}") conn.commit() print(f"[rk_metrics] Synced {count} CI runs to SQLite") + db.cache_invalidate_prefix('perf:') + + +def _backfill_daily_stats(): + """Populate test_daily_stats from existing test_events rows. + + Uses INSERT OR IGNORE to fill gaps without overwriting data from the + real-time listener. Safe to call repeatedly — skips dates/tests that + already have rows. 
+ """ + conn = db.get_db() + cur = conn.execute(''' + INSERT OR IGNORE INTO test_daily_stats (date, test_cmd, dashboard, passed, failed, flaked) + SELECT substr(timestamp, 1, 10) as date, test_cmd, dashboard, + SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END), + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), + SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) + FROM test_events + GROUP BY substr(timestamp, 1, 10), test_cmd, dashboard + ''') + conn.commit() + if cur.rowcount and cur.rowcount > 0: + print(f"[rk_metrics] Backfilled {cur.rowcount} daily stat rows from test_events") + + +def _materialize_ci_run_daily_stats(): + """Recompute ci_run_daily_stats from ci_runs. + + Replaces all rows — safe to call repeatedly. Stores pre-aggregated + duration percentiles so the API doesn't need to scan raw rows. + """ + conn = db.get_db() + # Fetch raw daily durations grouped by date + dashboard + rows = conn.execute(''' + SELECT + strftime('%Y-%m-%d', timestamp_ms / 1000, 'unixepoch') AS date, + dashboard, + (complete_ms - timestamp_ms) / 60000.0 AS dur_mins + FROM ci_runs + WHERE status IN ('PASSED', 'FAILED') + AND complete_ms IS NOT NULL AND complete_ms > timestamp_ms + ''').fetchall() + + # Group durations: {(date, dashboard): [dur_mins, ...]} + groups = {} + for r in rows: + key = (r['date'], r['dashboard']) + groups.setdefault(key, {'passed': 0, 'failed': 0, 'durs': []}) + groups[key]['durs'].append(r['dur_mins']) + + # Also count pass/fail per group + status_rows = conn.execute(''' + SELECT + strftime('%Y-%m-%d', timestamp_ms / 1000, 'unixepoch') AS date, + dashboard, status, COUNT(*) as cnt + FROM ci_runs + WHERE status IN ('PASSED', 'FAILED') + GROUP BY date, dashboard, status + ''').fetchall() + for r in status_rows: + key = (r['date'], r['dashboard']) + if key not in groups: + groups[key] = {'passed': 0, 'failed': 0, 'durs': []} + if r['status'] == 'PASSED': + groups[key]['passed'] = r['cnt'] + else: + groups[key]['failed'] = r['cnt'] + + 
conn.execute('DELETE FROM ci_run_daily_stats') + inserted = 0 + for (date, dashboard), g in groups.items(): + durs = sorted(g['durs']) + n = len(durs) + conn.execute(''' + INSERT INTO ci_run_daily_stats + (date, dashboard, run_count, passed, failed, + sum_duration, min_duration, max_duration, p50_duration, p95_duration) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', ( + date, dashboard, g['passed'] + g['failed'], + g['passed'], g['failed'], + round(sum(durs), 2) if durs else 0, + round(min(durs), 1) if durs else None, + round(max(durs), 1) if durs else None, + round(durs[n // 2], 1) if durs else None, + round(durs[int(n * 0.95)], 1) if durs else None, + )) + inserted += 1 + conn.commit() + print(f"[rk_metrics] Materialized {inserted} ci_run_daily_stats rows") + + +def _backfill_test_hashes(): + """Populate test_hash for existing test_events rows that are missing it.""" + conn = db.get_db() + rows = conn.execute( + "SELECT DISTINCT test_cmd FROM test_events WHERE test_hash IS NULL AND test_cmd != ''" + ).fetchall() + if not rows: + return + for row in rows: + cmd = row['test_cmd'] + h = hash_str_orig(cmd) + conn.execute( + "UPDATE test_events SET test_hash = ? WHERE test_cmd = ? 
AND test_hash IS NULL", + (h, cmd)) + conn.commit() + print(f"[rk_metrics] Backfilled test_hash for {len(rows)} distinct test commands") + + +# ---- CloudTrail instance type resolution ---- + +_ct_resolve_ts = 0 +_CT_RESOLVE_TTL = 6 * 3600 # 6 hours + + +def _fetch_cloudtrail_daily(ct, event_name, start_time, end_time, max_per_day=10000): + """Fetch CloudTrail events in daily chunks to avoid the 5000-event global limit.""" + events = [] + day = start_time.replace(hour=0, minute=0, second=0, microsecond=0) + while day < end_time: + day_end = min(day + timedelta(days=1), end_time) + kwargs = { + 'LookupAttributes': [ + {'AttributeKey': 'EventName', 'AttributeValue': event_name}, + ], + 'StartTime': day, + 'EndTime': day_end, + 'MaxResults': 50, + } + while True: + resp = ct.lookup_events(**kwargs) + events.extend(resp.get('Events', [])) + token = resp.get('NextToken') + if not token or len(events) >= max_per_day: + break + kwargs['NextToken'] = token + day += timedelta(days=1) + return events + + +# Name tag format: _[_] +_NAME_TAG_RE = re.compile(r'^(.+)_(amd64|arm64)(?:_.*)?$') + + +def _normalize_branch_name(name): + """Normalize a branch name the same way bootstrap_ec2 does for the EC2 Name tag.""" + m = re.match(r'^gh-readonly-queue/[^/]+/pr-(\d+)', name) + if m: + return f'pr-{m.group(1)}' + name = re.sub(r'\s*\(queue\)$', '', name) + return re.sub(r'[^a-zA-Z0-9-]', '_', name[:50]) + + +def resolve_unknown_instance_types(): + """Query CloudTrail for RunInstances + CreateTags events to resolve unknown instance types. + + Strategy: + 1. Fetch RunInstances events (daily chunks) → instance_id → instance_type + launch_time + 2. Fetch CreateTags events (daily chunks) → instance_id → {Name, Group, Dashboard, ...} + Tags are accumulated across multiple events then filtered to Group=build-instance. + 3. Join by instance_id, then match to ci_runs by normalized branch name + arch + time window. 
+ """ + global _ct_resolve_ts + now = time.time() + if now - _ct_resolve_ts < _CT_RESOLVE_TTL: + return + _ct_resolve_ts = now + + conn = db.get_db() + unknown_runs = conn.execute(''' + SELECT dashboard, name, timestamp_ms, complete_ms, instance_vcpus, spot, + cost_usd, arch, pr_number + FROM ci_runs + WHERE (instance_type IS NULL OR instance_type = '' OR instance_type = 'unknown') + AND timestamp_ms > ? + ''', (int((time.time() - 90 * 86400) * 1000),)).fetchall() + + if not unknown_runs: + return + + try: + import boto3 + except ImportError: + return + + try: + ct = boto3.client('cloudtrail', region_name='us-east-2') + start_time = datetime.fromtimestamp( + min(r['timestamp_ms'] for r in unknown_runs) / 1000 - 300, tz=timezone.utc) + end_time = datetime.now(timezone.utc) + + # Step 1: Fetch RunInstances events in daily chunks → instance_id → type + launch time + run_events = _fetch_cloudtrail_daily(ct, 'RunInstances', start_time, end_time) + instance_types = {} + instance_launch_times = {} + for event in run_events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + itype = detail.get('requestParameters', {}).get('instanceType', '') + items = (detail.get('responseElements') or {}).get('instancesSet', {}).get('items', []) + for item in items: + iid = item.get('instanceId', '') + item_type = item.get('instanceType', '') or itype + if iid and item_type: + instance_types[iid] = item_type + instance_launch_times[iid] = int(event['EventTime'].timestamp() * 1000) + except Exception: + continue + + if not instance_types: + print("[rk_metrics] CloudTrail: no RunInstances events found") + return + + # Step 2: Fetch CreateTags events in daily chunks. + # Accumulate ALL tags per instance first, then filter to build instances. + # This handles the case where Name, Group, and Dashboard are set in separate + # create-tags API calls (aws_request_instance_type lines 97, 126, 127). 
+ tag_events = _fetch_cloudtrail_daily(ct, 'CreateTags', start_time, end_time) + all_instance_tags = {} + for event in tag_events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + req = detail.get('requestParameters', {}) + resources = req.get('resourcesSet', {}).get('items', []) + tags = req.get('tagSet', {}).get('items', []) + tag_dict = {t.get('key', ''): t.get('value', '') for t in tags} + for res in resources: + rid = res.get('resourceId', '') + if rid.startswith('i-'): + if rid not in all_instance_tags: + all_instance_tags[rid] = {} + all_instance_tags[rid].update(tag_dict) + except Exception: + continue + + # Filter to build instances + instance_tags = { + iid: tags for iid, tags in all_instance_tags.items() + if tags.get('Group') == 'build-instance' + } + + # Step 3: Join RunInstances + CreateTags by instance_id + instances = [] + for iid, itype in instance_types.items(): + tags = instance_tags.get(iid, {}) + if not tags.get('Name'): + continue + instances.append({ + 'instance_type': itype, + 'launch_ms': instance_launch_times.get(iid, 0), + 'dashboard': tags.get('Dashboard', ''), + 'name_tag': tags.get('Name', ''), + }) + + # Build index: normalized branch name → [instances] + tag_index = {} + for inst in instances: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m: + tag_index.setdefault(m.group(1), []).append(inst) + else: + tag_index.setdefault(inst['name_tag'], []).append(inst) + + # Step 4: Match unknown runs to instances + updated = 0 + for run in unknown_runs: + run_name = run['name'] + run_arch = run['arch'] or '' + run_ts = run['timestamp_ms'] + run_dashboard = run['dashboard'] + + expected_name = _normalize_branch_name(run_name) + candidates = tag_index.get(expected_name, []) + + best = None + for inst in candidates: + # Verify arch matches + if run_arch: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m and m.group(2) != run_arch: + continue + + # Verify dashboard matches (if tag present) + if inst['dashboard'] and 
inst['dashboard'] != run_dashboard: + continue + + # CI run starts after instance launch; allow up to 90 min (instance lifetime) + delta = run_ts - inst['launch_ms'] + if delta < -60_000 or delta > 5400_000: + continue + + # Prefer most recently launched instance before the run + if delta >= 0 and (best is None or inst['launch_ms'] > best['launch_ms']): + best = inst + elif best is None and abs(delta) < 60_000: + best = inst + + if best: + itype = best['instance_type'] + is_spot = bool(run['spot']) + rate = ec2_pricing.get_instance_rate(itype, is_spot) + new_cost = run['cost_usd'] + if rate and run['complete_ms'] and run['timestamp_ms']: + hours = (run['complete_ms'] - run['timestamp_ms']) / 3_600_000 + new_cost = round(hours * rate, 4) + conn.execute(''' + UPDATE ci_runs SET instance_type = ?, cost_usd = ? + WHERE dashboard = ? AND timestamp_ms = ? AND name = ? + ''', (itype, new_cost, run['dashboard'], run['timestamp_ms'], run['name'])) + updated += 1 + + conn.commit() + if updated: + print(f"[rk_metrics] CloudTrail: resolved {updated}/{len(unknown_runs)} unknown instance types") + else: + print(f"[rk_metrics] CloudTrail: {len(instances)} instances, " + f"0/{len(unknown_runs)} matched") + except Exception as e: + print(f"[rk_metrics] CloudTrail resolution failed: {e}") + + +def recalculate_all_costs(): + """Recalculate cost_usd for all ci_runs based on current instance_type and pricing.""" + conn = db.get_db() + runs = conn.execute(''' + SELECT dashboard, name, timestamp_ms, complete_ms, instance_type, + instance_vcpus, spot, cost_usd + FROM ci_runs + WHERE complete_ms IS NOT NULL AND complete_ms > 0 + ''').fetchall() + updated = 0 + for run in runs: + cost = compute_run_cost({ + 'complete': run['complete_ms'], + 'timestamp': run['timestamp_ms'], + 'instance_type': run['instance_type'] or 'unknown', + 'spot': run['spot'], + 'instance_vcpus': run['instance_vcpus'], + }) + if cost is not None and cost != run['cost_usd']: + conn.execute(''' + UPDATE ci_runs SET 
cost_usd = ? + WHERE dashboard = ? AND timestamp_ms = ? AND name = ? + ''', (cost, run['dashboard'], run['timestamp_ms'], run['name'])) + updated += 1 + conn.commit() + print(f"[rk_metrics] Recalculated costs: {updated}/{len(runs)} changed") + return updated def start_ci_run_sync(redis_conn): """Start periodic CI run + test event sync thread.""" _load_seed_data() + _backfill_daily_stats() + _backfill_test_hashes() + _materialize_ci_run_daily_stats() def loop(): while True: try: sync_ci_runs_to_sqlite(redis_conn) sync_failed_tests_to_sqlite(redis_conn) + resolve_unknown_instance_types() + _materialize_ci_run_daily_stats() + db.cache_cleanup() except Exception as e: print(f"[rk_metrics] sync error: {e}") time.sleep(600) # check every 10 min (TTL gates actual work) @@ -600,3 +1202,21 @@ def get_flakes_by_command(date_from, date_to, dashboard=''): 'total_failures': sum(failures_by_command.values()), }, } + + +def get_test_history(test_hash: str, branch: str = '', limit: int = 1000) -> list[dict]: + """Get test event history by test_hash, matching Redis history_{hash}[_{branch}] lists.""" + conditions = ['test_hash = ?'] + params: list = [test_hash] + if branch: + conditions.append('ref_name = ?') + params.append(branch) + where = 'WHERE ' + ' AND '.join(conditions) + params.append(limit) + return db.query(f''' + SELECT status, test_cmd, log_url, ref_name, commit_author, + commit_msg, duration_secs, dashboard, timestamp + FROM test_events {where} + ORDER BY timestamp DESC + LIMIT ? 
+ ''', params) diff --git a/ci3/ci-metrics/requirements.txt b/ci3/ci-metrics/requirements.txt index d6516263133f..310ecadf230a 100644 --- a/ci3/ci-metrics/requirements.txt +++ b/ci3/ci-metrics/requirements.txt @@ -6,3 +6,4 @@ Flask-HTTPAuth requests google-cloud-bigquery boto3 +pytest diff --git a/ci3/ci-metrics/rk.py b/ci3/ci-metrics/rk.py new file mode 100644 index 000000000000..d099b92dbbd7 --- /dev/null +++ b/ci3/ci-metrics/rk.py @@ -0,0 +1 @@ +from app import app diff --git a/ci3/ci-metrics/test_cache.py b/ci3/ci-metrics/test_cache.py new file mode 100644 index 000000000000..5538a3810958 --- /dev/null +++ b/ci3/ci-metrics/test_cache.py @@ -0,0 +1,154 @@ +"""Automated performance tests: SQLite response cache makes 1-year ci-insights fast. + +Usage: + pip install pytest + METRICS_DB_PATH=/tmp/t.db DASHBOARD_PASSWORD=test REDIS_HOST=invalid pytest test_cache.py -v + +All 18 parametrised tests should pass. Cold requests may take several seconds; +warm (cached) requests must be < 100 ms each. 
+""" +import base64 +import json +import os +import tempfile +import time +from datetime import date, timedelta + +# Set env vars BEFORE importing the app so db path and Redis host are fixed +_db_path = tempfile.mktemp(suffix='.db') +os.environ.setdefault('METRICS_DB_PATH', _db_path) +os.environ.setdefault('DASHBOARD_PASSWORD', 'test') +os.environ.setdefault('REDIS_HOST', 'invalid') # causes Redis errors, swallowed silently + +import pytest + +# Import app after env vars are set; background threads start but Redis fails gracefully +from app import app +import db + +# Basic-auth header for 'test:test' +_AUTH = {'Authorization': 'Basic ' + base64.b64encode(b'test:test').decode()} + +YEAR_FROM = '2025-02-24' +YEAR_TO = '2026-02-24' + +ENDPOINTS = [ + f'/api/ci/performance?from={YEAR_FROM}&to={YEAR_TO}&granularity=daily', + f'/api/ci/phases?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/ci/flakes-by-command?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/tests/timings?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/merge-queue/stats?from={YEAR_FROM}&to={YEAR_TO}', + f'/api/prs/metrics?from={YEAR_FROM}&to={YEAR_TO}', +] + + +def _seed(): + """Insert one year of synthetic data covering all 6 ci-insights endpoints.""" + conn = db.get_db() + dashboards = ['next', 'prs', 'master'] + start = date(2025, 2, 24) + end = date(2026, 2, 24) + ts_base = int(time.mktime(start.timetuple())) * 1000 + ms_per_day = 86_400_000 + + for i, day in enumerate( + start + timedelta(days=n) for n in range((end - start).days + 1) + ): + ds = day.isoformat() + ts = ts_base + i * ms_per_day + + # merge_queue_daily — one row per day + conn.execute( + 'INSERT OR IGNORE INTO merge_queue_daily (date, total, success, failure) VALUES (?,10,8,2)', + (ds,), + ) + + for dash in dashboards: + # ci_runs — 5 per pipeline per day + for j in range(5): + conn.execute( + '''INSERT OR IGNORE INTO ci_runs + (dashboard, name, timestamp_ms, complete_ms, status, author, synced_at) + VALUES (?,?,?,?,?,?,?)''', + ( + dash, 
f'run-{i}-{dash}-{j}', + ts + j * 60_000, + ts + j * 60_000 + 3_600_000, + 'PASSED' if j % 5 != 0 else 'FAILED', + 'ci-bot', ds, + ), + ) + + # test_daily_stats + conn.execute( + '''INSERT OR IGNORE INTO test_daily_stats + (date, test_cmd, dashboard, passed, failed, flaked) VALUES (?,?,?,80,5,2)''', + (ds, f'test_{dash}', dash), + ) + + # test_events — 3 per pipeline per day (one flaked for flakes endpoint) + for j in range(3): + conn.execute( + '''INSERT OR IGNORE INTO test_events + (status, test_cmd, ref_name, dashboard, timestamp, duration_secs) + VALUES (?,?,?,?,?,?)''', + ( + 'passed' if j < 2 else 'flaked', + f'test_{dash}', + 'main', dash, + f'{ds}T12:00:0{j}', + 30.0 + j, + ), + ) + + # ci_phases — build + test phases per pipeline per day + for phase in ('build', 'test'): + conn.execute( + '''INSERT OR IGNORE INTO ci_phases + (phase, duration_secs, dashboard, timestamp) VALUES (?,?,?,?)''', + (phase, 1200.0, dash, f'{ds}T12:00:00'), + ) + + conn.commit() + + +@pytest.fixture(scope='session', autouse=True) +def seeded_db(): + _seed() + + +@pytest.fixture(scope='session') +def client(): + app.config['TESTING'] = True + with app.test_client() as c: + yield c + + +@pytest.mark.parametrize('url', ENDPOINTS) +def test_cold_returns_valid_json(client, url): + """First request computes from SQLite and returns valid JSON.""" + r = client.get(url, headers=_AUTH) + assert r.status_code == 200, f'HTTP {r.status_code}: {r.data[:200]}' + data = json.loads(r.data) + assert data # non-empty response + + +@pytest.mark.parametrize('url', ENDPOINTS) +def test_warm_hit_under_100ms(client, url): + """Second request is served from cache and completes in < 100 ms.""" + # Ensure cold request ran (order not guaranteed across parametrised tests) + client.get(url, headers=_AUTH) + # Warm request — must hit cache + t0 = time.perf_counter() + r = client.get(url, headers=_AUTH) + elapsed_ms = (time.perf_counter() - t0) * 1000 + assert r.status_code == 200 + assert elapsed_ms < 100, 
f'{url}: cache hit took {elapsed_ms:.1f} ms (limit 100 ms)' + + +@pytest.mark.parametrize('url', ENDPOINTS) +def test_cached_response_matches_original(client, url): + """Cached response is byte-for-byte identical to the original.""" + r1 = client.get(url, headers=_AUTH) + r2 = client.get(url, headers=_AUTH) + assert r1.data == r2.data diff --git a/ci3/ci-metrics/test_cloudtrail.py b/ci3/ci-metrics/test_cloudtrail.py new file mode 100644 index 000000000000..8acd71925cec --- /dev/null +++ b/ci3/ci-metrics/test_cloudtrail.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 +"""Test CloudTrail instance type resolution against real data + SQLite. + +Usage: + python3 test_cloudtrail.py /path/to/metrics.db --dry-run # preview matches + python3 test_cloudtrail.py /path/to/metrics.db # apply updates + python3 test_cloudtrail.py --days 7 --dry-run # only last 7 days +""" +import json +import os +import re +import sqlite3 +import sys +from datetime import datetime, timedelta, timezone + +try: + import boto3 +except ImportError: + print("ERROR: boto3 not installed") + sys.exit(1) + +DB_PATH = os.getenv('METRICS_DB_PATH', + os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db')) +for arg in sys.argv[1:]: + if not arg.startswith('-') and os.path.exists(arg): + DB_PATH = arg + break + +dry_run = '--dry-run' in sys.argv +days_back = 90 +for i, arg in enumerate(sys.argv): + if arg == '--days' and i + 1 < len(sys.argv): + days_back = int(sys.argv[i + 1]) + +ct = boto3.client('cloudtrail', region_name='us-east-2') + + +def fetch_events(event_name, start, end, max_events=10000): + events = [] + kwargs = { + 'LookupAttributes': [{'AttributeKey': 'EventName', 'AttributeValue': event_name}], + 'StartTime': start, 'EndTime': end, 'MaxResults': 50, + } + while True: + resp = ct.lookup_events(**kwargs) + events.extend(resp.get('Events', [])) + token = resp.get('NextToken') + if not token or len(events) >= max_events: + break + kwargs['NextToken'] = token + return events + + +def 
normalize_branch_name(name): + """Normalize a branch name the same way bootstrap_ec2 does for the EC2 Name tag.""" + # Strip merge queue prefix: gh-readonly-queue/next/pr-123-... → pr-123 + m = re.match(r'^gh-readonly-queue/[^/]+/pr-(\d+)', name) + if m: + return f'pr-{m.group(1)}' + # Strip " (queue)" suffix from log_ci_run simplified names + name = re.sub(r'\s*\(queue\)$', '', name) + # Same as: echo -n "$REF_NAME" | head -c 50 | tr -c 'a-zA-Z0-9-' '_' + return re.sub(r'[^a-zA-Z0-9-]', '_', name[:50]) + + +# ---- Step 1: Fetch RunInstances events in daily chunks ---- +end_time = datetime.now(timezone.utc) +start_time = end_time - timedelta(days=days_back) + +print(f"Fetching RunInstances events in daily chunks ({start_time.date()} to {end_time.date()})...") +instance_types = {} # instance_id → instance_type +instance_times = {} # instance_id → launch_time_ms +total_run_events = 0 + +day_start = start_time.replace(hour=0, minute=0, second=0, microsecond=0) +while day_start < end_time: + day_end = min(day_start + timedelta(days=1), end_time) + events = fetch_events('RunInstances', day_start, day_end) + total_run_events += len(events) + + for event in events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + itype = detail.get('requestParameters', {}).get('instanceType', '') + items = (detail.get('responseElements') or {}).get('instancesSet', {}).get('items', []) + for item in items: + iid = item.get('instanceId', '') + item_type = item.get('instanceType', '') or itype + if iid and item_type: + instance_types[iid] = item_type + instance_times[iid] = int(event['EventTime'].timestamp() * 1000) + except Exception: + continue + + day_start = day_start + timedelta(days=1) + sys.stdout.write(f"\r {day_start.strftime('%Y-%m-%d')}: {total_run_events} events, {len(instance_types)} instances") + sys.stdout.flush() + +print(f"\n Total: {total_run_events} RunInstances events, {len(instance_types)} unique instances") + +if not instance_types: + print("No 
RunInstances data. Exiting.") + sys.exit(1) + +# ---- Step 2: Fetch CreateTags events in daily chunks ---- +# NOTE: Tags are applied to CI instances in multiple create-tags calls: +# 1. aws_request_instance_type line 97: Name + Group + GithubActor + CICommand + Dashboard (all at once) +# 2. aws_request_instance_type line 126: Name only (redundant, after SSH) +# 3. aws_request_instance_type line 127: Group only (redundant, after SSH) +# CloudTrail sometimes misses event #1, so we must accumulate tags from ALL events +# for each instance, then filter to build instances afterwards. +print(f"\nFetching CreateTags events in daily chunks...") +all_instance_tags = {} # instance_id → accumulated tags (unfiltered) +total_tag_events = 0 + +day_start = start_time.replace(hour=0, minute=0, second=0, microsecond=0) +while day_start < end_time: + day_end = min(day_start + timedelta(days=1), end_time) + events = fetch_events('CreateTags', day_start, day_end) + total_tag_events += len(events) + + for event in events: + try: + detail = json.loads(event.get('CloudTrailEvent', '{}')) + req = detail.get('requestParameters', {}) + resources = req.get('resourcesSet', {}).get('items', []) + tags = req.get('tagSet', {}).get('items', []) + tag_dict = {t.get('key', ''): t.get('value', '') for t in tags} + for res in resources: + rid = res.get('resourceId', '') + if rid.startswith('i-'): + if rid not in all_instance_tags: + all_instance_tags[rid] = {} + all_instance_tags[rid].update(tag_dict) + except Exception: + continue + + day_start = day_start + timedelta(days=1) + sys.stdout.write(f"\r {day_start.strftime('%Y-%m-%d')}: {total_tag_events} events, {len(all_instance_tags)} instances") + sys.stdout.flush() + +# Filter to build instances (those with Group=build-instance tag) +instance_tags = { + iid: tags for iid, tags in all_instance_tags.items() + if tags.get('Group') == 'build-instance' +} +print(f"\n Total: {total_tag_events} CreateTags events, {len(all_instance_tags)} total instances, 
{len(instance_tags)} build instances") + +# ---- Step 3: Join RunInstances + CreateTags by instance_id ---- +instances = [] +joined_count = 0 +for iid, itype in instance_types.items(): + tags = instance_tags.get(iid, {}) + has_tags = bool(tags.get('Name')) + if has_tags: + joined_count += 1 + instances.append({ + 'instance_id': iid, + 'instance_type': itype, + 'launch_ms': instance_times.get(iid, 0), + 'dashboard': tags.get('Dashboard', ''), + 'name_tag': tags.get('Name', ''), + 'actor': tags.get('GithubActor', ''), + }) + +print(f"\n Joined: {len(instances)} total RunInstances, {joined_count} with Name tag from CreateTags") +print(f" CreateTags instances NOT in RunInstances: {len(instance_tags) - joined_count}") + +# Show type distribution +type_counts = {} +for inst in instances: + if inst['name_tag']: + type_counts[inst['instance_type']] = type_counts.get(inst['instance_type'], 0) + 1 +print(f"\n Instance types (from joined data):") +for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t}: {c}") + +# ---- Step 4: Load SQLite and match ---- +if not os.path.exists(DB_PATH): + print(f"\nNo database at {DB_PATH}. Exiting after CloudTrail summary.") + sys.exit(0) + +conn = sqlite3.connect(DB_PATH) +conn.row_factory = sqlite3.Row + +cutoff_ms = int((datetime.now(timezone.utc) - timedelta(days=days_back)).timestamp() * 1000) +unknown_runs = conn.execute(''' + SELECT dashboard, name, timestamp_ms, complete_ms, instance_vcpus, spot, + cost_usd, arch, pr_number + FROM ci_runs + WHERE (instance_type IS NULL OR instance_type = '' OR instance_type = 'unknown') + AND timestamp_ms > ? 
+''', (cutoff_ms,)).fetchall() +print(f"\n{len(unknown_runs)} unknown ci_runs in last {days_back} days") + +# Build lookup: normalized_name → [instances] for fast matching +# Name tag format: _[_] +# Examples: +# next_amd64 → branch=next +# merge-train_spartan_amd64_17 → branch=merge-train_spartan +# pr-20419_arm64_a1-fast → branch=pr-20419 +# cl_all_in_makefile_arm64_a1-fast → branch=cl_all_in_makefile +_NAME_TAG_RE = re.compile(r'^(.+)_(amd64|arm64)(?:_.*)?$') +tag_index = {} +for inst in instances: + if inst['name_tag']: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m: + branch = m.group(1) + tag_index.setdefault(branch, []).append(inst) + else: + # No arch suffix found — use name as-is + tag_index.setdefault(inst['name_tag'], []).append(inst) + +updated = 0 +unmatched_dashboards = {} +matches = [] +for run in unknown_runs: + run_name = run['name'] + run_arch = run['arch'] or '' + run_ts = run['timestamp_ms'] + run_dashboard = run['dashboard'] + + # Compute expected EC2 instance name (same as bootstrap_ec2) + expected_name = normalize_branch_name(run_name) + + # Look up by normalized name + candidates = tag_index.get(expected_name, []) + + best = None + best_delta = float('inf') + for inst in candidates: + # Verify arch matches — Name tag format: branch_[_postfix] + if run_arch: + m = _NAME_TAG_RE.match(inst['name_tag']) + if m and m.group(2) != run_arch: + continue + # Verify dashboard matches (if tag present) + if inst['dashboard'] and inst['dashboard'] != run_dashboard: + continue + # CI run should start AFTER instance launch. Instance runs multiple steps + # over its ~90-minute lifetime (default shutdown timer). 
+ delta = run_ts - inst['launch_ms'] + if delta < -60_000: # run shouldn't start >1 min before launch + continue + if delta > 5400_000: # 90 min max lifetime + continue + # Prefer the most recently launched instance (closest launch BEFORE run) + if delta >= 0 and (best is None or inst['launch_ms'] > best['launch_ms']): + best_delta = delta + best = inst + elif best is None and abs(delta) < 60_000: + # Allow small negative delta (clock skew) + best_delta = abs(delta) + best = inst + + if best: + matches.append({ + 'dashboard': run_dashboard, + 'name': run_name, + 'timestamp_ms': run_ts, + 'new_type': best['instance_type'], + 'delta_s': round(best_delta / 1000), + 'tag': best['name_tag'], + 'iid': best['instance_id'], + }) + if not dry_run: + conn.execute(''' + UPDATE ci_runs SET instance_type = ? + WHERE dashboard = ? AND timestamp_ms = ? AND name = ? + ''', (best['instance_type'], run_dashboard, run_ts, run_name)) + updated += 1 + else: + unmatched_dashboards[run_dashboard] = unmatched_dashboards.get(run_dashboard, 0) + 1 + +if not dry_run and updated: + conn.commit() + +print(f"\n{'Would resolve' if dry_run else 'Resolved'} {updated}/{len(unknown_runs)} unknown instance types") + +if matches: + print(f"\nSample matches:") + for m in matches[:30]: + print(f" [{m['dashboard']:6s}] {m['name']:45s} -> {m['new_type']:15s} " + f"(dt={m['delta_s']:4d}s, tag={m['tag']}, id={m['iid']})") + if len(matches) > 30: + print(f" ... 
and {len(matches) - 30} more") + + # Summary by type + type_counts = {} + for m in matches: + type_counts[m['new_type']] = type_counts.get(m['new_type'], 0) + 1 + print(f"\nResolved types:") + for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t}: {c}") + +if unmatched_dashboards: + print(f"\nUnmatched by dashboard:") + for d, c in sorted(unmatched_dashboards.items(), key=lambda x: -x[1]): + print(f" {d}: {c}") + +conn.close() diff --git a/ci3/ci-metrics/views/ci-health-report.html b/ci3/ci-metrics/views/ci-health-report.html new file mode 100644 index 000000000000..e23b0165116e --- /dev/null +++ b/ci3/ci-metrics/views/ci-health-report.html @@ -0,0 +1,999 @@ + + + + + +Aztec CI Health Report + + + + + +
+ + +
+
aztec-packages · Jan 21 – Feb 23, 2026
+

CI Health Report

+
34-day view · spend · flake · pipeline health · actions
+
+
$6,107
AWS CI EC2 (34d · metrics.db)
+
$22,738
AWS operational total (34d · Cost Explorer, ex-tax, ex-contract)
+
$9,972
GCP compute (30d · 31 namespaces)
+
63.5%
merge queue success (712 attempts)
+
92.3%
next pipeline pass rate
+
33.6 min
next P50 · +8% in 3 weeks
+
+
source: ci.aztec-labs.com metrics.db + BigQuery · 5,136 tracked runs · Jan 21–Feb 23, 2026
+
+ + +
+
§1 spend · overview
+

$6,107 AWS CI EC2 in 34 days — network is the cost hotspot at 30% of spend, 18.4% spot

+
Jan 21–Feb 23 · metrics.db cost_usd · CI pipelines only (GCP on next slide)
+
+
+
+
+
+
+
network pipeline
+
$5.03/run avg
+
523 runs · only 15.5% spot · $2,632 total = 37% of all CI spend
+
+
+
prs pipeline
+
$0.72/run avg
+
1,864 runs · 94.4% spot · $1,340 total · most cost-efficient pipeline
+
+
Network spot rate 15.5% vs 87–99% everywhere else. Spot costs ~86% less than on-demand. Network pipeline is the single largest AWS cost lever.
+
+
+
+ + +
+
§1 spend · full picture
+

$16K/month total: $9K dev/testing across both clouds, $6K always-on production

+
AWS CI EC2 $6,107 (34d · metrics.db) · AWS operational $22,738 (34d · Cost Explorer, ex-tax, ex-contract) · GCP $9,972 (30d)
+
+
+
+
+
+
+
+
+
+
GCP production clusters
+
$6,379/mo
+
testnet $997 · next-net $859 · mainnet $816 · staging $792 · fisherman $334. Always-on regardless of CI activity.
+
+
$6,107 = CI run EC2 tracked in metrics.db — what we directly pay per CI job. Gap to $16,221 total EC2 bill = always-on infra (NAT, VPC, bastion, EFS) not tagged as CI runs. $22,738 = full AWS operational (Cost Explorer, 34d): EC2 $16,221 + CloudFront $1,829 + EC2-Other $2,177 + ElastiCache $637 + misc $1,874. Excluded: $17,520 annual Savings Plan commitment (one-time contract) + $6,172 tax → $46,430 total AWS bill. GCP: $2,129 CI namespaces + $6,379 prod clusters (always-on, CI-independent).
+
+
+
+ + +
+
§1 spend · weekly trend
+

W04 network anomaly: $999 in one week — Feb 14 instance change cut it 91%

+
Stacked weekly CI spend by pipeline · m6a.48xlarge eliminated from network CI Feb 14, replaced by m6a.4xlarge
+
+
+
+
+
+
+
W04 network (m6a.48xlarge, on-demand)
+
$9.79/run
+
192 vCPUs, on-demand pricing. Long-failing jobs during p2p/epoch instability. $999 in one week.
+
+
+
W07 network (m6a.4xlarge)
+
$0.85/run
+
$94 total for the week. Network tests no longer run on the 192-vCPU on-demand beast.
+
+
W07 total: $988 vs W06 peak $1,947. 49% weekly reduction driven by eliminating m6a.48xlarge from network tests. kind tests also moved to spot this period.
+
+
+
+ + +
+
§1 spend · spot discipline
+

Network pipeline: 15.5% spot — every other pipeline runs 87–99% spot

+
Spot instances cost ~86% less than on-demand at equivalent capacity. The gap is isolated to network.
+
+
+
+
+
+
Network at 15.5% spot: 442 of 523 runs were on-demand. At prs pipeline spot rates (94.4%) the network bill drops from $2,632 to ~$370. ~$2,200/month opportunity.
+
GCP spot migration: $1,397/month in 0%-spot namespaces (eth-mainnet, sepolia, monitoring). Ops complexity — preemption handling for archive nodes. See §4.
+
kind tests moved to spot (done). Network pipeline is the next highest-ROI action on AWS.
+
+
+
+ + +
+
§2 flake · weekly pattern
+

One cluster drove every spike — p2p/epoch cleared completely in W07

+
Jan 14–Feb 23 · stacked: p2p/epoch (orange) · other flakes (blue) · hard fails (red)
+
+
+
+
+
+
+
p2p/epoch cluster (W02–W06)
+
2,034 events
+
56 distinct tests · all classified as flakes (0 hard fails) · 747 events in W06 → 2 in W07
+
+
W06 (Feb 9): 515 hard fails — highest of any week. Three high-risk PRs merged same day (see slide 8).
+
W08 (Feb 23, partial): 36 events, 0 p2p/epoch. Too early to call — partial week, low volume.
+
+
+
+ + +
+
§2 flake · anatomy
+

2,034 labeled events across 56 tests — and 1,912 unlabeled failures behind them

+
The e2e-p2p-epoch-flakes cluster is tagged. Everything else is anonymous.
+
+
+
+
+
+
+
e2e-p2p-epoch-flakes · 2,034 events · 56 tests · 0 hard fails
+

Known cluster. All events classified as flakes — timing-driven, not code bugs. Cleared in W07 after Santiago PRs #20351 (mbps test fix) and #20462 (remove hardcoded 10s timeout), plus ludamad #20613 (CI parallelism). Root cause: PXE using latest (not checkpointed) block across distributed epoch transitions. Will recur.

+
+
+
Unlabeled · 1,912 events · 938 distinct tests · merge queue failures
+

These are failures in the merge queue (next pipeline). In the queue, any failure blocks a PR — whether it's a real bug, infrastructure noise, or a timing race. 938 distinct failing tests with no cluster assignment and no assigned owner. This is the long tail of test rot that grows with every feature sprint.

+
+
The labeled cluster cleared. The unlabeled tail didn't. 938 distinct failing tests is residual risk that accumulates without active maintenance.
+
+
+
+ + +
+
§2 flake · signal quality
+

New flakes keep appearing — the floor doesn't clear, and 4 grinds didn't catch W04

+
0.12% overall flake rate (6,632 flakes / 5.5M test runs) — low headline, lumpy reality
+
+
+
W04: the gate failed
+
+
valid_epoch_pruned_slash: 0 events W03 → 346 events W04
+

This test passed the 4-run deflake gate and was merged. In its first full week in CI it produced 346 fail/flake events — 33% of the entire W04 spike. Similarly, tx_proposal_collector (180 events) and inactivity_slash_with_consequence (74 events) debuted in W04 with immediate instability. All passed the gate.

+
+
+
team-alpha now running 10 grinds
+

A stricter informal standard adopted by one team. Not yet universal policy. Reduces risk of introducing flaky tests but increases CI time per new test by 2.5×. Proposal: targeted deflake — instrument tests for determinism before setting a grind count. Outstanding from offsite.

+
+
+
+
W07: new regressions after p2p fix
+
+
profile_gates.test.ts (12 events) + compile.test.ts (11 events)
+

Both appeared for the first time in W07 — the same week the p2p cluster cleared. CLI-layer tests, hash-prefixed job context. Not present in W06. New regressions introduced during the W07 fix sprint, or surface area exposed once p2p noise was removed.

+
+
+
~7% of CI run failures are genuine code bugs
+

Most failures are infrastructure noise (nightly 61% quick-fail) or p2p timing (labeled cluster). Merge-queue failures often behave like flakes regardless of their root cause — they block PRs that may be perfectly correct. New failing tests appear every week; old ones aren't fully removed.

+
+
+
+
+ + +
+
§2 flake · PR correlation
+

What caused the spikes — and who fixed them

+
Attribution via ci_runs × pr_authors. Santiago Palladino: 18+ fix PRs in 6 weeks.
+
+
+
spike causes
+
+ +
+
W02 — Jan 13 · 2,072 flakes
+
spalladino refactors (e2e setup, archiver, test setup). Changed patterns exposing latent timing races across p2p/epoch simultaneously. Tipping point for existing instability.
+
+
+
+ +
+
W04 — Jan 26 · 935 flakes
+
PhilWindle added cross-chain mbps e2e tests without pre-deflaking. valid_epoch_pruned_slash: 0→346 events. tx_proposal_collector: 180 events. inactivity_slash_with_consequence: 74 events. All absent the prior week.
+
+
+
+ +
+
W06 — Feb 9 · 850 flakes + 515 hard fails (worst week)
+
Three high-risk PRs same day: #20047 peer scoring (15:27, mrzeszutko), #20241 max checkpoints→32 (11:21), #20257 hash constants (14:38, LeilaWang). Spanning p2p + epoch config + consensus hashes — the exact subsystems where all flakes live.
+
+
+
+
+
fixes that worked
+
+ +
+
W03 — Jan 23 · −57% · root fix
+
spalladino: checkpointed chain tip for PXE. PXE was using latest (not checkpointed) block, causing epoch boundary races. 6 of 7 CI runs passed cleanly. Most impactful single fix of the period.
+
+
+
+ +
+
W05 — Feb 3–5 · −33%
+
spalladino #20088 slasher multi-block handling. PhilWindle #20140 deflake discv5. 20+ CI runs over 2 days before clean merge.
+
+
+
+ +
+
W07 — Feb 10–17 · −65% · p2p cluster cleared
+
spalladino #20351 fix mbps chain test (p2p_client 311→0 flakes). #20462 remove hardcoded 10s timeout. ludamad #20613 CI parallelism — insufficient workers caused timeout cascades misclassified as flakes.
+
+
+
+
+
Pattern: every spike triggered by new unstable tests or multi-subsystem config changes without pre-deflaking. Will recur without structural change.
+
+ + +
+
§2 flake · maintenance cost
+

30 fix/deflake PRs in 34 days — nearly one per day of ongoing maintenance overhead

+
PRs with "fix/flake/deflake" in title or branch · Jan 14–Feb 23 · engineers patching unstable tests continuously
+
+
+
recent (Feb)
+
+
#20636spypsy · Feb 18
+
#20609alexghr · Feb 17
+
#20439ludamad · Feb 12
+
#20335spypsy · Feb 10
+
#20330ludamad · Feb 10
+
#19322charlielye · Feb 10
+
#20246mverzilli · Feb 6
+
#20243mverzilli · Feb 6
+
#20240mverzilli · Feb 6
+
#20215ludamad · Feb 5
+
#20160mverzilli · Feb 4
+
#20095suyash67 · Feb 4
+
#20140PhilWindle · Feb 3
+
#20131alexghr · Feb 3
+
#20119alexghr · Feb 3
+
#20115danielntmd · Feb 2
+
+
+
+
earlier (Jan)
+
+
#20090mralj · Jan 30
+
#20070mralj · Jan 30
+
#20068mralj · Jan 30
+
#20040mralj · Jan 29
+
#20024spalladino · Jan 28
+
#20004spalladino · Jan 28
+
#19952ludamad · Jan 26
+
#19910spalladino · Jan 23
+
#19782alexghr · Jan 21
+
#19767spalladino · Jan 20
+
#19705PhilWindle · Jan 19
+
#19618alexghr · Jan 15
+
#19588spalladino · Jan 15
+
#19580PhilWindle · Jan 14
+
+
+
+
+
deflake maintenance rate
+
30
+
PRs in 34 days = 0.88/day. 8 distinct contributors. spalladino (6), alexghr (5), ludamad (4), mverzilli (4), mralj (4), PhilWindle (3).
+
+
This is the hidden cost of test rot — not paid in dollars but in senior engineer hours. Each deflake PR is an interrupt to feature work.
+
+
+
+ + +
+
§3 health · pass rates
+

next at 92% — nightly and network stuck at 58%, different failure modes

+
Jan 21–Feb 23 · ci_run_daily_stats · 4,870 total runs across 5 pipelines
+
+
+
+
+
+
+
nightly (573 runs)
+
58.1%
+
61% of failures complete in under 5 min — single job, crashes at startup before tests run. Environment instability, not code.
+
+
+
network (450 runs)
+
58.0%
+
9.5% quick-fail, 87.9min avg fail duration. Network runs often partially pass — a FAIL may have completed 80% of its test suite before one test times out.
+
+
Nightly failures are environment crashes (startup), not code regressions. Network failures are long-running timeouts — pass rate understates how much actually succeeds.
+
+
+
+ + +
+
§3 health · failure anatomy
+

61% of nightly failures complete in under 5 minutes — infra crashes, not code

+
Quick-fail proxy: FAILED runs completing in <5 min. Heuristic — some short unit tests legitimately fail fast.
+
+
+
+
+
+
nightly: 60.8% quick-fail (avg 5.2min to failure). Single job type (next), bailing at startup. The pipeline is crashing on environment setup — likely a Docker pull, secret mount, or k8s scheduling failure — before any test code runs.
+
prs: 30.3% quick-fail (168 of 555 failures). 1 in 3 PR CI failures is infrastructure, not the engineer's code. Significant developer friction that inflates apparent failure rates.
+
next: 9.0% quick-fail — next failures are mostly genuine test failures. The highest-signal pipeline for code quality.
+
+
+
+ + +
+
§3 health · build time
+

next P50 +8% in 3 weeks — P95 up 22%, tail latency accelerating

+
W05=Feb 2 · W06=Feb 9 · W07=Feb 16 · individual ci_runs PASSED records · exact percentiles
+
+
+
+
+
+
+
P95 growth (3 weeks)
+
+22%
+
45.3min (W05) → 55.1min (W07). At this rate: P95 exceeds 70min by May.
+
+
+
P50 growth (3 weeks)
+
+8%
+
31.0min (W05) → 33.6min (W07). Compounded across 1,100+ next runs/month.
+
+
W08 early data (Feb 23, 10 runs only): P50=32.9, P95=34.5. Too small a sample to conclude. No identified root cause for the growth trend.
+
+
+
+ + +
+
§3 health · merge queue
+

63.5% merge queue success — W04 bottomed at 57.6%, W07 still only 59.5%

+
712 total attempts · 452 successes · 251 failures · Jan 21–Feb 23
+
+
+
+
+
+
+
34-day success rate
+
63.5%
+
1 in 3 merge attempts needs a retry. Target for a healthy queue: 85%+.
+
+
W07 (59.5%) is worse than W06 (69.6%) despite the p2p fix. Hard test failures replaced flakes as merge blockers — the queue improved in one dimension and degraded in another.
+
W04 worst week: 91 failures out of 217 attempts (57.6%). Coincided with the p2p/epoch spike and the new unstable test introductions.
+
+
+
+ + +
+
§3 health · scheduling patterns
+

Tuesday 72% vs Thursday 82% — the Monday queue effect is measurable

+
Jan 21–Feb 23 · all pipelines · 4,870 runs · 9.5pp spread between worst and best weekday
+
+
+
+
+
+
Mon/Tue dip: PRs accumulate over the weekend and all enter the merge queue simultaneously Monday morning. Pipeline contention drives higher failure rates. By Thursday the queue has cleared and pass rates recover.
+
9.5pp spread (Tue 72.3% → Thu 81.8%). Thursday–Friday is measurably safer for large feature merges. Batch rollouts on Monday are higher risk.
+
+
+
+ + +
+
§4 GCP · optimization
+

$1,397/month in 0%-spot namespaces — ops complexity vs savings is the open question

+
Five GCP namespaces running 100% on-demand at 86% spot discount foregone · Jan 21–Feb 20
+
+
+
+
+
+
+
Monthly savings potential
+
$1,397/mo
+
eth-mainnet + sepolia ($800) · monitoring ($599). At 86% spot discount, matching cluster-wide spot rate.
+
+
eth-mainnet and sepolia: Ethereum archive nodes, 80% memory utilization. Spot preemption means re-syncing from chain tip. Engineering cost to handle graceful preemption needs evaluation.
+
metrics + public-telemetry: Prometheus/Grafana. Spot-tolerant with PVC persistence — a platform ops task, not a code change. Lower risk than archive nodes.
+
+
+
+ + +
+
§5 actions · done
+

Three improvements landed in February

+
Concrete changes with measured impact on cost and flake counts.
+
+
+
cost — instance type
+
+
m6a.48xlarge → m6a.4xlarge (Feb 14)
+

The 192-vCPU on-demand instance running network tests is gone. At W04's failure rate it cost $9.79/run and $999 in a single week. m6a.4xlarge costs $0.85/run. W07 network spend: $94 vs W04's $999. 91% per-run cost reduction. Network tests no longer run on an oversized on-demand machine. Done.

+
+
+
kind tests moved to spot
+

kind test runs moved from on-demand to spot instances. ~86% cost reduction on affected runs. Done.

+
+
+
+
flake — p2p/epoch cleared
+
+
e2e-p2p-epoch-flakes: 2,034 → ~0 events
+

The cluster that drove every spike since Jan 6 cleared in W07. Two p2p/epoch events in W07 vs 747 in W06. W08: zero p2p events so far.

+
+
+
Santiago Palladino: 18+ fix PRs (Jan–Feb)
+

#19914 checkpointed PXE chain tip (root fix), #20088 slasher multi-block, #20351 mbps chain test (311→0 flakes), #20462 remove hardcoded 10s timeout. ludamad #20613 CI parallelism fix.

+
+
+
+
open — not yet resolved
+
+
p2p cluster root cause
+

Cleared via targeted fixes, not architectural resolution. PXE epoch boundary races and test isolation issues remain. Will recur when new p2p/epoch features merge. No assigned owner for permanent fix.

+
+
+
Build time creep unresolved
+

next P50 +8%, P95 +22% in 3 weeks. No identified cause. At current rate P95 exceeds 70 min by May.

+
+
+
+
+ + +
+
§5 actions · in progress
+

The deflake gate exists — but W04 proved 4 grinds isn't enough for epoch/slashing tests

+
Outstanding from offsite: reassess deflake strategy. team-alpha already at 10 grinds.
+
+
+
current state
+
+
4-run deflake gate (universal, always existed)
+

All new e2e tests must pass 4 consecutive CI runs before merging. Has been standard for some time. Cost: ~$11 in deflake CI spend over 34 days — not a cost constraint.

+
+
+
W04 evidence: the gate failed
+

valid_epoch_pruned_slash passed 4 grinds and produced 346 events in its first week. tx_proposal_collector (180 events) and inactivity_slash_with_consequence (74 events) debuted similarly. The gate is not filtering unstable epoch/slashing tests adequately.

+
+
+
team-alpha: 10 grinds
+

Informal higher standard on one team. Not yet universal. Reduces risk but increases CI wait time by 2.5× per new test.

+
+
+
+
options
+
+
Option A: raise blanket gate to 10 grinds
+

Universal. Simple to enforce. team-alpha already doing this. Tradeoff: 2.5× CI cost per new e2e test; developers wait longer before merge. Not evidence-based — fast-passing tests still get 10 runs.

+ A-533 · set up hard grinding tests in spartan merge train +
+
+
Option B: targeted deflake (proposed)
+

Analyze test pass rate distribution over N runs. Flag tests above a variance threshold. Grind count becomes adaptive: stable tests get 4, timing-sensitive tests get 20+. More surgical, higher tooling cost to implement.

+
+
+
Decision needed from offsite follow-up
+

Who owns the deflake gate upgrade? What is the target flake rate threshold before a test is considered acceptable for CI merge?

+
+
+
+
+ + +
+
§5 actions · open questions
+

Three questions for discussion

+
Not answered here — require team input, owner assignment, or cost-benefit analysis.
+
+
+
deflake strategy
+
+
Targeted deflake vs blanket grinds?
+

team-alpha doing 10 grinds — should this become universal? Or build adaptive tooling that sets the grind count per-test based on variance? Who owns the implementation and the policy?

+ A-533 +
+
Evidence: W04 proved 4 isn't enough for epoch/slashing tests. 10 is pragmatic but not evidence-based. Targeted deflake is surgical but requires tooling investment.
+
+
+
GCP spot migration
+
+
$1,397/month savings — worth the ops complexity?
+

eth-mainnet and sepolia archive nodes: spot preemption means re-syncing from chain tip. metrics and public-telemetry: lower risk, Prometheus/Grafana with PVC persistence. What's the engineering cost to make each namespace preemption-safe?

+
+
AWS network pipeline spot migration (~$2,200/mo) is likely the easier, higher-ROI first step — no archive node complexity.
+
+
+
p2p/epoch ownership
+
+
Who owns the cluster to permanent resolution?
+

The cluster cleared through 18+ targeted fix PRs. Root cause — PXE epoch boundary races, test isolation, distributed timing — is not architecturally resolved. Next p2p feature merge will likely trigger another spike without a named owner and explicit SLO.

+
+
Without assignment: the next spike will be diagnosed reactively, same as this one. Expected recurrence: within 1–2 major p2p feature merges.
+
+
+
+ +
+ +
1 / 19
+ + + + + diff --git a/ci3/ci-metrics/views/ci-insights.html b/ci3/ci-metrics/views/ci-insights.html index 533b6bfb62cd..bb483af6f1d9 100644 --- a/ci3/ci-metrics/views/ci-insights.html +++ b/ci3/ci-metrics/views/ci-insights.html @@ -22,6 +22,15 @@ .msg { color:#888; padding:8px 0; } .msg.err { color:#f85149; } + /* Tabs */ + .tabs { display:flex; gap:0; margin:12px 0 0 0; } + .tab { background:#111; border:1px solid #333; border-bottom:none; color:#888; + font-family:monospace; font-size:13px; padding:6px 16px; cursor:pointer; } + .tab:hover { color:#ccc; } + .tab.active { background:#0a0a0a; color:#fff; border-color:#58a6ff; border-bottom:1px solid #0a0a0a; position:relative; z-index:1; } + .tab-content { display:none; border:1px solid #333; border-top:1px solid #333; padding:12px; background:#0a0a0a; margin-top:-1px; } + .tab-content.active { display:block; } + /* KPI strip */ .kpi-strip { display:flex; gap:12px; margin:16px 0; flex-wrap:wrap; } .kpi { background:#0a0a0a; border:1px solid #222; padding:12px 16px; flex:1; min-width:180px; max-height:120px; overflow:hidden; } @@ -53,23 +62,41 @@ .amt { font-variant-numeric:tabular-nums; text-align:right; } th.amt { text-align:right; } .detail-scroll { max-height:500px; overflow:auto; } + .detail-table { width:100%; border-collapse:collapse; font-size:12px; } + .detail-table th { text-align:left; padding:4px 6px; border-bottom:1px solid #333; color:#888; white-space:nowrap; position:sticky; top:0; background:#0a0a0a; } + .detail-table td { padding:4px 6px; border-bottom:1px solid #111; white-space:nowrap; } + .detail-table .amt { text-align:right; font-variant-numeric:tabular-nums; } + .detail-table th.amt { text-align:right; } + .stats { margin:12px 0; color:#888; } + .stats span { color:#ccc; } + /* Test details */ + .cmd { max-width:500px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; } + .pass { color:#3fb950; } + .fail { color:#f85149; } + .flake { color:#d29922; } -

ci insights

+

ci insights

- + + - + + + + | @@ -82,71 +109,205 @@

ci insights

- - -
-
daily ci spend
--
-
cost / merge
--
-
mq success rate
--
-
flakes / day
--
-
prs merged / day
--
+
+
Overview
+
Attribution
- -
-
-

daily ci cost + 7-day rolling cost per merge

-
+ +
+ -
-

merge queue: daily outcomes + success rate

-
+ +
+
mq success rate
--
+
flakes / day
--
+
prs merged / day
--
+
avg mq duration
--
-
-

flakes + test failures per day

-
+ +
+
+

merge queue: daily outcomes + success rate

+
+
+
+

test outcomes per day

+
+
+
+

ci run duration by pipeline (avg mins)

+
+
+
+

total ci time by pipeline (hours)

+
+
+
+

P95 build time by pipeline & phase (secs)

+
+
+
+ +
+
+
top flaky tests
+
+ + + +
testflakesaffected
+
+
+
+
top failing tests
+
+ + + +
testfailuresaffected
+
+
-
- -
flakes by pipeline
-
- - - -
+
flakes by pipeline
+
+ + + +
+
+ + +
+
+ test details + | + + + + +
+
+
+
+
+
+

avg duration by day

+
+
+
+

test run count by day

+
+
+
+

tests by duration

+
+ + + + + + + + + + + + + + + + +
test commandrunsavg (s)min (s)max (s)total (h)pass %passedfailedflaked
+
+

slowest individual runs

+
+ + + + + + + + + + + + + +
test commandduration (s)statusdateauthorpipelinelog
+
- -
author ci profile
-
- - - -
+ +
+
+ +
+
+
+

ci cost by run type (time series)

+
+
+
+

cost by user (AWS + GCP)

+
+
+
+

cost by run type

+
+
+
+ +
author ci profile
+
+ + + +
+
+ +
instances
+
+ + + +
+
- + diff --git a/ci3/ci-metrics/views/commits.html b/ci3/ci-metrics/views/commits.html new file mode 100644 index 000000000000..03143adff905 --- /dev/null +++ b/ci3/ci-metrics/views/commits.html @@ -0,0 +1,345 @@ + + + + + Commits — aztec-packages + + + + + +

commits — next

+ +
+ + + + + + + + + + +
+ +
Loading…
+ + + + + + diff --git a/ci3/ci-metrics/views/cost-overview.html b/ci3/ci-metrics/views/cost-overview.html index 53424a2d2d70..da8d73e0b7f8 100644 --- a/ci3/ci-metrics/views/cost-overview.html +++ b/ci3/ci-metrics/views/cost-overview.html @@ -71,17 +71,23 @@ -

cost overview

+

cost overview

- - + + + + + + | @@ -98,7 +104,6 @@

cost overview

Overview
Resource Details
-
CI Attribution
@@ -136,35 +141,6 @@

aws vs gcp split

-
-
- -
-
-
-

ci cost by run type (time series)

-
-
-
-

cost by user (AWS + GCP)

-
-
-
-

cost by run type

-
-
-
-

instances

-
- - - -
-
-
+ + diff --git a/ci3/ci-metrics/views/test-timings.html b/ci3/ci-metrics/views/test-timings.html index 0bf6c7213bd6..63cc54bb3690 100644 --- a/ci3/ci-metrics/views/test-timings.html +++ b/ci3/ci-metrics/views/test-timings.html @@ -1,289 +1,7 @@ - - - ACI - Test Timings - - +Redirecting... - - -

test timings

- -
- - - - - | - - - | - - - | - - -
- -
loading...
- -
- -
-
-

avg duration by day

-
-
-
-

test run count by day

-
-
-
- -

tests by duration

-
- - - - - - - - - - - - - - - - -
test commandrunsavg (s)min (s)max (s)total (h)pass %passedfailedflaked
-
- -

slowest individual runs

-
- - - - - - - - - - - - - -
test commandduration (s)statusdateauthorpipelinelog
-
- - - + diff --git a/ci3/dashboard/Dockerfile b/ci3/dashboard/Dockerfile index 2da7805ffa83..cd2e5b1f9b1d 100644 --- a/ci3/dashboard/Dockerfile +++ b/ci3/dashboard/Dockerfile @@ -24,4 +24,4 @@ RUN pip install --no-cache-dir -r ci-metrics/requirements.txt RUN git config --global --add safe.directory /aztec-packages COPY . . EXPOSE 8080 8081 -CMD ["gunicorn", "-w", "100", "-b", "0.0.0.0:8080", "rk:app"] +CMD ["gunicorn", "-w", "50", "-b", "0.0.0.0:8080", "rk:app"] diff --git a/ci3/dashboard/requirements.txt b/ci3/dashboard/requirements.txt index 9c1526f5b7a8..f3e1e9c53c08 100644 --- a/ci3/dashboard/requirements.txt +++ b/ci3/dashboard/requirements.txt @@ -5,3 +5,4 @@ ansi2html Flask-Compress requests Flask-HTTPAuth +boto3 diff --git a/ci3/dashboard/rk.py b/ci3/dashboard/rk.py index aedf35a824e2..12b9a414077f 100644 --- a/ci3/dashboard/rk.py +++ b/ci3/dashboard/rk.py @@ -1,6 +1,8 @@ from flask import Flask, render_template_string, request, Response, redirect from flask_compress import Compress from flask_httpauth import HTTPBasicAuth +import boto3 +from botocore.exceptions import ClientError import gzip import json import os @@ -9,6 +11,7 @@ import shlex import subprocess import threading +import time as _time import uuid from ansi2html import Ansi2HTMLConverter from pathlib import Path @@ -19,6 +22,10 @@ hyperlink, r, get_section_data, get_list_as_string ) LOGS_DISK_PATH = os.getenv('LOGS_DISK_PATH', '/logs-disk') +S3_LOGS_BUCKET = os.getenv('S3_LOGS_BUCKET', 'aztec-ci-artifacts') +S3_LOGS_PREFIX = os.getenv('S3_LOGS_PREFIX', 'logs') + +_s3 = boto3.client('s3', region_name='us-east-2') DASHBOARD_PASSWORD = os.getenv('DASHBOARD_PASSWORD', 'password') CI_METRICS_PORT = int(os.getenv('CI_METRICS_PORT', '8081')) CI_METRICS_URL = os.getenv('CI_METRICS_URL', f'http://localhost:{CI_METRICS_PORT}') @@ -27,37 +34,47 @@ Compress(app) auth = HTTPBasicAuth() -# Start the ci-metrics server as a subprocess -# Check sibling dir (repo layout) then subdirectory (Docker layout) +# 
Start the ci-metrics server as a subprocess (once across all workers). +# Uses a file lock so only the first gunicorn worker to import this module +# actually spawns the process; the rest skip silently. +import fcntl +import signal + _ci_metrics_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'ci-metrics') if not os.path.isdir(_ci_metrics_dir): _ci_metrics_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ci-metrics') if os.path.isdir(_ci_metrics_dir): - # Kill any stale process on the port (e.g. leftover from previous reload) - import signal + _lock_path = f'/tmp/ci-metrics-{CI_METRICS_PORT}.lock' try: - out = subprocess.check_output( - ['lsof', '-ti', f':{CI_METRICS_PORT}'], stderr=subprocess.DEVNULL, text=True) - for pid in out.strip().split('\n'): - if pid: - os.kill(int(pid), signal.SIGTERM) - import time; time.sleep(0.5) - except (subprocess.CalledProcessError, OSError): + _lock_fd = open(_lock_path, 'w') + fcntl.flock(_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + # We hold the lock — kill stale process and spawn fresh one + try: + out = subprocess.check_output( + ['lsof', '-ti', f':{CI_METRICS_PORT}'], stderr=subprocess.DEVNULL, text=True) + for pid in out.strip().split('\n'): + if pid: + os.kill(int(pid), signal.SIGTERM) + _time.sleep(0.5) + except (subprocess.CalledProcessError, OSError): + pass + _ci_metrics_env = {**os.environ, 'CI_METRICS_PORT': str(CI_METRICS_PORT)} + subprocess.Popen( + ['gunicorn', '-w', '1', '-b', f'0.0.0.0:{CI_METRICS_PORT}', + '--timeout', '120', 'app:app'], + cwd=_ci_metrics_dir, + env=_ci_metrics_env, + ) + print(f"[rk.py] ci-metrics server started on port {CI_METRICS_PORT}") + # Hold the lock until this process exits so other workers skip + except OSError: + # Another worker already holds the lock — nothing to do pass - _ci_metrics_env = {**os.environ, 'CI_METRICS_PORT': str(CI_METRICS_PORT)} - subprocess.Popen( - ['gunicorn', '-w', '4', '-b', f'0.0.0.0:{CI_METRICS_PORT}', '--timeout', '120', 
'app:app'], - cwd=_ci_metrics_dir, - env=_ci_metrics_env, - ) - print(f"[rk.py] ci-metrics server started on port {CI_METRICS_PORT}") def read_from_disk(key): - """Read log from disk as fallback when Redis key not found.""" + """Read log from disk.""" try: - # Use first 4 chars as subdirectory prefix = key[:4] - log_file = f"/logs-disk/{prefix}/{key}.log.gz" log_file = f"{LOGS_DISK_PATH}/{prefix}/{key}.log.gz" if os.path.exists(log_file): with gzip.open(log_file, 'rb') as f: @@ -66,6 +83,20 @@ def read_from_disk(key): print(f"Error reading from disk: {e}") return None +def read_from_s3(key): + """Read log from S3 (fallback when Redis and disk both miss).""" + try: + prefix = key[:4] + s3_key = f"{S3_LOGS_PREFIX}/{prefix}/{key}.log.gz" + obj = _s3.get_object(Bucket=S3_LOGS_BUCKET, Key=s3_key) + return gzip.decompress(obj['Body'].read()).decode('utf-8', errors='replace') + except ClientError as e: + if e.response['Error']['Code'] != 'NoSuchKey': + print(f"S3 error reading {key}: {e}") + except Exception as e: + print(f"Error reading from S3: {e}") + return None + def read_breakdown_from_disk(runtime, flow_name, sha): """Read benchmark breakdown JSON from disk.""" try: @@ -178,7 +209,6 @@ def root() -> str: f"{hyperlink('/cost-overview', 'cost overview (AWS + GCP)')}\n" f"{hyperlink('/namespace-billing', 'namespace billing')}\n" f"{hyperlink('/ci-insights', 'ci insights')}\n" - f"{hyperlink('/test-timings', 'test timings')}\n" f"{RESET}" ) @@ -528,32 +558,32 @@ def make_options(param_name, options, current_value, suffix=''): _proxy_session = requests.Session() _HOP_BY_HOP = frozenset([ 'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', - 'te', 'trailers', 'transfer-encoding', 'upgrade', 'content-length', - # `requests` auto-decompresses gzip responses, so Content-Encoding is - # stale — strip it so the browser doesn't try to decompress plain content. - # Flask-Compress on rkapp handles browser compression. 
- 'content-encoding', + 'te', 'trailers', 'transfer-encoding', 'upgrade', ]) -# Don't forward Accept-Encoding — let `requests` negotiate with ci-metrics -# (it adds its own and auto-decompresses). -_STRIP_REQUEST_HEADERS = frozenset(['host', 'accept-encoding']) +_STRIP_REQUEST_HEADERS = frozenset(['host']) def _proxy(path): - """Forward request to ci-metrics, streaming the response back.""" + """Forward request to ci-metrics, streaming the response back. + + Passes the browser's Accept-Encoding through to ci-metrics so it + compresses directly for the browser. We stream the raw (still + compressed) bytes back without decompression. + """ url = f'{CI_METRICS_URL}/{path.lstrip("/")}' try: + fwd_headers = {k: v for k, v in request.headers if k.lower() not in _STRIP_REQUEST_HEADERS} resp = _proxy_session.request( method=request.method, url=url, params=request.args, data=request.get_data(), - headers={k: v for k, v in request.headers if k.lower() not in _STRIP_REQUEST_HEADERS}, + headers=fwd_headers, stream=True, - timeout=60, + timeout=180, ) - # Strip hop-by-hop headers + # Stream raw bytes (skip requests auto-decompression) headers = {k: v for k, v in resp.headers.items() if k.lower() not in _HOP_BY_HOP} - return Response(resp.iter_content(chunk_size=8192), + return Response(resp.raw.stream(8192), status=resp.status_code, headers=headers) except Exception as e: return Response(json.dumps({'error': f'ci-metrics unavailable: {e}'}), @@ -564,10 +594,13 @@ def _proxy(path): @app.route('/ci-insights') @app.route('/cost-overview') @app.route('/test-timings') +@app.route('/ci-health-report') +@app.route('/flake-prs') @auth.login_required def proxy_dashboard(): return _proxy(request.path) + @app.route('/api/', methods=['GET', 'POST', 'PUT', 'DELETE']) @auth.login_required def proxy_api(path): @@ -583,11 +616,13 @@ def get_value(key): value = r.get(key) if value is None: - # Try disk fallback value = read_from_disk(key) - if value is None: - value = "Key not found" - else: + 
if value is None: + value = read_from_s3(key) + if value is None: + value = "Key not found" + elif isinstance(value, bytes): + # Redis returns raw bytes — decompress if gzip. try: if value.startswith(b"\x1f\x8b"): value = gzip.decompress(value).decode()