diff --git a/.github/workflows/ci3.yml b/.github/workflows/ci3.yml index a706e33fd021..8dbd61bd090c 100644 --- a/.github/workflows/ci3.yml +++ b/.github/workflows/ci3.yml @@ -88,6 +88,7 @@ jobs: PR_COMMITS: ${{ github.event.pull_request.commits }} PR_NUMBER: ${{ github.event.pull_request.number }} GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_ACTOR: ${{ github.actor }} # NOTE: $CI_MODE is set in the Determine CI Mode step. run: ./.github/ci3.sh $CI_MODE diff --git a/ci3/aws_request_instance_type b/ci3/aws_request_instance_type index 5f6aafbe4dd7..a48c5b58c241 100755 --- a/ci3/aws_request_instance_type +++ b/ci3/aws_request_instance_type @@ -86,6 +86,21 @@ if [ -z "${iid:-}" -o "${iid:-}" == "None" ]; then echo $iid > $iid_path fi +tags="Key=Name,Value=$name Key=Group,Value=build-instance" +[ -n "${GITHUB_ACTOR:-}" ] && tags+=" Key=GithubActor,Value=$GITHUB_ACTOR" +[ -n "${CI_MODE:-}" ] && tags+=" Key=CICommand,Value=$CI_MODE" +[ -n "${CI_DASHBOARD:-}" ] && tags+=" Key=Dashboard,Value=$CI_DASHBOARD" +if [ "${UNSAFE_AWS_KEEP_ALIVE:-0}" -eq 1 ]; then + echo_stderr "You have set UNSAFE_AWS_KEEP_ALIVE=1, so the instance will not be terminated after 1.5 hours by the reaper script. Make sure you shut the machine down when done." + tags+=" Key=Keep-Alive,Value=true" +fi +aws ec2 create-tags --resources $iid --tags $tags + +# Record the instance type so callers can pass it downstream (e.g. into Docker). +echo $instance_type > $state_dir/instance_type +# Record whether this is spot or on-demand. 
+[ -f "$sir_path" ] && echo spot > $state_dir/spot || echo ondemand > $state_dir/spot + while [ -z "${ip:-}" ]; do sleep 1 ip=$(aws ec2 describe-instances \ diff --git a/ci3/bootstrap_ec2 b/ci3/bootstrap_ec2 index a24f0cfc177b..eeffb180d390 100755 --- a/ci3/bootstrap_ec2 +++ b/ci3/bootstrap_ec2 @@ -89,6 +89,8 @@ if [[ -f "$state_dir/sir" ]]; then sir=$(cat $state_dir/sir) fi iid=$(cat $state_dir/iid) +export EC2_INSTANCE_TYPE=$(cat $state_dir/instance_type 2>/dev/null || echo "unknown") +export EC2_SPOT=$(cat $state_dir/spot 2>/dev/null || echo "unknown") # If AWS credentials are not set, try to load them from ~/.aws/build_instance_credentials. if [ -z "${AWS_ACCESS_KEY_ID:-}" ] || [ -z "${AWS_SECRET_ACCESS_KEY:-}" ]; then @@ -192,16 +194,6 @@ container_script=$( log_ci_run FAILED \$ci_log_id merge_train_failure_slack_notify \$ci_log_id release_canary_slack_notify \$ci_log_id - ci_failed_data=\$(jq -n \\ - --arg status "failed" \\ - --arg log_id "\$ci_log_id" \\ - --arg ref_name "\${TARGET_BRANCH:-\$REF_NAME}" \\ - --arg commit_hash "\$COMMIT_HASH" \\ - --arg commit_author "\$COMMIT_AUTHOR" \\ - --arg commit_msg "\$COMMIT_MSG" \\ - --argjson exit_code "\$code" \\ - '{status: \$status, log_id: \$log_id, ref_name: \$ref_name, commit_hash: \$commit_hash, commit_author: \$commit_author, commit_msg: \$commit_msg, exit_code: \$exit_code, timestamp: now | todate}') - redis_publish "ci:run:failed" "\$ci_failed_data" ;; esac exit \$code @@ -331,6 +323,9 @@ function run { -e AWS_TOKEN=\$aws_token \ -e NAMESPACE=${NAMESPACE:-} \ -e NETWORK=${NETWORK:-} \ + -e GITHUB_ACTOR=${GITHUB_ACTOR:-} \ + -e EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-unknown} \ + -e EC2_SPOT=${EC2_SPOT:-unknown} \ --pids-limit=65536 \ --shm-size=2g \ aztecprotocol/devbox:3.0 bash -c $(printf '%q' "$container_script") diff --git a/ci3/ci-metrics/Dockerfile b/ci3/ci-metrics/Dockerfile new file mode 100644 index 000000000000..4013545da66d --- /dev/null +++ b/ci3/ci-metrics/Dockerfile @@ -0,0 +1,11 @@ +FROM 
python:3.12 + +RUN apt update && apt install -y jq redis-tools && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt gunicorn +RUN git config --global --add safe.directory /aztec-packages +COPY . . +EXPOSE 8081 +CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:8081", "app:app"] diff --git a/ci3/ci-metrics/app.py b/ci3/ci-metrics/app.py new file mode 100644 index 000000000000..c62875e7d19a --- /dev/null +++ b/ci3/ci-metrics/app.py @@ -0,0 +1,848 @@ +from flask import Flask, request, Response, redirect +from flask_compress import Compress +from flask_httpauth import HTTPBasicAuth +from datetime import datetime, timedelta +import json +import os +import re +import redis +import threading +from pathlib import Path + +import db +import metrics +import github_data +import billing.aws as billing_aws +from billing import ( + get_billing_files_in_range, + aggregate_billing_weekly, aggregate_billing_monthly, + serve_billing_dashboard, +) + +REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_PORT = int(os.getenv('REDIS_PORT', '6379')) +LOGS_DISK_PATH = os.getenv('LOGS_DISK_PATH', '/logs-disk') +DASHBOARD_PASSWORD = os.getenv('DASHBOARD_PASSWORD', 'password') + +r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=False) + +app = Flask(__name__) +Compress(app) +auth = HTTPBasicAuth() + + +@auth.verify_password +def verify_password(username, password): + return password == DASHBOARD_PASSWORD + + +def _init(): + """Initialize SQLite and start background threads.""" + try: + db.get_db() + metrics.start_test_listener(r) + metrics.start_ci_run_sync(r) + print("[ci-metrics] Background threads started") + except Exception as e: + print(f"[ci-metrics] Warning: startup failed: {e}") + +threading.Thread(target=_init, daemon=True, name='metrics-init').start() + + +# ---- Helpers ---- + +def _aggregate_dates(by_date_list, granularity, sum_fields, avg_fields=None): + """Aggregate a list of 
{date, ...} dicts by weekly/monthly granularity.""" + if granularity == 'daily' or not by_date_list: + return by_date_list + + buckets = {} + for entry in by_date_list: + d = datetime.strptime(entry['date'], '%Y-%m-%d') + if granularity == 'weekly': + key = (d - timedelta(days=d.weekday())).strftime('%Y-%m-%d') + else: # monthly + key = d.strftime('%Y-%m') + '-01' + + if key not in buckets: + buckets[key] = {'date': key} + for f in sum_fields: + buckets[key][f] = 0 + if avg_fields: + for f in avg_fields: + buckets[key][f'_avg_sum_{f}'] = 0 + buckets[key][f'_avg_cnt_{f}'] = 0 + + for f in sum_fields: + buckets[key][f] += entry.get(f) or 0 + if avg_fields: + for f in avg_fields: + val = entry.get(f) + if val is not None: + buckets[key][f'_avg_sum_{f}'] += val + buckets[key][f'_avg_cnt_{f}'] += 1 + + result = [] + for key in sorted(buckets): + b = buckets[key] + out = {'date': b['date']} + for f in sum_fields: + out[f] = round(b[f], 2) if isinstance(b[f], float) else b[f] + if avg_fields: + for f in avg_fields: + cnt = b[f'_avg_cnt_{f}'] + out[f] = round(b[f'_avg_sum_{f}'] / cnt, 1) if cnt else None + result.append(out) + + return result + + +def _json(data): + return Response(json.dumps(data), mimetype='application/json') + + +# ---- Namespace billing ---- + +@app.route('/namespace-billing') +@auth.login_required +def namespace_billing(): + html = serve_billing_dashboard() + if html: + return html + return "Billing dashboard not found", 404 + + +@app.route('/api/billing/data') +@auth.login_required +def billing_data(): + date_from_str = request.args.get('from') + date_to_str = request.args.get('to') + granularity = request.args.get('granularity', 'daily') + + if not date_from_str or not date_to_str: + return _json({'error': 'from and to date params required (YYYY-MM-DD)'}), 400 + try: + date_from = datetime.strptime(date_from_str, '%Y-%m-%d') + date_to = datetime.strptime(date_to_str, '%Y-%m-%d') + except ValueError: + return _json({'error': 'Invalid date format, use 
YYYY-MM-DD'}), 400 + + daily_data = get_billing_files_in_range(date_from, date_to) + + # Filter out namespaces costing less than $1 total across the range + ns_totals = {} + for entry in daily_data: + for ns, ns_data in entry.get('namespaces', {}).items(): + ns_totals[ns] = ns_totals.get(ns, 0) + ns_data.get('total', 0) + cheap_ns = {ns for ns, total in ns_totals.items() if total < 1.0} + if cheap_ns: + for entry in daily_data: + entry['namespaces'] = {ns: d for ns, d in entry.get('namespaces', {}).items() + if ns not in cheap_ns} + + if granularity == 'weekly': + result = aggregate_billing_weekly(daily_data) + elif granularity == 'monthly': + result = aggregate_billing_monthly(daily_data) + else: + result = daily_data + + return _json(result) + + +# ---- CI runs ---- + +@app.route('/api/ci/runs') +@auth.login_required +def api_ci_runs(): + date_from = request.args.get('from', '') + date_to = request.args.get('to', '') + status_filter = request.args.get('status', '') + author = request.args.get('author', '') + dashboard = request.args.get('dashboard', '') + limit = min(int(request.args.get('limit', 100)), 1000) + offset = int(request.args.get('offset', 0)) + + ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) if date_from else None + ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) if date_to else None + + runs = metrics.get_ci_runs(r, ts_from, ts_to) + + if status_filter: + runs = [run for run in runs if run.get('status') == status_filter] + if author: + runs = [run for run in runs if run.get('author') == author] + if dashboard: + runs = [run for run in runs if run.get('dashboard') == dashboard] + + runs.sort(key=lambda x: x.get('timestamp', 0), reverse=True) + runs = runs[offset:offset + limit] + + return _json(runs) + + +@app.route('/api/ci/stats') +@auth.login_required +def api_ci_stats(): + ts_from = int((datetime.now() - timedelta(days=7)).timestamp() * 1000) + runs = metrics.get_ci_runs(r, 
ts_from) + + total = len(runs) + passed = sum(1 for run in runs if run.get('status') == 'PASSED') + failed = sum(1 for run in runs if run.get('status') == 'FAILED') + costs = [run['cost_usd'] for run in runs if run.get('cost_usd') is not None] + durations = [] + for run in runs: + complete = run.get('complete') + ts = run.get('timestamp') + if complete and ts: + durations.append((complete - ts) / 60000.0) + + return _json({ + 'total_runs': total, + 'passed': passed, + 'failed': failed, + 'total_cost': round(sum(costs), 2) if costs else None, + 'avg_duration_mins': round(sum(durations) / len(durations), 1) if durations else None, + }) + + +# ---- Cost endpoints ---- + +@app.route('/api/costs/overview') +@auth.login_required +def api_costs_overview(): + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + granularity = request.args.get('granularity', 'daily') + result = billing_aws.get_costs_overview(date_from, date_to) + if granularity != 'daily' and result.get('by_date'): + buckets = {} + for entry in result['by_date']: + d = datetime.strptime(entry['date'], '%Y-%m-%d') + if granularity == 'weekly': + key = (d - timedelta(days=d.weekday())).strftime('%Y-%m-%d') + else: + key = d.strftime('%Y-%m') + '-01' + if key not in buckets: + buckets[key] = {'date': key, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0} + for cat, amt in entry.get('aws', {}).items(): + buckets[key]['aws'][cat] = buckets[key]['aws'].get(cat, 0) + amt + for cat, amt in entry.get('gcp', {}).items(): + buckets[key]['gcp'][cat] = buckets[key]['gcp'].get(cat, 0) + amt + buckets[key]['aws_total'] += entry.get('aws_total', 0) + buckets[key]['gcp_total'] += entry.get('gcp_total', 0) + result['by_date'] = sorted(buckets.values(), key=lambda x: x['date']) + return _json(result) + + +@app.route('/api/costs/details') +@auth.login_required +def api_costs_details(): + """Per-resource 
(USAGE_TYPE) cost breakdown.""" + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + + rows = billing_aws.get_aws_cost_details(date_from, date_to) + + usage_map = {} + for row in rows: + ut = row['usage_type'] + if ut not in usage_map: + usage_map[ut] = { + 'usage_type': ut, + 'service': row['service'], + 'category': row['category'], + 'total': 0, + 'by_date': {}, + 'is_ri': 'HeavyUsage' in ut, + } + usage_map[ut]['total'] += row['amount_usd'] + d = row['date'] + usage_map[ut]['by_date'][d] = usage_map[ut]['by_date'].get(d, 0) + row['amount_usd'] + + items = sorted(usage_map.values(), key=lambda x: -x['total']) + for item in items: + item['total'] = round(item['total'], 2) + item['by_date'] = {d: round(v, 4) for d, v in sorted(item['by_date'].items())} + + all_dates = sorted({row['date'] for row in rows}) + ri_items = [i for i in items if i['is_ri']] + ri_total = round(sum(i['total'] for i in ri_items), 2) + + return _json({ + 'items': items, + 'dates': all_dates, + 'ri_total': ri_total, + 'grand_total': round(sum(i['total'] for i in items), 2), + }) + + +@app.route('/api/costs/attribution') +@auth.login_required +def api_costs_attribution(): + """CI cost attribution by user, branch, instance.""" + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) + ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) + + runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs_with_cost = [run for run in runs if run.get('cost_usd') is not None] + + # Enrich merge queue runs with PR author from GitHub + pr_numbers = {run.get('pr_number') for run in runs_with_cost if run.get('pr_number')} + pr_authors = 
github_data.batch_get_pr_authors(pr_numbers) + + granularity = request.args.get('granularity', 'daily') + + instances = [] + by_user = {} + by_branch = {} + by_type = {} + by_date_type = {} + + for run in runs_with_cost: + info = billing_aws.decode_branch_info(run) + cost = run['cost_usd'] + date = metrics._ts_to_date(run.get('timestamp', 0)) + + author = info['author'] + prn = info['pr_number'] + if prn and int(prn) in pr_authors: + author = pr_authors[int(prn)]['author'] + + inst_type = run.get('instance_type', 'unknown') + vcpus = run.get('instance_vcpus') + if inst_type == 'unknown' and vcpus: + inst_type = f'{vcpus}vcpu' + + instances.append({ + 'instance_name': info['instance_name'], + 'date': date, + 'cost_usd': cost, + 'author': author, + 'branch': info['branch'], + 'pr_number': prn, + 'type': info['type'], + 'instance_type': inst_type, + 'spot': run.get('spot', False), + 'job_id': run.get('job_id', ''), + 'duration_mins': round((run.get('complete', 0) - run.get('timestamp', 0)) / 60000, 1) if run.get('complete') else None, + }) + + if author not in by_user: + by_user[author] = {'aws_cost': 0, 'gcp_cost': 0, 'runs': 0, 'by_date': {}} + by_user[author]['aws_cost'] += cost + by_user[author]['runs'] += 1 + by_user[author]['by_date'][date] = by_user[author]['by_date'].get(date, 0) + cost + + branch_key = info['branch'] or info['type'] + if branch_key not in by_branch: + by_branch[branch_key] = {'cost': 0, 'runs': 0, 'type': info['type'], 'author': author} + by_branch[branch_key]['cost'] += cost + by_branch[branch_key]['runs'] += 1 + + rt = info['type'] + if rt not in by_type: + by_type[rt] = {'cost': 0, 'runs': 0} + by_type[rt]['cost'] += cost + by_type[rt]['runs'] += 1 + + if date not in by_date_type: + by_date_type[date] = {} + by_date_type[date][rt] = by_date_type[date].get(rt, 0) + cost + + # GCP costs — reported as total, no namespace→user heuristic + gcp_total = 0 + try: + from billing.gcp import get_billing_files_in_range as get_gcp_billing + gcp_data = 
get_gcp_billing( + datetime.strptime(date_from, '%Y-%m-%d'), + datetime.strptime(date_to, '%Y-%m-%d'), + ) + for entry in gcp_data: + for ns, ns_data in entry.get('namespaces', {}).items(): + gcp_total += ns_data.get('total', 0) + except Exception as e: + print(f"[attribution] GKE billing error: {e}") + + # Sort and format + user_list = [{'author': a, 'aws_cost': round(v['aws_cost'], 2), 'gcp_cost': round(v['gcp_cost'], 2), + 'total_cost': round(v['aws_cost'] + v['gcp_cost'], 2), 'runs': v['runs'], + 'by_date': {d: round(c, 2) for d, c in sorted(v['by_date'].items())}} + for a, v in sorted(by_user.items(), key=lambda x: -(x[1]['aws_cost'] + x[1]['gcp_cost']))] + + branch_list = [{'branch': b, 'cost': round(v['cost'], 2), 'runs': v['runs'], + 'type': v['type'], 'author': v['author']} + for b, v in sorted(by_branch.items(), key=lambda x: -x[1]['cost'])[:100]] + + type_list = [{'type': t, 'cost': round(v['cost'], 2), 'runs': v['runs']} + for t, v in sorted(by_type.items(), key=lambda x: -x[1]['cost'])] + + instances.sort(key=lambda x: -(x['cost_usd'] or 0)) + + all_types = sorted(by_type.keys()) + by_date_list = [] + for date in sorted(by_date_type): + entry = {'date': date, 'total': 0, 'runs': 0} + for rt in all_types: + entry[rt] = round(by_date_type[date].get(rt, 0), 2) + entry['total'] += by_date_type[date].get(rt, 0) + entry['total'] = round(entry['total'], 2) + entry['runs'] = sum(1 for inst in instances if inst['date'] == date) + by_date_list.append(entry) + + by_date_list = _aggregate_dates(by_date_list, granularity, + sum_fields=['total', 'runs'] + all_types) + + total_aws = sum(u['aws_cost'] for u in user_list) + + return _json({ + 'by_user': user_list, + 'by_branch': branch_list, + 'by_type': type_list, + 'by_date': by_date_list, + 'run_types': all_types, + 'instances': instances[:500], + 'totals': {'aws': round(total_aws, 2), 'gcp': round(gcp_total, 2), + 'gcp_unattributed': round(gcp_total, 2), + 'combined': round(total_aws + gcp_total, 2)}, + }) + + 
+@app.route('/api/costs/runners') +@auth.login_required +def api_costs_runners(): + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + granularity = request.args.get('granularity', 'daily') + dashboard = request.args.get('dashboard', '') + ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) + ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) + + runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs_with_cost = [run for run in runs if run.get('cost_usd') is not None] + if dashboard: + runs_with_cost = [run for run in runs_with_cost if run.get('dashboard') == dashboard] + + by_date_map = {} + for run in runs_with_cost: + date = metrics._ts_to_date(run.get('timestamp', 0)) + if date not in by_date_map: + by_date_map[date] = {'spot_cost': 0, 'ondemand_cost': 0, 'total': 0} + cost = run['cost_usd'] + if run.get('spot'): + by_date_map[date]['spot_cost'] += cost + else: + by_date_map[date]['ondemand_cost'] += cost + by_date_map[date]['total'] += cost + + by_date = [{'date': date, 'spot_cost': round(d['spot_cost'], 2), + 'ondemand_cost': round(d['ondemand_cost'], 2), 'total': round(d['total'], 2), + 'spot_pct': round(100.0 * d['spot_cost'] / max(d['total'], 0.01), 1)} + for date, d in sorted(by_date_map.items())] + + by_date = _aggregate_dates(by_date, granularity, + sum_fields=['spot_cost', 'ondemand_cost', 'total']) + for d in by_date: + d['spot_pct'] = round(100.0 * d['spot_cost'] / max(d['total'], 0.01), 1) + + by_instance_map = {} + for run in runs_with_cost: + inst = run.get('instance_type', 'unknown') + if inst not in by_instance_map: + by_instance_map[inst] = {'cost': 0, 'runs': 0} + by_instance_map[inst]['cost'] += run['cost_usd'] + by_instance_map[inst]['runs'] += 1 + by_instance = [{'instance_type': k, 'cost': round(v['cost'], 2), 'runs': v['runs']} + for k, v in 
sorted(by_instance_map.items(), key=lambda x: -x[1]['cost'])] + + by_dash_map = {} + for run in runs_with_cost: + dash = run.get('dashboard', 'unknown') + if dash not in by_dash_map: + by_dash_map[dash] = {'cost': 0, 'runs': 0} + by_dash_map[dash]['cost'] += run['cost_usd'] + by_dash_map[dash]['runs'] += 1 + by_dashboard = [{'dashboard': k, 'cost': round(v['cost'], 2), 'runs': v['runs']} + for k, v in sorted(by_dash_map.items(), key=lambda x: -x[1]['cost'])] + + total_cost = sum(run['cost_usd'] for run in runs_with_cost) + spot_cost = sum(run['cost_usd'] for run in runs_with_cost if run.get('spot')) + + return _json({ + 'by_date': by_date, + 'by_instance_type': by_instance, + 'by_dashboard': by_dashboard, + 'summary': { + 'total_cost': round(total_cost, 2), + 'spot_pct': round(100.0 * spot_cost / max(total_cost, 0.01), 1), + 'avg_cost_per_run': round(total_cost / max(len(runs_with_cost), 1), 2), + 'total_runs': len(runs_with_cost), + }, + }) + + +# ---- CI Performance ---- + +@app.route('/api/ci/performance') +@auth.login_required +def api_ci_performance(): + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + dashboard = request.args.get('dashboard', '') + granularity = request.args.get('granularity', 'daily') + ts_from = int(datetime.strptime(date_from, '%Y-%m-%d').timestamp() * 1000) + ts_to = int((datetime.strptime(date_to, '%Y-%m-%d') + timedelta(days=1)).timestamp() * 1000) + + runs = metrics.get_ci_runs(r, ts_from, ts_to) + runs = [run for run in runs if run.get('status') in ('PASSED', 'FAILED')] + if dashboard: + runs = [run for run in runs if run.get('dashboard') == dashboard] + + by_date_map = {} + for run in runs: + date = metrics._ts_to_date(run.get('timestamp', 0)) + if date not in by_date_map: + by_date_map[date] = {'total': 0, 'passed': 0, 'failed': 0, 'durations': []} + by_date_map[date]['total'] += 1 + if run.get('status') == 
'PASSED': + by_date_map[date]['passed'] += 1 + else: + by_date_map[date]['failed'] += 1 + complete = run.get('complete') + ts = run.get('timestamp') + if complete and ts: + by_date_map[date]['durations'].append((complete - ts) / 60000.0) + + by_date = [] + for date in sorted(by_date_map): + d = by_date_map[date] + by_date.append({ + 'date': date, + 'total': d['total'], + 'passed': d['passed'], + 'failed': d['failed'], + 'pass_rate': round(100.0 * d['passed'] / max(d['total'], 1), 1), + 'failure_rate': round(100.0 * d['failed'] / max(d['total'], 1), 1), + 'avg_duration_mins': round(sum(d['durations']) / len(d['durations']), 1) if d['durations'] else None, + }) + + by_date = _aggregate_dates(by_date, granularity, + sum_fields=['total', 'passed', 'failed'], + avg_fields=['avg_duration_mins']) + for d in by_date: + d['pass_rate'] = round(100.0 * d['passed'] / max(d['total'], 1), 1) + d['failure_rate'] = round(100.0 * d['failed'] / max(d['total'], 1), 1) + + # Daily flake/failure counts from test_events + if dashboard: + flake_daily = db.query(''' + SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count + FROM test_events WHERE status = 'flaked' AND dashboard = ? + AND timestamp >= ? AND timestamp < ? + GROUP BY substr(timestamp, 1, 10) + ''', (dashboard, date_from, date_to + 'T23:59:59')) + fail_test_daily = db.query(''' + SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count + FROM test_events WHERE status = 'failed' AND dashboard = ? + AND timestamp >= ? AND timestamp < ? + GROUP BY substr(timestamp, 1, 10) + ''', (dashboard, date_from, date_to + 'T23:59:59')) + else: + flake_daily = db.query(''' + SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count + FROM test_events WHERE status = 'flaked' + AND timestamp >= ? AND timestamp < ? 
+ GROUP BY substr(timestamp, 1, 10) + ''', (date_from, date_to + 'T23:59:59')) + fail_test_daily = db.query(''' + SELECT substr(timestamp, 1, 10) as date, COUNT(*) as count + FROM test_events WHERE status = 'failed' + AND timestamp >= ? AND timestamp < ? + GROUP BY substr(timestamp, 1, 10) + ''', (date_from, date_to + 'T23:59:59')) + flake_daily_map = {r['date']: r['count'] for r in flake_daily} + fail_test_daily_map = {r['date']: r['count'] for r in fail_test_daily} + for d in by_date: + d['flake_count'] = flake_daily_map.get(d['date'], 0) + d['test_failure_count'] = fail_test_daily_map.get(d['date'], 0) + + # Top flakes/failures + if dashboard: + top_flakes = db.query(''' + SELECT test_cmd, COUNT(*) as count, ref_name + FROM test_events WHERE status='flaked' AND dashboard = ? + AND timestamp >= ? AND timestamp <= ? + GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + ''', (dashboard, date_from, date_to + 'T23:59:59')) + top_failures = db.query(''' + SELECT test_cmd, COUNT(*) as count + FROM test_events WHERE status='failed' AND dashboard = ? + AND timestamp >= ? AND timestamp <= ? + GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + ''', (dashboard, date_from, date_to + 'T23:59:59')) + else: + top_flakes = db.query(''' + SELECT test_cmd, COUNT(*) as count, ref_name + FROM test_events WHERE status='flaked' AND timestamp >= ? AND timestamp <= ? + GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + ''', (date_from, date_to + 'T23:59:59')) + top_failures = db.query(''' + SELECT test_cmd, COUNT(*) as count + FROM test_events WHERE status='failed' AND timestamp >= ? AND timestamp <= ? 
+ GROUP BY test_cmd ORDER BY count DESC LIMIT 15 + ''', (date_from, date_to + 'T23:59:59')) + + # Summary + total = len(runs) + passed = sum(1 for run in runs if run.get('status') == 'PASSED') + failed = total - passed + durations = [] + for run in runs: + complete = run.get('complete') + ts = run.get('timestamp') + if complete and ts: + durations.append((complete - ts) / 60000.0) + + if dashboard: + flake_count = db.query(''' + SELECT COUNT(*) as c FROM test_events WHERE status='flaked' AND dashboard = ? + AND timestamp >= ? AND timestamp <= ? + ''', (dashboard, date_from, date_to + 'T23:59:59')) + total_tests = db.query(''' + SELECT COUNT(*) as c FROM test_events WHERE status IN ('failed','flaked') AND dashboard = ? + AND timestamp >= ? AND timestamp <= ? + ''', (dashboard, date_from, date_to + 'T23:59:59')) + total_failures_count = db.query(''' + SELECT COUNT(*) as c FROM test_events WHERE status='failed' AND dashboard = ? + AND timestamp >= ? AND timestamp <= ? + ''', (dashboard, date_from, date_to + 'T23:59:59')) + else: + flake_count = db.query(''' + SELECT COUNT(*) as c FROM test_events WHERE status='flaked' AND timestamp >= ? AND timestamp <= ? + ''', (date_from, date_to + 'T23:59:59')) + total_tests = db.query(''' + SELECT COUNT(*) as c FROM test_events WHERE status IN ('failed','flaked') AND timestamp >= ? AND timestamp <= ? + ''', (date_from, date_to + 'T23:59:59')) + total_failures_count = db.query(''' + SELECT COUNT(*) as c FROM test_events WHERE status='failed' AND timestamp >= ? AND timestamp <= ? 
+ ''', (date_from, date_to + 'T23:59:59')) + + fc = flake_count[0]['c'] if flake_count else 0 + tc = total_tests[0]['c'] if total_tests else 0 + tfc = total_failures_count[0]['c'] if total_failures_count else 0 + + return _json({ + 'by_date': by_date, + 'top_flakes': top_flakes, + 'top_failures': top_failures, + 'summary': { + 'total_runs': total, + 'pass_rate': round(100.0 * passed / max(total, 1), 1), + 'failure_rate': round(100.0 * failed / max(total, 1), 1), + 'avg_duration_mins': round(sum(durations) / len(durations), 1) if durations else None, + 'flake_rate': round(100.0 * fc / max(tc, 1), 1) if tc else 0, + 'total_flakes': fc, + 'total_test_failures': tfc, + }, + }) + + +# ---- GitHub integration ---- + +@app.route('/api/deployments/speed') +@auth.login_required +def api_deploy_speed(): + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + workflow = request.args.get('workflow', '') + granularity = request.args.get('granularity', 'daily') + result = github_data.get_deployment_speed(date_from, date_to, workflow) + if granularity != 'daily' and result.get('by_date'): + result['by_date'] = _aggregate_dates( + result['by_date'], granularity, + sum_fields=['count', 'success', 'failure'], + avg_fields=['median_mins', 'p95_mins']) + return _json(result) + + +@app.route('/api/branches/lag') +@auth.login_required +def api_branch_lag(): + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + return _json(github_data.get_branch_lag(date_from, date_to)) + + +@app.route('/api/prs/metrics') +@auth.login_required +def api_pr_metrics(): + date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')) + date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d')) + author = request.args.get('author', 
def _date_range():
    """Return the (from, to) query params as YYYY-MM-DD strings.

    Defaults to the last 30 days ending today, matching every /api route.
    """
    date_from = request.args.get('from', (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
    date_to = request.args.get('to', datetime.now().strftime('%Y-%m-%d'))
    return date_from, date_to


@app.route('/api/merge-queue/stats')
@auth.login_required
def api_merge_queue_stats():
    """Merge-queue statistics for the requested date range."""
    date_from, date_to = _date_range()
    return _json(github_data.get_merge_queue_stats(date_from, date_to))


@app.route('/api/ci/flakes-by-command')
@auth.login_required
def api_flakes_by_command():
    """Flake counts grouped by test command, optionally filtered by dashboard."""
    date_from, date_to = _date_range()
    dashboard = request.args.get('dashboard', '')
    # Pull the latest failure events into SQLite before querying.
    metrics.sync_failed_tests_to_sqlite(r)
    return _json(metrics.get_flakes_by_command(date_from, date_to, dashboard))


# ---- Test timings ----

@app.route('/api/tests/timings')
@auth.login_required
def api_test_timings():
    """Test timing statistics: duration by test command, with trends.

    Optional query params: from/to (YYYY-MM-DD), dashboard, status, test_cmd.
    Returns per-test stats, a daily time series, the slowest runs, and a summary.
    """
    date_from, date_to = _date_range()
    dashboard = request.args.get('dashboard', '')
    status = request.args.get('status', '')      # filter to specific status
    test_cmd = request.args.get('test_cmd', '')  # filter to specific test

    # BUGFIX: the upper bound was `timestamp < ? || 'T23:59:59'`, which
    # excluded events in the final second of date_to (e.g. '...T23:59:59'
    # itself and any fractional-second timestamp sorts >= that bound).
    # Comparing against the following day keeps the whole of date_to.
    conditions = ['duration_secs IS NOT NULL', 'duration_secs > 0',
                  'timestamp >= ?', "timestamp < date(?, '+1 day')"]
    params = [date_from, date_to]

    if dashboard:
        conditions.append('dashboard = ?')
        params.append(dashboard)
    if status:
        conditions.append('status = ?')
        params.append(status)
    if test_cmd:
        conditions.append('test_cmd = ?')
        params.append(test_cmd)

    where = 'WHERE ' + ' AND '.join(conditions)

    # Per-test stats
    by_test = db.query(f'''
        SELECT test_cmd,
               COUNT(*) as count,
               ROUND(AVG(duration_secs), 1) as avg_secs,
               ROUND(MIN(duration_secs), 1) as min_secs,
               ROUND(MAX(duration_secs), 1) as max_secs,
               SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed,
               SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed,
               SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked,
               dashboard
        FROM test_events {where}
        GROUP BY test_cmd
        ORDER BY count DESC
        LIMIT 200
    ''', params)

    # Add pass rate and total compute time per test.
    for row in by_test:
        total = row['passed'] + row['failed'] + row['flaked']
        row['pass_rate'] = round(100.0 * row['passed'] / max(total, 1), 1)
        row['total_time_secs'] = round(row['avg_secs'] * row['count'], 0)

    # Daily time series (aggregate across all tests or filtered test)
    by_date = db.query(f'''
        SELECT substr(timestamp, 1, 10) as date,
               COUNT(*) as count,
               ROUND(AVG(duration_secs), 1) as avg_secs,
               ROUND(MAX(duration_secs), 1) as max_secs,
               SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed,
               SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed,
               SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked
        FROM test_events {where}
        GROUP BY substr(timestamp, 1, 10)
        ORDER BY date
    ''', params)

    # Summary
    summary_rows = db.query(f'''
        SELECT COUNT(*) as count,
               ROUND(AVG(duration_secs), 1) as avg_secs,
               ROUND(MAX(duration_secs), 1) as max_secs,
               SUM(duration_secs) as total_secs,
               SUM(CASE WHEN status = 'passed' THEN 1 ELSE 0 END) as passed,
               SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed,
               SUM(CASE WHEN status = 'flaked' THEN 1 ELSE 0 END) as flaked
        FROM test_events {where}
    ''', params)
    s = summary_rows[0] if summary_rows else {}

    # Slowest individual test runs
    slowest = db.query(f'''
        SELECT test_cmd, status, duration_secs, dashboard,
               substr(timestamp, 1, 10) as date, commit_author, log_url
        FROM test_events {where}
        ORDER BY duration_secs DESC
        LIMIT 50
    ''', params)

    return _json({
        'by_test': by_test,
        'by_date': by_date,
        'slowest': slowest,
        'summary': {
            'total_runs': s.get('count', 0),
            'avg_duration_secs': s.get('avg_secs'),
            'max_duration_secs': s.get('max_secs'),
            'total_compute_secs': round(s.get('total_secs', 0) or 0, 0),
            'passed': s.get('passed', 0),
            'failed': s.get('failed', 0),
            'flaked': s.get('flaked', 0),
        },
    })


# ---- Dashboard views ----

def _serve_view(filename):
    """Serve a static dashboard HTML file from the views/ directory, or 404."""
    path = Path(__file__).parent / 'views' / filename
    if path.exists():
        return path.read_text()
    return "Dashboard not found", 404


@app.route('/ci-health')
@auth.login_required
def ci_health():
    # Legacy URL; the dashboard moved to /ci-insights.
    return redirect('/ci-insights')


@app.route('/ci-insights')
@auth.login_required
def ci_insights():
    return _serve_view('ci-insights.html')


@app.route('/cost-overview')
@auth.login_required
def cost_overview():
    return _serve_view('cost-overview.html')


@app.route('/test-timings')
@auth.login_required
def test_timings():
    return _serve_view('test-timings.html')


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8081)
serve_billing_dashboard, +) +from billing.aws import ( + get_costs_overview, + get_aws_cost_details, + decode_branch_info, + decode_instance_name, +) diff --git a/ci3/ci-metrics/billing/aws.py b/ci3/ci-metrics/billing/aws.py new file mode 100644 index 000000000000..481393d74ec3 --- /dev/null +++ b/ci3/ci-metrics/billing/aws.py @@ -0,0 +1,347 @@ +"""AWS Cost Explorer fetch with in-memory cache. + +Fetches on first request, caches for 6 hours. No SQLite, no background threads. +""" +import threading +import time +from datetime import datetime, timedelta, timezone + +SERVICE_CATEGORY_MAP = { + # Compute + 'Amazon Elastic Compute Cloud - Compute': 'ec2', + 'EC2 - Other': 'ec2', # EBS volumes, snapshots, NAT gateways, data transfer + 'Amazon Elastic Container Service': 'ecs', + 'Amazon Elastic Kubernetes Service': 'eks', + 'Amazon EC2 Container Registry (ECR)': 'ecr', + 'AWS Lambda': 'lambda', + 'Amazon Lightsail': 'lightsail', + # Storage + 'Amazon Simple Storage Service': 's3', + 'Amazon Elastic File System': 'efs', + 'Amazon Elastic Block Store': 'ebs', + 'Amazon ElastiCache': 'elasticache', + 'Amazon Relational Database Service': 'rds', + 'Amazon DynamoDB': 'dynamodb', + 'AWS Backup': 'backup', + # Networking + 'Amazon CloudFront': 'cloudfront', + 'CloudFront Flat-Rate Plans': 'cloudfront', + 'Amazon Virtual Private Cloud': 'vpc', + 'Elastic Load Balancing': 'elb', + 'Amazon Elastic Load Balancing': 'elb', + 'Amazon Route 53': 'route53', + 'Amazon API Gateway': 'apigateway', + 'AWS Data Transfer': 'data_transfer', + 'AWS Global Accelerator': 'global_accelerator', + # Monitoring & Security + 'AmazonCloudWatch': 'cloudwatch', + 'AWS CloudTrail': 'cloudtrail', + 'AWS Secrets Manager': 'secrets', + 'AWS Key Management Service': 'kms', + 'AWS WAF': 'waf', + 'AWS Config': 'config', + 'AWS Certificate Manager': 'acm', + # CI/CD & Dev Tools + 'AWS CodeBuild': 'codebuild', + 'AWS CodePipeline': 'codepipeline', + 'AWS CloudFormation': 'cloudformation', + 'AWS Amplify': 
import re

# In-memory caches for Cost Explorer results: per-service rows and
# per-usage-type detail rows. Fetched lazily, refreshed at most every 6 hours.
_cache = {'rows': [], 'ts': 0}
_cache_lock = threading.Lock()
_detail_cache = {'rows': [], 'ts': 0}
_detail_cache_lock = threading.Lock()
_CACHE_TTL = 6 * 3600

# Known job postfixes from ci.sh (these become INSTANCE_POSTFIX)
_JOB_POSTFIXES = re.compile(
    r'_(x[0-9]+-(?:full|fast)|a[0-9]+-(?:full|fast)|n-deploy-[0-9]+|grind-test-[a-f0-9]+)$'
)
_ARCH_SUFFIXES = ('_amd64', '_arm64', '_x86_64', '_aarch64')


def decode_instance_name(run: dict) -> str:
    """Reconstruct the EC2 instance name from CI run metadata.

    bootstrap_ec2 naming:
      merge queue / PR: pr-{number}_{arch}[_{postfix}]
      branch:           {sanitized_branch}_{arch}[_{postfix}]

    Note: queue and plain PR runs produce the same name, so the previous
    separate '(queue)' branch was a duplicate and has been merged.
    """
    pr = run.get('pr_number')
    arch = run.get('arch', 'amd64')
    # Normalize arch aliases to the two names bootstrap_ec2 uses.
    if arch in ('x86_64', 'amd64'):
        arch = 'amd64'
    elif arch in ('aarch64', 'arm64'):
        arch = 'arm64'

    if pr:
        base = f'pr-{pr}_{arch}'
    else:
        # Replicate: echo -n "$REF_NAME" | head -c 50 | tr -c 'a-zA-Z0-9-' '_'
        sanitized = re.sub(r'[^a-zA-Z0-9-]', '_', run.get('name', '')[:50])
        base = f'{sanitized}_{arch}'

    job = run.get('job_id', '')
    return f'{base}_{job}' if job else base


def decode_branch_info(run: dict) -> dict:
    """Extract branch/PR/user context from a CI run record."""
    name = run.get('name', '')
    dashboard = run.get('dashboard', '')

    if '(queue)' in name or dashboard == 'next':
        run_type = 'merge-queue'
        branch = name.replace(' (queue)', '')
    elif dashboard == 'prs':
        run_type = 'pr'
        branch = name
    elif dashboard in ('nightly', 'releases', 'network', 'deflake'):
        run_type = dashboard
        branch = name
    else:
        run_type = 'other'
        branch = name

    return {
        'type': run_type,
        'branch': branch,
        'pr_number': run.get('pr_number'),
        'author': run.get('author', 'unknown'),
        'instance_name': decode_instance_name(run),
    }


def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]:
    """Fetch daily per-service costs from AWS Cost Explorer.

    Returns [] (best-effort) when boto3 is missing or the API call fails.
    """
    try:
        import boto3
    except ImportError:
        print("[rk_aws_costs] boto3 not installed, skipping")
        return []

    try:
        client = boto3.client('ce', region_name='us-east-2')
        rows = []
        next_token = None

        # Cost Explorer paginates via NextPageToken; loop until exhausted.
        while True:
            kwargs = dict(
                TimePeriod={'Start': date_from, 'End': date_to},
                Granularity='DAILY',
                Metrics=['UnblendedCost'],
                GroupBy=[{'Type': 'DIMENSION', 'Key': 'SERVICE'}],
            )
            if next_token:
                kwargs['NextPageToken'] = next_token

            response = client.get_cost_and_usage(**kwargs)

            for result in response['ResultsByTime']:
                date = result['TimePeriod']['Start']
                for group in result['Groups']:
                    service = group['Keys'][0]
                    amount = float(group['Metrics']['UnblendedCost']['Amount'])
                    if amount == 0:
                        continue
                    category = SERVICE_CATEGORY_MAP.get(service, 'other')
                    if category == 'other':
                        # Surface unmapped services so the map can be extended.
                        print(f"[rk_aws_costs] unmapped service: {service!r} (${amount:.2f})")
                    rows.append({
                        'date': date,
                        'service': service,
                        'category': category,
                        'amount_usd': round(amount, 4),
                    })

            next_token = response.get('NextPageToken')
            if not next_token:
                break

        return rows
    except Exception as e:
        print(f"[rk_aws_costs] Error: {e}")
        return []


def _refresh(cache, lock, fetch):
    """Refill `cache` via `fetch` over the trailing year if stale.

    No-op when the cache is fresh or another thread already holds `lock`
    (non-blocking acquire), so at most one fetch runs at a time.
    Shared implementation for the two previously copy-pasted refreshers.
    """
    now = time.time()
    if cache['rows'] and now - cache['ts'] < _CACHE_TTL:
        return
    if not lock.acquire(blocking=False):
        return
    try:
        today = datetime.now(timezone.utc).date()
        rows = fetch(
            (today - timedelta(days=365)).isoformat(),
            today.isoformat(),
        )
        if rows:  # keep stale data rather than clobbering with a failed fetch
            cache['rows'] = rows
            cache['ts'] = now
    finally:
        lock.release()


def _ensure_cached():
    _refresh(_cache, _cache_lock, _fetch_aws_costs)


def get_aws_costs(date_from: str, date_to: str) -> list[dict]:
    """Get AWS costs for date range. Blocks on first fetch, async refresh after."""
    if not _cache['rows']:
        _ensure_cached()  # block on first load so dashboard isn't empty
    else:
        threading.Thread(target=_ensure_cached, daemon=True).start()
    return [r for r in _cache['rows'] if date_from <= r['date'] <= date_to]


def _fetch_aws_cost_details(date_from: str, date_to: str) -> list[dict]:
    """Fetch per-resource (USAGE_TYPE) cost breakdown from AWS Cost Explorer."""
    try:
        import boto3
    except ImportError:
        return []

    try:
        client = boto3.client('ce', region_name='us-east-2')
        rows = []
        next_token = None

        while True:
            kwargs = dict(
                TimePeriod={'Start': date_from, 'End': date_to},
                Granularity='DAILY',
                Metrics=['UnblendedCost'],
                GroupBy=[
                    {'Type': 'DIMENSION', 'Key': 'SERVICE'},
                    {'Type': 'DIMENSION', 'Key': 'USAGE_TYPE'},
                ],
            )
            if next_token:
                kwargs['NextPageToken'] = next_token

            response = client.get_cost_and_usage(**kwargs)

            for result in response['ResultsByTime']:
                date = result['TimePeriod']['Start']
                for group in result['Groups']:
                    service = group['Keys'][0]
                    usage_type = group['Keys'][1]
                    amount = float(group['Metrics']['UnblendedCost']['Amount'])
                    if amount == 0:
                        continue
                    category = SERVICE_CATEGORY_MAP.get(service, 'other')
                    rows.append({
                        'date': date,
                        'service': service,
                        'usage_type': usage_type,
                        'category': category,
                        'amount_usd': round(amount, 4),
                    })

            next_token = response.get('NextPageToken')
            if not next_token:
                break

        return rows
    except Exception as e:
        print(f"[rk_aws_costs] Detail fetch error: {e}")
        return []


def _ensure_detail_cached():
    _refresh(_detail_cache, _detail_cache_lock, _fetch_aws_cost_details)


def get_aws_cost_details(date_from: str, date_to: str) -> list[dict]:
    """Get per-resource AWS cost details. Blocks on first fetch, async refresh after."""
    if not _detail_cache['rows']:
        _ensure_detail_cached()
    else:
        threading.Thread(target=_ensure_detail_cached, daemon=True).start()
    return [r for r in _detail_cache['rows'] if date_from <= r['date'] <= date_to]


def get_costs_overview(date_from: str, date_to: str) -> dict:
    """Combined AWS + GCP cost overview. GCP data comes from billing JSON files."""
    aws_rows = get_aws_costs(date_from, date_to)

    # GCP data from billing files (already on disk, no SQLite needed).
    # Best-effort: a read failure degrades to an AWS-only overview.
    gcp_by_date = {}
    try:
        from billing.gcp import get_billing_files_in_range
        billing_data = get_billing_files_in_range(
            datetime.strptime(date_from, '%Y-%m-%d'),
            datetime.strptime(date_to, '%Y-%m-%d'),
        )
        for entry in billing_data:
            day = gcp_by_date.setdefault(entry['date'], {})
            for ns_data in entry.get('namespaces', {}).values():
                for cat, amt in ns_data.get('breakdown', {}).items():
                    day[cat] = day.get(cat, 0) + amt
    except Exception as e:
        print(f"[rk_aws_costs] GCP billing read failed: {e}")

    by_date = {}

    def _day(d):
        # One merged record per date, filled from both providers.
        return by_date.setdefault(d, {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0})

    for row in aws_rows:
        entry = _day(row['date'])
        cat = row['category']
        entry['aws'][cat] = entry['aws'].get(cat, 0) + row['amount_usd']
        entry['aws_total'] += row['amount_usd']

    for d, cats in gcp_by_date.items():
        entry = _day(d)
        entry['gcp'] = cats
        entry['gcp_total'] = sum(cats.values())

    sorted_dates = sorted(by_date.values(), key=lambda x: x['date'])
    aws_total = sum(d['aws_total'] for d in sorted_dates)
    gcp_total = sum(d['gcp_total'] for d in sorted_dates)

    return {
        'by_date': sorted_dates,
        'totals': {
            'aws': round(aws_total, 2),
            'gcp': round(gcp_total, 2),
            'combined': round(aws_total + gcp_total, 2),
        }
    }

namespace billing

+ +
+ + + + | + + + | + + + + | + + + + + + | + + + +
+ +
+ +
+ +
+
+

cost over time

+
+
+
+

cost by namespace

+
+
+
+

cost by category

+
+
+
+ + + + +
+ + + + + diff --git a/ci3/ci-metrics/billing/explore.py b/ci3/ci-metrics/billing/explore.py new file mode 100644 index 000000000000..c591d8c847ef --- /dev/null +++ b/ci3/ci-metrics/billing/explore.py @@ -0,0 +1,619 @@ +#!/usr/bin/env python3 +"""CLI tool to explore GCP billing data from the Cloud Billing BigQuery export. + +Queries the actual billing export tables (not usage metering) to get real +invoice-level costs. Caches results in SQLite for fast re-queries. + +Usage: + python billing_explore.py discover # find billing export tables + python billing_explore.py fetch [--months N] # fetch & cache billing data + python billing_explore.py monthly # show monthly totals + python billing_explore.py monthly --by service # monthly by service + python billing_explore.py monthly --by sku # monthly by SKU + python billing_explore.py monthly --by project # monthly by project + python billing_explore.py daily [--month 2024-12] # daily for a month + python billing_explore.py top [--month 2024-12] # top costs for a month + python billing_explore.py compare # compare billing export vs usage metering +""" +import argparse +import os +import sqlite3 +import sys +from datetime import datetime, timedelta, timezone + +DB_PATH = os.path.join(os.getenv('LOGS_DISK_PATH', '/tmp'), 'billing_explore.db') + +SCHEMA = """ +CREATE TABLE IF NOT EXISTS gcp_billing ( + date TEXT NOT NULL, + project_id TEXT NOT NULL DEFAULT '', + service TEXT NOT NULL DEFAULT '', + sku TEXT NOT NULL DEFAULT '', + cost REAL NOT NULL DEFAULT 0, + credits REAL NOT NULL DEFAULT 0, + usage_amount REAL NOT NULL DEFAULT 0, + usage_unit TEXT NOT NULL DEFAULT '', + currency TEXT NOT NULL DEFAULT 'USD', + fetched_at TEXT NOT NULL, + PRIMARY KEY (date, project_id, service, sku) +); +CREATE INDEX IF NOT EXISTS idx_gcp_billing_date ON gcp_billing(date); +CREATE INDEX IF NOT EXISTS idx_gcp_billing_service ON gcp_billing(service); + +CREATE TABLE IF NOT EXISTS gcp_billing_meta ( + key TEXT PRIMARY KEY, + value TEXT +); +""" 
def get_db():
    """Open the SQLite cache at DB_PATH, creating it (and its schema) if needed."""
    os.makedirs(os.path.dirname(DB_PATH) or '.', exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    conn.execute('PRAGMA busy_timeout = 5000')
    conn.row_factory = sqlite3.Row
    conn.executescript(SCHEMA)
    return conn


def fmt_usd(v):
    """Format a dollar amount, scaling decimal places with magnitude.

    BUGFIX: thresholds now use abs(v) so negative amounts (billing credits,
    which this tool prints with the same formatter) no longer always fall
    through to the 4-decimal branch regardless of magnitude.
    """
    mag = abs(v)
    if mag >= 1000:
        return f'${v:,.0f}'
    if mag >= 1:
        return f'${v:,.2f}'
    return f'${v:,.4f}'


# ---- BigQuery Discovery ----

def cmd_discover(args):
    """Find billing export tables in the project."""
    from google.cloud import bigquery
    project = args.project
    client = bigquery.Client(project=project)

    print(f'Listing datasets in project: {project}')
    datasets = list(client.list_datasets())
    if not datasets:
        print('  No datasets found.')
        return

    for ds in datasets:
        ds_id = ds.dataset_id
        tables = list(client.list_tables(ds.reference))
        billing_tables = [t for t in tables if 'billing' in t.table_id.lower() or 'cost' in t.table_id.lower()]
        if billing_tables:
            print(f'\n  Dataset: {ds_id}')
            for t in billing_tables:
                full = f'{project}.{ds_id}.{t.table_id}'
                print(f'    {full}')
                # Show schema for each billing table so the user can pick one.
                tbl = client.get_table(t.reference)
                print(f'      rows: {tbl.num_rows}, size: {tbl.num_bytes / 1e6:.1f} MB')
                print(f'      columns: {", ".join(f.name for f in tbl.schema[:15])}')
        else:
            # Check for usage metering tables too
            usage_tables = [t for t in tables if 'gke_cluster' in t.table_id.lower()]
            if usage_tables:
                print(f'\n  Dataset: {ds_id} (usage metering)')
                for t in usage_tables:
                    print(f'    {project}.{ds_id}.{t.table_id}')

    # Also try common billing export naming patterns
    print(f'\n  Trying common billing export table patterns...')
    for ds in datasets:
        for t in client.list_tables(ds.reference):
            if t.table_id.startswith('gcp_billing_export'):
                full = f'{project}.{ds.dataset_id}.{t.table_id}'
                print(f'  FOUND: {full}')


# ---- BigQuery Fetch ----

def cmd_fetch(args):
    """Fetch billing data from BigQuery and cache in SQLite."""
    from google.cloud import bigquery

    table = args.table
    project = args.project
    months = args.months

    if not table:
        print('ERROR: --table is required. Run "discover" first to find the billing export table.')
        print('  e.g. --table project.dataset.gcp_billing_export_resource_v1_XXXXXX')
        sys.exit(1)

    client = bigquery.Client(project=project)
    end_date = datetime.now(timezone.utc).date()
    # 31-day months over-cover the window slightly; better to fetch extra.
    start_date = end_date - timedelta(days=months * 31)

    print(f'Fetching billing data from {start_date} to {end_date}')
    print(f'Table: {table}')

    # Query the billing export table.
    # The standard billing export has: billing_account_id, service.description,
    # sku.description, usage_start_time, project.id, cost, credits, usage.amount, usage.unit
    query = f"""
        SELECT
            DATE(usage_start_time) AS date,
            COALESCE(project.id, '') AS project_id,
            COALESCE(service.description, '') AS service,
            COALESCE(sku.description, '') AS sku,
            SUM(cost) AS cost,
            SUM(IFNULL((SELECT SUM(c.amount) FROM UNNEST(credits) c), 0)) AS credits,
            SUM(usage.amount) AS usage_amount,
            MAX(usage.unit) AS usage_unit
        FROM `{table}`
        WHERE DATE(usage_start_time) BETWEEN @start_date AND @end_date
        GROUP BY date, project_id, service, sku
        HAVING ABS(cost) > 0.0001 OR ABS(credits) > 0.0001
        ORDER BY date, service, sku
    """

    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter('start_date', 'DATE', start_date.isoformat()),
            bigquery.ScalarQueryParameter('end_date', 'DATE', end_date.isoformat()),
        ]
    )

    print('Running query...')
    result = list(client.query(query, job_config=job_config).result())
    print(f'Got {len(result)} rows')

    if not result:
        print('No data returned. Check table name and date range.')
        return

    # Store in SQLite: replace the fetched window wholesale so deleted/updated
    # rows in the export don't linger in the cache.
    db = get_db()
    now = datetime.now(timezone.utc).isoformat()

    db.execute('DELETE FROM gcp_billing WHERE date >= ? AND date <= ?',
               (start_date.isoformat(), end_date.isoformat()))

    for row in result:
        db.execute('''
            INSERT OR REPLACE INTO gcp_billing
            (date, project_id, service, sku, cost, credits, usage_amount, usage_unit, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            row.date.isoformat() if hasattr(row.date, 'isoformat') else str(row.date),
            row.project_id or '',
            row.service or '',
            row.sku or '',
            float(row.cost or 0),
            float(row.credits or 0),
            float(row.usage_amount or 0),
            row.usage_unit or '',
            now,
        ))

    db.commit()
    db.execute("INSERT OR REPLACE INTO gcp_billing_meta VALUES ('last_fetch', ?)", (now,))
    db.execute("INSERT OR REPLACE INTO gcp_billing_meta VALUES ('table', ?)", (table,))
    db.commit()

    print(f'Cached {len(result)} rows in {DB_PATH}')

    # Show quick summary
    rows = db.execute('''
        SELECT substr(date, 1, 7) as month, SUM(cost) as cost, SUM(credits) as credits
        FROM gcp_billing GROUP BY month ORDER BY month
    ''').fetchall()
    print(f'\n{"Month":<10} {"Gross":>12} {"Credits":>12} {"Net":>12}')
    print('-' * 48)
    for r in rows:
        # Credits are negative in the export, so net = gross + credits.
        net = r['cost'] + r['credits']
        print(f'{r["month"]:<10} {fmt_usd(r["cost"]):>12} {fmt_usd(r["credits"]):>12} {fmt_usd(net):>12}')
print(f' {r["service"]:<45} {fmt_usd(r["cost"]):>10} {fmt_usd(r["credits"]):>10} {fmt_usd(net):>10}') + + elif group_by == 'sku': + month_filter = args.month + if not month_filter: + # Use most recent month + row = db.execute('SELECT MAX(substr(date, 1, 7)) as m FROM gcp_billing').fetchone() + month_filter = row['m'] if row else None + + if not month_filter: + print('No data.') + return + + rows = db.execute(''' + SELECT service, sku, SUM(cost) as cost, SUM(credits) as credits, + SUM(usage_amount) as usage_amount, MAX(usage_unit) as usage_unit + FROM gcp_billing WHERE substr(date, 1, 7) = ? + GROUP BY service, sku ORDER BY cost DESC + ''', (month_filter,)).fetchall() + + total = sum(r['cost'] + r['credits'] for r in rows) + print(f'\n {month_filter} (net: {fmt_usd(total)})') + print(f' {"Service":<30} {"SKU":<40} {"Net":>10} {"Usage":>15}') + print(' ' + '-' * 97) + for r in rows[:40]: + net = r['cost'] + r['credits'] + if abs(net) >= 0.01: + usage = f'{r["usage_amount"]:.1f} {r["usage_unit"]}' if r['usage_amount'] else '' + print(f' {r["service"][:29]:<30} {r["sku"][:39]:<40} {fmt_usd(net):>10} {usage:>15}') + + elif group_by == 'project': + rows = db.execute(''' + SELECT substr(date, 1, 7) as month, project_id, + SUM(cost) as cost, SUM(credits) as credits + FROM gcp_billing GROUP BY month, project_id ORDER BY month, cost DESC + ''').fetchall() + + current_month = None + for r in rows: + if r['month'] != current_month: + current_month = r['month'] + month_total = sum(row['cost'] + row['credits'] for row in rows if row['month'] == current_month) + print(f'\n {current_month} (net: {fmt_usd(month_total)})') + print(f' {"Project":<45} {"Net":>12}') + print(' ' + '-' * 59) + net = r['cost'] + r['credits'] + if abs(net) >= 0.01: + print(f' {r["project_id"]:<45} {fmt_usd(net):>12}') + + else: + # Default: just monthly totals + rows = db.execute(''' + SELECT substr(date, 1, 7) as month, + SUM(cost) as cost, SUM(credits) as credits, + COUNT(DISTINCT date) as days + FROM 
gcp_billing GROUP BY month ORDER BY month + ''').fetchall() + + print(f'\n {"Month":<10} {"Gross":>12} {"Credits":>12} {"Net":>12} {"Days":>6} {"Daily Avg":>12}') + print(' ' + '-' * 68) + grand_total = 0 + for r in rows: + net = r['cost'] + r['credits'] + daily = net / max(r['days'], 1) + grand_total += net + print(f' {r["month"]:<10} {fmt_usd(r["cost"]):>12} {fmt_usd(r["credits"]):>12} {fmt_usd(net):>12} {r["days"]:>6} {fmt_usd(daily):>12}') + print(' ' + '-' * 68) + print(f' {"TOTAL":<10} {"":>12} {"":>12} {fmt_usd(grand_total):>12}') + + +def cmd_daily(args): + """Show daily costs for a month.""" + db = get_db() + month = args.month + if not month: + row = db.execute('SELECT MAX(substr(date, 1, 7)) as m FROM gcp_billing').fetchone() + month = row['m'] if row else None + + if not month: + print('No data.') + return + + rows = db.execute(''' + SELECT date, SUM(cost) as cost, SUM(credits) as credits + FROM gcp_billing WHERE substr(date, 1, 7) = ? + GROUP BY date ORDER BY date + ''', (month,)).fetchall() + + total = 0 + print(f'\n {"Date":<12} {"Gross":>10} {"Credits":>10} {"Net":>10}') + print(' ' + '-' * 44) + for r in rows: + net = r['cost'] + r['credits'] + total += net + print(f' {r["date"]:<12} {fmt_usd(r["cost"]):>10} {fmt_usd(r["credits"]):>10} {fmt_usd(net):>10}') + print(' ' + '-' * 44) + print(f' {"TOTAL":<12} {"":>10} {"":>10} {fmt_usd(total):>10}') + + +def cmd_top(args): + """Show top cost items for a month.""" + db = get_db() + month = args.month + if not month: + row = db.execute('SELECT MAX(substr(date, 1, 7)) as m FROM gcp_billing').fetchone() + month = row['m'] if row else None + + if not month: + print('No data.') + return + + # Top services + services = db.execute(''' + SELECT service, SUM(cost + credits) as net, SUM(cost) as gross + FROM gcp_billing WHERE substr(date, 1, 7) = ? 
+ GROUP BY service ORDER BY net DESC LIMIT 15 + ''', (month,)).fetchall() + + total = sum(r['net'] for r in services) + print(f'\n Top services for {month} (total: {fmt_usd(total)})') + print(f' {"Service":<45} {"Net":>12} {"% of Total":>10}') + print(' ' + '-' * 69) + for r in services: + pct = 100 * r['net'] / max(total, 0.01) + if abs(r['net']) >= 0.01: + print(f' {r["service"]:<45} {fmt_usd(r["net"]):>12} {pct:>9.1f}%') + + # Top SKUs + skus = db.execute(''' + SELECT service, sku, SUM(cost + credits) as net + FROM gcp_billing WHERE substr(date, 1, 7) = ? + GROUP BY service, sku ORDER BY net DESC LIMIT 20 + ''', (month,)).fetchall() + + print(f'\n Top SKUs for {month}') + print(f' {"Service":<25} {"SKU":<40} {"Net":>12}') + print(' ' + '-' * 79) + for r in skus: + if abs(r['net']) >= 0.01: + print(f' {r["service"][:24]:<25} {r["sku"][:39]:<40} {fmt_usd(r["net"]):>12}') + + +def cmd_compare(args): + """Compare billing export data vs usage metering estimates.""" + db = get_db() + + # Get billing export monthly totals + billing_rows = db.execute(''' + SELECT substr(date, 1, 7) as month, SUM(cost + credits) as net + FROM gcp_billing GROUP BY month ORDER BY month + ''').fetchall() + + if not billing_rows: + print('No billing export data cached. 
Run "fetch" first.') + return + + # Get usage metering estimates + try: + from billing import gcp as _gcp_billing + _gcp_billing._ensure_cached() + metering_data = _gcp_billing._cache.get('data', []) + except Exception as e: + print(f'Could not load usage metering data: {e}') + metering_data = [] + + metering_monthly = {} + for entry in metering_data: + month = entry['date'][:7] + day_total = sum(ns.get('total', 0) for ns in entry.get('namespaces', {}).values()) + metering_monthly[month] = metering_monthly.get(month, 0) + day_total + + print(f'\n {"Month":<10} {"Billing Export":>15} {"Usage Metering":>15} {"Ratio":>8}') + print(' ' + '-' * 50) + for r in billing_rows: + billing = r['net'] + metering = metering_monthly.get(r['month'], 0) + ratio = f'{billing / metering:.2f}x' if metering > 0 else '--' + print(f' {r["month"]:<10} {fmt_usd(billing):>15} {fmt_usd(metering):>15} {ratio:>8}') + + +def cmd_status(args): + """Show what data we have cached.""" + db = get_db() + meta = {r['key']: r['value'] for r in db.execute('SELECT * FROM gcp_billing_meta').fetchall()} + billing_count = db.execute('SELECT COUNT(*) as c FROM gcp_billing').fetchone()['c'] + billing_range = db.execute('SELECT MIN(date) as mn, MAX(date) as mx FROM gcp_billing').fetchone() + + print(f'\n Billing export cache:') + print(f' DB path: {DB_PATH}') + print(f' Table: {meta.get("table", "(not set)")}') + print(f' Last fetch: {meta.get("last_fetch", "(never)")}') + print(f' Rows: {billing_count}') + if billing_count: + print(f' Date range: {billing_range["mn"]} to {billing_range["mx"]}') + + # Also check billing export table status + try: + from google.cloud import bigquery + client = bigquery.Client(project=args.project) + table_id = 'testnet-440309.testnet440309billing.gcp_billing_export_v1_01EA8B_291C89_753ABC' + t = client.get_table(table_id) + print(f'\n BigQuery billing export:') + print(f' Table: {table_id}') + print(f' Rows: {t.num_rows}') + print(f' Modified: {t.modified}') + if t.num_rows > 
0: + print(f' STATUS: Data available! Run "fetch --table {table_id}" to cache it.') + else: + print(f' STATUS: Not yet populated. GCP takes up to 24h after enabling export.') + except Exception as e: + print(f'\n BigQuery check failed: {e}') + + +def cmd_metering(args): + """Query both usage metering tables and compare with different approaches.""" + from google.cloud import bigquery + project = args.project + client = bigquery.Client(project=project) + months = args.months + + end_date = datetime.now(timezone.utc).date() + start_date = end_date - timedelta(days=months * 31) + + # Table names + usage_table = f'{project}.egress_consumption.gke_cluster_resource_usage' + consumption_table = f'{project}.egress_consumption.gke_cluster_resource_consumption' + + print(f'Date range: {start_date} to {end_date}') + + # 1. Current approach: usage table with our SKU pricing + print('\n=== Approach 1: gke_cluster_resource_usage (requests) with hardcoded SKU prices ===') + _query_metering_table(client, usage_table, start_date, end_date, 'REQUESTS') + + # 2. Consumption table with our SKU pricing + print('\n=== Approach 2: gke_cluster_resource_consumption (actual) with hardcoded SKU prices ===') + _query_metering_table(client, consumption_table, start_date, end_date, 'CONSUMPTION') + + # 3. Raw totals: what does each table report? 
+ print('\n=== Approach 3: Raw resource totals from both tables ===') + for tname, label in [(usage_table, 'REQUESTS'), (consumption_table, 'CONSUMPTION')]: + query = f""" + SELECT + FORMAT_DATE('%Y-%m', DATE(start_time)) AS month, + resource_name, + SUM(usage.amount) AS total_amount, + usage.unit + FROM `{tname}` + WHERE DATE(start_time) BETWEEN @start AND @end + GROUP BY month, resource_name, usage.unit + ORDER BY month, resource_name + """ + job_config = bigquery.QueryJobConfig(query_parameters=[ + bigquery.ScalarQueryParameter('start', 'DATE', start_date.isoformat()), + bigquery.ScalarQueryParameter('end', 'DATE', end_date.isoformat()), + ]) + rows = list(client.query(query, job_config=job_config).result()) + print(f'\n {label} table raw resources:') + print(f' {"Month":<10} {"Resource":<20} {"Amount":>20} {"Unit":<15}') + print(' ' + '-' * 67) + for r in rows: + print(f' {r.month:<10} {r.resource_name:<20} {r.total_amount:>20,.0f} {r.unit:<15}') + + # 4. Count distinct SKUs + print('\n=== Approach 4: Distinct SKUs in usage table ===') + query = f""" + SELECT sku_id, resource_name, COUNT(*) as row_count, + SUM(usage.amount) as total_amount, usage.unit + FROM `{usage_table}` + WHERE DATE(start_time) BETWEEN @start AND @end + GROUP BY sku_id, resource_name, usage.unit + ORDER BY total_amount DESC + """ + job_config = bigquery.QueryJobConfig(query_parameters=[ + bigquery.ScalarQueryParameter('start', 'DATE', start_date.isoformat()), + bigquery.ScalarQueryParameter('end', 'DATE', end_date.isoformat()), + ]) + rows = list(client.query(query, job_config=job_config).result()) + # Import pricing to check + from billing.gcp import _SKU_PRICING + print(f' {"SKU ID":<20} {"Resource":<20} {"Rows":>10} {"Amount":>18} {"Unit":<12} {"Known?"}') + print(' ' + '-' * 90) + for r in rows: + known = 'YES' if r.sku_id in _SKU_PRICING else 'MISSING' + print(f' {r.sku_id:<20} {r.resource_name:<20} {r.row_count:>10,} {r.total_amount:>18,.0f} {r.unit:<12} {known}') + + +def 
_query_metering_table(client, table, start_date, end_date, label): + """Query a metering table and compute costs using our SKU pricing.""" + from google.cloud import bigquery + from billing.gcp import _SKU_PRICING, _usage_to_cost + + query = f""" + SELECT + FORMAT_DATE('%Y-%m', DATE(start_time)) AS month, + namespace, + sku_id, + resource_name, + SUM(usage.amount) AS total_usage + FROM `{table}` + WHERE DATE(start_time) BETWEEN @start AND @end + GROUP BY month, namespace, sku_id, resource_name + ORDER BY month, namespace + """ + job_config = bigquery.QueryJobConfig(query_parameters=[ + bigquery.ScalarQueryParameter('start', 'DATE', start_date.isoformat()), + bigquery.ScalarQueryParameter('end', 'DATE', end_date.isoformat()), + ]) + rows = list(client.query(query, job_config=job_config).result()) + + monthly = {} + monthly_by_cat = {} + missing_skus = set() + for r in rows: + cost, category = _usage_to_cost(r.sku_id, r.resource_name, float(r.total_usage)) + if r.sku_id not in _SKU_PRICING: + missing_skus.add(r.sku_id) + month = r.month + monthly[month] = monthly.get(month, 0) + cost + key = (month, category) + monthly_by_cat[key] = monthly_by_cat.get(key, 0) + cost + + print(f' {"Month":<10} {"Total":>12} {"compute_spot":>14} {"compute_od":>14} {"network":>10} {"storage":>10}') + print(' ' + '-' * 74) + for month in sorted(monthly.keys()): + total = monthly[month] + spot = monthly_by_cat.get((month, 'compute_spot'), 0) + od = monthly_by_cat.get((month, 'compute_ondemand'), 0) + net = monthly_by_cat.get((month, 'network'), 0) + stor = monthly_by_cat.get((month, 'storage'), 0) + print(f' {month:<10} {fmt_usd(total):>12} {fmt_usd(spot):>14} {fmt_usd(od):>14} {fmt_usd(net):>10} {fmt_usd(stor):>10}') + + if missing_skus: + print(f'\n WARNING: {len(missing_skus)} unknown SKU IDs (not priced): {", ".join(sorted(missing_skus)[:5])}...') + + +# ---- Main ---- + +def main(): + parser = argparse.ArgumentParser(description='Explore GCP billing data') + 
parser.add_argument('--project', default='testnet-440309', help='GCP project ID') + parser.add_argument('--table', default='', help='BigQuery billing export table') + sub = parser.add_subparsers(dest='command') + + sub.add_parser('discover', help='Find billing export tables') + + fetch_p = sub.add_parser('fetch', help='Fetch billing data from BigQuery') + fetch_p.add_argument('--months', type=int, default=6, help='How many months back to fetch') + + monthly_p = sub.add_parser('monthly', help='Monthly totals') + monthly_p.add_argument('--by', choices=['service', 'sku', 'project'], default='', help='Group by') + monthly_p.add_argument('--month', default='', help='Filter to month (YYYY-MM)') + + daily_p = sub.add_parser('daily', help='Daily costs') + daily_p.add_argument('--month', default='', help='Month to show (YYYY-MM)') + + top_p = sub.add_parser('top', help='Top cost items') + top_p.add_argument('--month', default='', help='Month to show (YYYY-MM)') + + sub.add_parser('compare', help='Compare billing export vs usage metering') + sub.add_parser('status', help='Show data status (what we have cached)') + + meter_p = sub.add_parser('metering', help='Query both metering tables directly and compare') + meter_p.add_argument('--months', type=int, default=6, help='How many months back') + + args = parser.parse_args() + + if not args.command: + parser.print_help() + sys.exit(1) + + cmds = { + 'discover': cmd_discover, + 'fetch': cmd_fetch, + 'monthly': cmd_monthly, + 'daily': cmd_daily, + 'top': cmd_top, + 'compare': cmd_compare, + 'metering': cmd_metering, + 'status': cmd_status, + } + cmds[args.command](args) + + +if __name__ == '__main__': + main() diff --git a/ci3/ci-metrics/billing/fetch_billing.py b/ci3/ci-metrics/billing/fetch_billing.py new file mode 100644 index 000000000000..271a788fc6bd --- /dev/null +++ b/ci3/ci-metrics/billing/fetch_billing.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +"""Fetch namespace billing data from GKE resource consumption metering in 
BigQuery. + +Queries the GKE cluster resource consumption table which records CPU and memory +usage per namespace per pod. Actual GCP SKU prices (from the Cloud Billing +Catalog API) are applied to convert resource usage into dollar costs. + +Categories produced: + - compute_spot (Spot / Preemptible VM cores + RAM) + - compute_ondemand (On-demand VM cores + RAM) + +Usage: + # Fetch last 30 days + python fetch-billing.py + + # Specific range + python fetch-billing.py --from 2026-01-01 --to 2026-01-31 + + # Custom output directory + python fetch-billing.py --output-dir /tmp/billing + +Environment: + Requires Application Default Credentials or GOOGLE_APPLICATION_CREDENTIALS. + pip install google-cloud-bigquery +""" +import argparse +import json +import os +import sys +from datetime import datetime, timedelta + +from google.cloud import bigquery + +# ---- defaults ---- +DEFAULT_PROJECT = 'testnet-440309' +DEFAULT_DATASET = 'egress_consumption' +DEFAULT_TABLE_CONSUMPTION = 'gke_cluster_resource_consumption' +DEFAULT_TABLE_USAGE = 'gke_cluster_resource_usage' +DEFAULT_OUTPUT_DIR = os.path.join( + os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'billing' +) + +# ---- SKU pricing ---- +# Prices sourced from GCP Cloud Billing Catalog API for us-west1. 
+SKU_PRICING = { + # Compute - Spot (per vCPU-hour / per GiB-hour) + 'E7FF-A0FB-FA82': {'price': 0.00497, 'resource': 'cpu', 'category': 'compute_spot'}, + '48AB-89F5-9112': {'price': 0.000668, 'resource': 'memory', 'category': 'compute_spot'}, + # Compute - On-demand T2D + 'EFE6-E23C-19CB': {'price': 0.027502, 'resource': 'cpu', 'category': 'compute_ondemand'}, + 'FB05-036A-8982': {'price': 0.003686, 'resource': 'memory', 'category': 'compute_ondemand'}, + # Compute - On-demand N2 + 'BB77-5FDA-69D9': {'price': 0.031611, 'resource': 'cpu', 'category': 'compute_ondemand'}, + '5B01-D157-A097': {'price': 0.004237, 'resource': 'memory', 'category': 'compute_ondemand'}, + # Compute - On-demand N2D + 'A03E-E620-7389': {'price': 0.027502, 'resource': 'cpu', 'category': 'compute_ondemand'}, + '5535-6D2D-4B50': {'price': 0.003686, 'resource': 'memory', 'category': 'compute_ondemand'}, + # Network Egress (per GiB) + '0C3C-6B13-B1E8': {'price': 0.02, 'resource': 'networkEgress', 'category': 'network'}, + '6B8F-E63D-832B': {'price': 0.0, 'resource': 'networkEgress', 'category': 'network'}, + '92CB-C25F-B1D1': {'price': 0.0, 'resource': 'networkEgress', 'category': 'network'}, + '984A-1F27-2D1F': {'price': 0.04, 'resource': 'networkEgress', 'category': 'network'}, + '9DE9-9092-B3BC': {'price': 0.20, 'resource': 'networkEgress', 'category': 'network'}, + 'C863-37DA-506E': {'price': 0.02, 'resource': 'networkEgress', 'category': 'network'}, + 'C8EA-1A86-3D28': {'price': 0.02, 'resource': 'networkEgress', 'category': 'network'}, + 'DE9E-AFBC-A15A': {'price': 0.01, 'resource': 'networkEgress', 'category': 'network'}, + 'DFA5-B5C6-36D6': {'price': 0.085, 'resource': 'networkEgress', 'category': 'network'}, + 'F274-1692-F213': {'price': 0.08, 'resource': 'networkEgress', 'category': 'network'}, + 'FDBC-6E3B-D4D8': {'price': 0.15, 'resource': 'networkEgress', 'category': 'network'}, + # Storage (per GiB-month) + 'D973-5D65-BAB2': {'price': 0.04, 'resource': 'storage', 'category': 
'storage'}, +} + + +def usage_to_cost(sku_id: str, resource_name: str, amount: float) -> tuple[float, str]: + """Convert raw usage amount to dollar cost. Returns (cost_usd, category).""" + info = SKU_PRICING.get(sku_id) + if not info: + return 0.0, 'other' + + price = info['price'] + if resource_name == 'cpu': + return (amount / 3600.0) * price, info['category'] + elif resource_name == 'memory': + return (amount / 3600.0 / (1024 ** 3)) * price, info['category'] + elif resource_name.startswith('networkEgress'): + return (amount / (1024 ** 3)) * price, info['category'] + elif resource_name == 'storage': + gib_months = amount / (1024 ** 3) / (730 * 3600) + return gib_months * price, info['category'] + return 0.0, info['category'] + + +# ---- BigQuery query ---- + +def fetch_usage_rows( + client: bigquery.Client, + project: str, + dataset: str, + date_from: str, + date_to: str, +) -> list[dict]: + """Query both metering tables for daily usage by namespace + SKU.""" + consumption = f'{project}.{dataset}.{DEFAULT_TABLE_CONSUMPTION}' + usage = f'{project}.{dataset}.{DEFAULT_TABLE_USAGE}' + query = f""" + SELECT date, namespace, sku_id, resource_name, SUM(total_usage) AS total_usage FROM ( + SELECT DATE(start_time) AS date, namespace, sku_id, resource_name, SUM(usage.amount) AS total_usage + FROM `{consumption}` + WHERE DATE(start_time) BETWEEN @date_from AND @date_to + GROUP BY date, namespace, sku_id, resource_name + UNION ALL + SELECT DATE(start_time) AS date, namespace, sku_id, resource_name, SUM(usage.amount) AS total_usage + FROM `{usage}` + WHERE DATE(start_time) BETWEEN @date_from AND @date_to + AND resource_name IN ('networkEgress', 'storage') + GROUP BY date, namespace, sku_id, resource_name + ) + GROUP BY date, namespace, sku_id, resource_name + ORDER BY date, namespace + """ + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter('date_from', 'DATE', date_from), + bigquery.ScalarQueryParameter('date_to', 'DATE', date_to), + 
+    ]
+    )
+    rows = client.query(query, job_config=job_config).result()
+    # Materialize BigQuery Row objects as plain dicts so downstream code is
+    # decoupled from the bigquery client types.
+    return [dict(row) for row in rows]
+
+
+# ---- aggregate into daily JSON ----
+
+# NOTE(review): line structure below reconstructed from a flattened patch chunk;
+# confirm exact formatting against the original commit before applying.
+def build_daily_files(rows: list[dict]) -> tuple[dict[str, dict], set[str]]:
+    """Convert raw usage rows into daily billing JSON structures.
+
+    Returns (days_dict, unknown_skus).
+    """
+    days: dict[str, dict] = {}
+    unknown_skus: set[str] = set()
+
+    for row in rows:
+        # BigQuery returns DATE columns as datetime.date; tolerate strings too.
+        date_str = (
+            row['date'].isoformat()
+            if hasattr(row['date'], 'isoformat')
+            else str(row['date'])
+        )
+        ns = row['namespace']
+        sku_id = row['sku_id']
+        resource_name = row['resource_name']
+        amount = float(row['total_usage'])
+
+        cost, category = usage_to_cost(sku_id, resource_name, amount)
+
+        # Record unknown SKUs *before* the zero-cost skip below, so unpriced
+        # SKUs (which cost out at 0) are still surfaced to the caller.
+        if sku_id not in SKU_PRICING:
+            unknown_skus.add(sku_id)
+
+        if cost <= 0:
+            continue
+
+        if date_str not in days:
+            days[date_str] = {'date': date_str, 'namespaces': {}}
+        if ns not in days[date_str]['namespaces']:
+            days[date_str]['namespaces'][ns] = {'total': 0, 'breakdown': {}}
+
+        entry = days[date_str]['namespaces'][ns]
+        entry['breakdown'][category] = (
+            entry['breakdown'].get(category, 0) + cost
+        )
+        entry['total'] += cost
+
+    # Round
+    for day in days.values():
+        for ns_data in day['namespaces'].values():
+            ns_data['total'] = round(ns_data['total'], 4)
+            ns_data['breakdown'] = {
+                k: round(v, 4) for k, v in ns_data['breakdown'].items()
+            }
+
+    return days, unknown_skus
+
+
+def write_files(days: dict[str, dict], output_dir: str) -> int:
+    """Write one <date>.json file per day into output_dir; return file count."""
+    os.makedirs(output_dir, exist_ok=True)
+    count = 0
+    for date_str, data in sorted(days.items()):
+        filepath = os.path.join(output_dir, f'{date_str}.json')
+        with open(filepath, 'w') as f:
+            json.dump(data, f, indent=2)
+        count += 1
+    return count
+
+
+# ---- CLI ----
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Fetch GKE namespace compute billing from resource consumption metering'
+    )
+    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12+;
+    # datetime.now(timezone.utc) is the modern equivalent — consider migrating.
+    today = datetime.utcnow().strftime('%Y-%m-%d')
+    default_from = (datetime.utcnow() -
timedelta(days=30)).strftime('%Y-%m-%d') + + parser.add_argument('--from', dest='date_from', default=default_from, + help='Start date YYYY-MM-DD (default: 30 days ago)') + parser.add_argument('--to', dest='date_to', default=today, + help='End date YYYY-MM-DD (default: today)') + parser.add_argument('--project', default=DEFAULT_PROJECT, + help=f'GCP project ID (default: {DEFAULT_PROJECT})') + parser.add_argument('--dataset', default=DEFAULT_DATASET, + help=f'BigQuery dataset (default: {DEFAULT_DATASET})') + parser.add_argument('--output-dir', default=DEFAULT_OUTPUT_DIR, + help=f'Output directory (default: {DEFAULT_OUTPUT_DIR})') + args = parser.parse_args() + + print(f'Connecting to BigQuery ({args.project})...') + client = bigquery.Client(project=args.project) + + print(f'Fetching metering data {args.date_from} to {args.date_to}...') + print(f' consumption: {args.project}.{args.dataset}.{DEFAULT_TABLE_CONSUMPTION}') + print(f' usage: {args.project}.{args.dataset}.{DEFAULT_TABLE_USAGE}') + rows = fetch_usage_rows( + client, args.project, args.dataset, + args.date_from, args.date_to, + ) + print(f'Got {len(rows)} aggregated rows') + + if not rows: + print('No metering data found. Check that:') + print(' 1. GKE resource consumption metering is enabled') + print(' 2. 
The date range has data') + return + + days, unknown_skus = build_daily_files(rows) + count = write_files(days, args.output_dir) + print(f'Wrote {count} daily billing files to {args.output_dir}') + + if unknown_skus: + print(f'\nWARNING: {len(unknown_skus)} unknown SKU(s) had zero cost assigned:') + for s in sorted(unknown_skus): + print(f' {s}') + print('Add these to SKU_PRICING in fetch-billing.py with prices from') + print('the GCP Cloud Billing Catalog API.') + + # Summary + total = sum( + ns['total'] for day in days.values() + for ns in day['namespaces'].values() + ) + ns_set: set[str] = set() + cat_set: set[str] = set() + for day in days.values(): + for ns_name, ns_data in day['namespaces'].items(): + ns_set.add(ns_name) + cat_set.update(ns_data['breakdown'].keys()) + + print(f'\nTotal cost: ${total:,.2f}') + print(f'Namespaces ({len(ns_set)}): {sorted(ns_set)}') + print(f'Categories: {sorted(cat_set)}') + + +if __name__ == '__main__': + main() diff --git a/ci3/ci-metrics/billing/gcp.py b/ci3/ci-metrics/billing/gcp.py new file mode 100644 index 000000000000..5254e20bbbf0 --- /dev/null +++ b/ci3/ci-metrics/billing/gcp.py @@ -0,0 +1,289 @@ +"""Namespace billing helpers for rkapp. + +Fetches GKE namespace billing from BigQuery with in-memory cache. +Route definitions remain in rk.py; this module provides the logic. + +SKU pricing: Queries the Cloud Billing pricing export table in BigQuery +if available, otherwise falls back to hardcoded rates. To enable the +pricing export: + 1. Go to GCP Console > Billing > Billing export + 2. Enable "Detailed usage cost" and "Pricing" exports + 3. Set the dataset to the _BQ_DATASET below +""" +import threading +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path + +# BigQuery defaults +_BQ_PROJECT = 'testnet-440309' +_BQ_DATASET = 'egress_consumption' +_BQ_TABLE_USAGE = 'gke_cluster_resource_usage' +_BQ_TABLE_PRICING = 'cloud_pricing_export' + +# Hardcoded fallback SKU pricing (us-west1). 
# cpu: price per vCPU-hour, memory: price per GiB-hour
# network: price per GiB, storage: price per GiB-month
_HARDCODED_SKU_PRICING = {
    # Compute - Spot
    'E7FF-A0FB-FA82': {'price': 0.00497, 'resource': 'cpu', 'category': 'compute_spot'},
    '48AB-89F5-9112': {'price': 0.000668, 'resource': 'memory', 'category': 'compute_spot'},
    # Compute - On-demand T2D
    'EFE6-E23C-19CB': {'price': 0.027502, 'resource': 'cpu', 'category': 'compute_ondemand'},
    'FB05-036A-8982': {'price': 0.003686, 'resource': 'memory', 'category': 'compute_ondemand'},
    # Compute - On-demand N2
    'BB77-5FDA-69D9': {'price': 0.031611, 'resource': 'cpu', 'category': 'compute_ondemand'},
    '5B01-D157-A097': {'price': 0.004237, 'resource': 'memory', 'category': 'compute_ondemand'},
    # Compute - On-demand N2D
    'A03E-E620-7389': {'price': 0.027502, 'resource': 'cpu', 'category': 'compute_ondemand'},
    '5535-6D2D-4B50': {'price': 0.003686, 'resource': 'memory', 'category': 'compute_ondemand'},
    # Network Egress (price per GiB)
    '0C3C-6B13-B1E8': {'price': 0.02, 'resource': 'networkEgress', 'category': 'network'},
    '6B8F-E63D-832B': {'price': 0.0, 'resource': 'networkEgress', 'category': 'network'},
    '92CB-C25F-B1D1': {'price': 0.0, 'resource': 'networkEgress', 'category': 'network'},
    '984A-1F27-2D1F': {'price': 0.04, 'resource': 'networkEgress', 'category': 'network'},
    '9DE9-9092-B3BC': {'price': 0.20, 'resource': 'networkEgress', 'category': 'network'},
    'C863-37DA-506E': {'price': 0.02, 'resource': 'networkEgress', 'category': 'network'},
    'C8EA-1A86-3D28': {'price': 0.02, 'resource': 'networkEgress', 'category': 'network'},
    'DE9E-AFBC-A15A': {'price': 0.01, 'resource': 'networkEgress', 'category': 'network'},
    'DFA5-B5C6-36D6': {'price': 0.085, 'resource': 'networkEgress', 'category': 'network'},
    'F274-1692-F213': {'price': 0.08, 'resource': 'networkEgress', 'category': 'network'},
    'FDBC-6E3B-D4D8': {'price': 0.15, 'resource': 'networkEgress', 'category': 'network'},
    # Storage (price per GiB-month)
    'D973-5D65-BAB2': {'price': 0.04, 'resource': 'storage', 'category': 'storage'},
}

# Resource name to category mapping for SKUs discovered from BigQuery
_RESOURCE_CATEGORIES = {
    ('cpu', True): 'compute_spot',
    ('cpu', False): 'compute_ondemand',
    ('memory', True): 'compute_spot',
    ('memory', False): 'compute_ondemand',
}

# Active SKU pricing — updated from BigQuery if available
_SKU_PRICING = dict(_HARDCODED_SKU_PRICING)

# In-memory caches
_cache = {'data': [], 'ts': 0}
_cache_lock = threading.Lock()
_CACHE_TTL = 6 * 3600  # 6 hours

_pricing_cache = {'ts': 0}
_pricing_lock = threading.Lock()
_PRICING_CACHE_TTL = 24 * 3600  # 24 hours


def _refresh_sku_pricing():
    """Try to fetch SKU pricing from BigQuery pricing export table.

    Refreshes the module-level _SKU_PRICING at most once per
    _PRICING_CACHE_TTL. Any failure (missing client library, missing
    table) stamps the cache anyway and leaves the hardcoded rates active.
    """
    global _SKU_PRICING
    now = time.time()
    if _pricing_cache['ts'] and now - _pricing_cache['ts'] < _PRICING_CACHE_TTL:
        return
    # Non-blocking: if another thread is already refreshing, just skip.
    if not _pricing_lock.acquire(blocking=False):
        return
    try:
        # Double-check after acquiring the lock: the previous holder may
        # have just refreshed.
        if _pricing_cache['ts'] and time.time() - _pricing_cache['ts'] < _PRICING_CACHE_TTL:
            return
        from google.cloud import bigquery
        client = bigquery.Client(project=_BQ_PROJECT)
        table = f'{_BQ_PROJECT}.{_BQ_DATASET}.{_BQ_TABLE_PRICING}'

        # Get the known SKU IDs we need pricing for
        sku_ids = list(_HARDCODED_SKU_PRICING.keys())
        placeholders = ', '.join(f"'{s}'" for s in sku_ids)

        query = f"""
        SELECT sku.id AS sku_id,
               pricing.effective_price AS price,
               sku.description AS description
        FROM `{table}`
        WHERE sku.id IN ({placeholders})
          AND service.description = 'Compute Engine'
        QUALIFY ROW_NUMBER() OVER (PARTITION BY sku.id ORDER BY export_time DESC) = 1
        """
        rows = list(client.query(query).result())
        if rows:
            # Rebuild from the hardcoded base so resource/category metadata
            # is preserved; only the price is overridden.
            updated = dict(_HARDCODED_SKU_PRICING)
            for row in rows:
                sid = row.sku_id
                if sid in updated:
                    updated[sid] = {**updated[sid], 'price': float(row.price)}
            _SKU_PRICING = updated
            _pricing_cache['ts'] = time.time()
            print(f"[rk_billing] Updated {len(rows)} SKU prices from BigQuery")
        else:
            _pricing_cache['ts'] = time.time()
            print("[rk_billing] No pricing rows returned, using hardcoded rates")
    except Exception as e:
        # Table probably doesn't exist yet — use hardcoded rates
        _pricing_cache['ts'] = time.time()
        print(f"[rk_billing] SKU pricing query failed (using hardcoded): {e}")
    finally:
        _pricing_lock.release()


# ---- BigQuery fetch ----

def _usage_to_cost(sku_id, resource_name, amount):
    """Convert a raw usage amount for one SKU into (cost_usd, category).

    Units per resource: cpu is vCPU-seconds, memory is byte-seconds,
    networkEgress* is bytes, storage is byte-seconds. Unknown SKUs cost 0
    and fall into the 'other' category.
    """
    info = _SKU_PRICING.get(sku_id)
    if not info:
        return 0.0, 'other'
    price = info['price']
    if resource_name == 'cpu':
        # cpu-seconds -> hours
        return (amount / 3600.0) * price, info['category']
    elif resource_name == 'memory':
        # byte-seconds -> GiB-hours
        return (amount / 3600.0 / (1024 ** 3)) * price, info['category']
    elif resource_name.startswith('networkEgress'):
        # bytes -> GiB
        return (amount / (1024 ** 3)) * price, info['category']
    elif resource_name == 'storage':
        # byte-seconds -> GiB-months (730 hours/month)
        gib_months = amount / (1024 ** 3) / (730 * 3600)
        return gib_months * price, info['category']
    return 0.0, info['category']


def _fetch_from_bigquery(date_from_str, date_to_str):
    """Query BigQuery for usage data, return list of daily billing entries.

    Returns [] on any failure (missing client library, query error) so
    callers can treat the result as best-effort.
    """
    try:
        from google.cloud import bigquery
    except ImportError:
        print("[rk_billing] google-cloud-bigquery not installed")
        return []

    try:
        client = bigquery.Client(project=_BQ_PROJECT)
        # Use the usage table for all resources (actual consumption, not just requests).
        # The consumption table only records resource *requests* which can be far lower
        # than actual usage (e.g. prove-n-tps-real: $2.87 requests vs $138.72 actual).
        usage = f'{_BQ_PROJECT}.{_BQ_DATASET}.{_BQ_TABLE_USAGE}'
        query = f"""
        SELECT DATE(start_time) AS date, namespace, sku_id, resource_name,
               SUM(usage.amount) AS total_usage
        FROM `{usage}`
        WHERE DATE(start_time) BETWEEN @date_from AND @date_to
        GROUP BY date, namespace, sku_id, resource_name
        ORDER BY date, namespace
        """
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter('date_from', 'DATE', date_from_str),
                bigquery.ScalarQueryParameter('date_to', 'DATE', date_to_str),
            ]
        )
        rows = list(client.query(query, job_config=job_config).result())
    except Exception as e:
        print(f"[rk_billing] BigQuery fetch failed: {e}")
        return []

    # Build daily structures: {date: {'date': ..., 'namespaces': {ns: {'total', 'breakdown'}}}}
    days = {}
    for row in rows:
        date_str = row.date.isoformat() if hasattr(row.date, 'isoformat') else str(row.date)
        ns = row.namespace
        cost, category = _usage_to_cost(row.sku_id, row.resource_name, float(row.total_usage))
        if cost <= 0:
            continue
        if date_str not in days:
            days[date_str] = {'date': date_str, 'namespaces': {}}
        if ns not in days[date_str]['namespaces']:
            days[date_str]['namespaces'][ns] = {'total': 0, 'breakdown': {}}
        entry = days[date_str]['namespaces'][ns]
        entry['breakdown'][category] = entry['breakdown'].get(category, 0) + cost
        entry['total'] += cost

    # Round values
    for data in days.values():
        for ns_data in data['namespaces'].values():
            ns_data['total'] = round(ns_data['total'], 4)
            ns_data['breakdown'] = {k: round(v, 4) for k, v in ns_data['breakdown'].items()}

    return sorted(days.values(), key=lambda x: x['date'])


def _ensure_cached():
    """Populate the daily-billing cache if stale; non-blocking for concurrent callers."""
    now = time.time()
    if _cache['data'] and now - _cache['ts'] < _CACHE_TTL:
        return
    if not _cache_lock.acquire(blocking=False):
        return
    try:
        # Double-check after acquiring the lock (mirrors _refresh_sku_pricing):
        # the previous lock holder may have just refreshed the cache.
        if _cache['data'] and time.time() - _cache['ts'] < _CACHE_TTL:
            return
        yesterday = datetime.now(timezone.utc).date() - timedelta(days=1)
        date_from = (yesterday - timedelta(days=365)).isoformat()
        date_to = yesterday.isoformat()
        print(f"[rk_billing] Fetching billing data from BigQuery ({date_from} to {date_to})...")
        data = _fetch_from_bigquery(date_from, date_to)
        if data:
            _cache['data'] = data
            # Stamp with the post-fetch time so the TTL measures cache age,
            # not the moment the (potentially slow) fetch started.
            _cache['ts'] = time.time()
            print(f"[rk_billing] Cached {len(data)} days of billing data")
    finally:
        _cache_lock.release()


# ---- Public API ----

def get_billing_files_in_range(date_from, date_to):
    """Return billing data for dates in range. Fetches from BigQuery with in-memory cache.

    Accepts datetime-like or 'YYYY-MM-DD' string bounds (inclusive).
    """
    # Refresh SKU pricing from BigQuery (async, falls back to hardcoded)
    threading.Thread(target=_refresh_sku_pricing, daemon=True).start()

    if not _cache['data']:
        _ensure_cached()  # block on first load so dashboard isn't empty
    else:
        threading.Thread(target=_ensure_cached, daemon=True).start()

    # Convert datetime args to date strings for filtering
    from_str = date_from.strftime('%Y-%m-%d') if hasattr(date_from, 'strftime') else str(date_from)
    to_str = date_to.strftime('%Y-%m-%d') if hasattr(date_to, 'strftime') else str(date_to)

    return [e for e in _cache['data'] if from_str <= e['date'] <= to_str]


def _merge_ns_billing(target, ns_data):
    """Accumulate one namespace's total and per-category breakdown into *target*."""
    target['total'] += ns_data.get('total', 0)
    for cat, val in ns_data.get('breakdown', {}).items():
        target['breakdown'][cat] = target['breakdown'].get(cat, 0) + val


def _round_aggregated(buckets):
    """Round merged totals/breakdowns to 4 dp, consistent with the daily entries."""
    for data in buckets.values():
        for ns_data in data['namespaces'].values():
            ns_data['total'] = round(ns_data['total'], 4)
            ns_data['breakdown'] = {k: round(v, 4) for k, v in ns_data['breakdown'].items()}


def aggregate_billing_weekly(daily_data):
    """Roll daily billing entries up into weeks keyed by the week's Monday.

    Returns a list of {'date', 'namespaces'} entries sorted by date.
    """
    if not daily_data:
        return []
    weeks = {}
    for entry in daily_data:
        d = datetime.strptime(entry['date'], '%Y-%m-%d')
        week_start = d - timedelta(days=d.weekday())
        week_key = week_start.strftime('%Y-%m-%d')
        if week_key not in weeks:
            weeks[week_key] = {'date': week_key, 'namespaces': {}}
        for ns, ns_data in entry.get('namespaces', {}).items():
            if ns not in weeks[week_key]['namespaces']:
                weeks[week_key]['namespaces'][ns] = {'total': 0, 'breakdown': {}}
            _merge_ns_billing(weeks[week_key]['namespaces'][ns], ns_data)
    # Round like the daily entries so summed float noise never leaks to the API.
    _round_aggregated(weeks)
    return sorted(weeks.values(), key=lambda x: x['date'])


def aggregate_billing_monthly(daily_data):
    """Roll daily billing entries up into months keyed by the month's first day.

    Returns a list of {'date', 'namespaces'} entries sorted by date.
    """
    if not daily_data:
        return []
    months = {}
    for entry in daily_data:
        month_key = entry['date'][:7] + '-01'
        if month_key not in months:
            months[month_key] = {'date': month_key, 'namespaces': {}}
        for ns, ns_data in entry.get('namespaces', {}).items():
            if ns not in months[month_key]['namespaces']:
                months[month_key]['namespaces'][ns] = {'total': 0, 'breakdown': {}}
            _merge_ns_billing(months[month_key]['namespaces'][ns], ns_data)
    # Round like the daily entries so summed float noise never leaks to the API.
    _round_aggregated(months)
    return sorted(months.values(), key=lambda x: x['date'])
in daily_data: + month_key = entry['date'][:7] + '-01' + if month_key not in months: + months[month_key] = {'date': month_key, 'namespaces': {}} + for ns, ns_data in entry.get('namespaces', {}).items(): + if ns not in months[month_key]['namespaces']: + months[month_key]['namespaces'][ns] = {'total': 0, 'breakdown': {}} + _merge_ns_billing(months[month_key]['namespaces'][ns], ns_data) + return sorted(months.values(), key=lambda x: x['date']) + + +def serve_billing_dashboard(): + billing_html_path = Path(__file__).parent / 'billing-dashboard.html' + if billing_html_path.exists(): + with billing_html_path.open('r') as f: + return f.read() + return None diff --git a/ci3/ci-metrics/ci-run-seed.json.gz b/ci3/ci-metrics/ci-run-seed.json.gz new file mode 100644 index 000000000000..a971ad10d38b Binary files /dev/null and b/ci3/ci-metrics/ci-run-seed.json.gz differ diff --git a/ci3/ci-metrics/db.py b/ci3/ci-metrics/db.py new file mode 100644 index 000000000000..93e970fe3a56 --- /dev/null +++ b/ci3/ci-metrics/db.py @@ -0,0 +1,107 @@ +"""SQLite database for CI metrics storage. + +Stores test events (from Redis pub/sub) and merge queue daily stats +(backfilled from GitHub API). 
+""" +import os +import sqlite3 +import threading + +_DB_PATH = os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db') +_local = threading.local() + +SCHEMA = """ +PRAGMA journal_mode=WAL; + +CREATE TABLE IF NOT EXISTS test_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + status TEXT NOT NULL, + test_cmd TEXT NOT NULL, + log_url TEXT, + ref_name TEXT NOT NULL, + commit_hash TEXT, + commit_author TEXT, + commit_msg TEXT, + exit_code INTEGER, + duration_secs REAL, + is_scenario INTEGER DEFAULT 0, + owners TEXT, + flake_group_id TEXT, + dashboard TEXT NOT NULL DEFAULT '', + timestamp TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_test_events_status ON test_events(status); +CREATE INDEX IF NOT EXISTS idx_test_events_ts ON test_events(timestamp); +CREATE INDEX IF NOT EXISTS idx_test_events_cmd ON test_events(test_cmd); +CREATE INDEX IF NOT EXISTS idx_test_events_dashboard ON test_events(dashboard); + +CREATE TABLE IF NOT EXISTS merge_queue_daily ( + date TEXT PRIMARY KEY, + total INTEGER NOT NULL DEFAULT 0, + success INTEGER NOT NULL DEFAULT 0, + failure INTEGER NOT NULL DEFAULT 0, + cancelled INTEGER NOT NULL DEFAULT 0, + in_progress INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE IF NOT EXISTS ci_runs ( + dashboard TEXT NOT NULL, + name TEXT NOT NULL DEFAULT '', + timestamp_ms INTEGER NOT NULL, + complete_ms INTEGER, + status TEXT, + author TEXT, + pr_number INTEGER, + instance_type TEXT, + instance_vcpus INTEGER, + spot INTEGER DEFAULT 0, + cost_usd REAL, + job_id TEXT DEFAULT '', + arch TEXT DEFAULT '', + synced_at TEXT NOT NULL, + PRIMARY KEY (dashboard, timestamp_ms, name) +); +CREATE INDEX IF NOT EXISTS idx_ci_runs_ts ON ci_runs(timestamp_ms); +CREATE INDEX IF NOT EXISTS idx_ci_runs_name ON ci_runs(name); +CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard); +""" + + +_MIGRATIONS = [ + # Add columns introduced after initial schema + "ALTER TABLE ci_runs ADD COLUMN instance_vcpus INTEGER", + "ALTER TABLE ci_runs ADD COLUMN 
job_id TEXT DEFAULT ''", + "ALTER TABLE ci_runs ADD COLUMN arch TEXT DEFAULT ''", + "CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard)", +] + + +def get_db() -> sqlite3.Connection: + conn = getattr(_local, 'conn', None) + if conn is None: + os.makedirs(os.path.dirname(_DB_PATH), exist_ok=True) + conn = sqlite3.connect(_DB_PATH) + conn.execute('PRAGMA busy_timeout = 5000') + conn.row_factory = sqlite3.Row + conn.executescript(SCHEMA) + # Run migrations (ignore "duplicate column" errors for idempotency) + for sql in _MIGRATIONS: + try: + conn.execute(sql) + except sqlite3.OperationalError: + pass + conn.commit() + _local.conn = conn + return conn + + +def query(sql: str, params=()) -> list[dict]: + conn = get_db() + rows = conn.execute(sql, params).fetchall() + return [dict(r) for r in rows] + + +def execute(sql: str, params=()): + conn = get_db() + conn.execute(sql, params) + conn.commit() diff --git a/ci3/ci-metrics/ec2_pricing.py b/ci3/ci-metrics/ec2_pricing.py new file mode 100644 index 000000000000..ace55ea4f40a --- /dev/null +++ b/ci3/ci-metrics/ec2_pricing.py @@ -0,0 +1,232 @@ +"""EC2 instance pricing: live on-demand + spot rates with TTL cache. + +Queries the AWS Pricing API (on-demand) and EC2 describe_spot_price_history +(spot) for us-east-2 instance rates. Caches results for 24 hours and falls +back to hardcoded values if the APIs are unavailable. 
+ +Exports: + get_instance_rate(instance_type, is_spot) -> float + get_fallback_vcpu_rate(is_spot) -> float +""" +import json +import threading +import time +from datetime import datetime, timezone + +# ---- Hardcoded fallback rates (us-east-2, USD/hr) ---- + +_HARDCODED_RATES = { + ('m6a.48xlarge', True): 8.31, # spot + ('m6a.48xlarge', False): 16.56, # on-demand + ('m6a.32xlarge', True): 5.54, + ('m6a.32xlarge', False): 11.04, + ('m6a.16xlarge', True): 2.77, + ('m6a.16xlarge', False): 5.52, + ('m7a.48xlarge', True): 8.31, + ('m7a.48xlarge', False): 16.56, + ('m7a.16xlarge', True): 2.77, + ('m7a.16xlarge', False): 5.52, + ('m7i.48xlarge', True): 8.31, + ('m7i.48xlarge', False): 16.56, + ('r7g.16xlarge', True): 1.97, + ('r7g.16xlarge', False): 3.94, +} +_FALLBACK_VCPU_HOUR = {True: 0.0433, False: 0.0864} + +# ---- Cache state ---- + +_REGION = 'us-east-2' +_LOCATION = 'US East (Ohio)' # Pricing API uses location names, not codes +_CACHE_TTL = 24 * 3600 # 24 hours + +_cache = { + 'ondemand': {}, # instance_type -> USD/hr + 'spot': {}, # instance_type -> USD/hr + 'ts': 0, # last successful fetch time +} +_cache_lock = threading.Lock() + + +# ---- On-demand pricing (AWS Pricing API) ---- + +def _fetch_ondemand_rate(pricing_client, instance_type: str) -> float | None: + """Fetch on-demand hourly rate for a single instance type from AWS Pricing API. + + The Pricing API is only available in us-east-1 and ap-south-1. 
+ """ + try: + response = pricing_client.get_products( + ServiceCode='AmazonEC2', + Filters=[ + {'Type': 'TERM_MATCH', 'Field': 'instanceType', 'Value': instance_type}, + {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': _LOCATION}, + {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'}, + {'Type': 'TERM_MATCH', 'Field': 'preInstalledSw', 'Value': 'NA'}, + {'Type': 'TERM_MATCH', 'Field': 'tenancy', 'Value': 'Shared'}, + {'Type': 'TERM_MATCH', 'Field': 'capacitystatus', 'Value': 'Used'}, + ], + MaxResults=10, + ) + for price_item in response.get('PriceList', []): + product = json.loads(price_item) if isinstance(price_item, str) else price_item + on_demand = product.get('terms', {}).get('OnDemand', {}) + for term in on_demand.values(): + for dim in term.get('priceDimensions', {}).values(): + price = dim.get('pricePerUnit', {}).get('USD') + if price and float(price) > 0: + return float(price) + except Exception as e: + print(f"[ec2_pricing] on-demand fetch error for {instance_type}: {e}") + return None + + +def _fetch_all_ondemand(instance_types: list[str]) -> dict[str, float]: + """Fetch on-demand rates for all instance types. Returns {type: rate}.""" + try: + import boto3 + except ImportError: + print("[ec2_pricing] boto3 not installed, skipping on-demand fetch") + return {} + + results = {} + try: + # Pricing API is only in us-east-1 and ap-south-1 + pricing = boto3.client('pricing', region_name='us-east-1') + for itype in instance_types: + rate = _fetch_ondemand_rate(pricing, itype) + if rate is not None: + results[itype] = rate + except Exception as e: + print(f"[ec2_pricing] on-demand client error: {e}") + return results + + +# ---- Spot pricing (EC2 describe_spot_price_history) ---- + +def _fetch_all_spot(instance_types: list[str]) -> dict[str, float]: + """Fetch current spot prices for all instance types. Returns {type: rate}. + + Uses describe_spot_price_history with StartTime=now to get the most recent + price. 
Takes the minimum across availability zones. + """ + try: + import boto3 + except ImportError: + print("[ec2_pricing] boto3 not installed, skipping spot fetch") + return {} + + results = {} + try: + ec2 = boto3.client('ec2', region_name=_REGION) + for itype in instance_types: + try: + response = ec2.describe_spot_price_history( + InstanceTypes=[itype], + ProductDescriptions=['Linux/UNIX'], + StartTime=datetime.now(timezone.utc), + MaxResults=10, + ) + prices = [] + for entry in response.get('SpotPriceHistory', []): + try: + prices.append(float(entry['SpotPrice'])) + except (KeyError, ValueError): + continue + if prices: + # Use the minimum AZ price (what our fleet would target) + results[itype] = min(prices) + except Exception as e: + print(f"[ec2_pricing] spot fetch error for {itype}: {e}") + except Exception as e: + print(f"[ec2_pricing] spot client error: {e}") + return results + + +# ---- Cache refresh ---- + +def _get_known_instance_types() -> list[str]: + """Return the set of instance types we need pricing for.""" + return sorted({itype for itype, _ in _HARDCODED_RATES}) + + +def _refresh_cache(): + """Fetch fresh pricing data and update the cache. 
Thread-safe.""" + now = time.time() + if _cache['ts'] and now - _cache['ts'] < _CACHE_TTL: + return + if not _cache_lock.acquire(blocking=False): + return # another thread is already refreshing + try: + # Double-check after acquiring lock + if _cache['ts'] and time.time() - _cache['ts'] < _CACHE_TTL: + return + + instance_types = _get_known_instance_types() + ondemand = _fetch_all_ondemand(instance_types) + spot = _fetch_all_spot(instance_types) + + # Only update cache if we got at least some data + if ondemand or spot: + if ondemand: + _cache['ondemand'] = ondemand + if spot: + _cache['spot'] = spot + _cache['ts'] = time.time() + print(f"[ec2_pricing] Cache refreshed: {len(ondemand)} on-demand, {len(spot)} spot rates") + else: + print("[ec2_pricing] No pricing data returned, keeping existing cache/fallbacks") + except Exception as e: + print(f"[ec2_pricing] Cache refresh error: {e}") + finally: + _cache_lock.release() + + +def _ensure_cached(): + """Ensure cache is populated. Blocks on first call, async refresh after.""" + if not _cache['ts']: + _refresh_cache() # block on first load + else: + threading.Thread(target=_refresh_cache, daemon=True).start() + + +# ---- Public API ---- + +def get_instance_rate(instance_type: str, is_spot: bool) -> float: + """Get the hourly rate for an EC2 instance type. + + Tries live pricing cache first, falls back to hardcoded rates. + + Args: + instance_type: EC2 instance type (e.g. 'm6a.48xlarge') + is_spot: True for spot pricing, False for on-demand + + Returns: + Hourly rate in USD. 
+ """ + _ensure_cached() + + # Try live cache + cache_key = 'spot' if is_spot else 'ondemand' + rate = _cache[cache_key].get(instance_type) + if rate is not None: + return rate + + # Fall back to hardcoded + rate = _HARDCODED_RATES.get((instance_type, is_spot)) + if rate is not None: + return rate + + # Unknown instance type -- return 0 (caller should use vCPU fallback) + return 0.0 + + +def get_fallback_vcpu_rate(is_spot: bool) -> float: + """Get the per-vCPU hourly rate for unknown instance types. + + Args: + is_spot: True for spot, False for on-demand + + Returns: + Per-vCPU hourly rate in USD. + """ + return _FALLBACK_VCPU_HOUR[is_spot] diff --git a/ci3/ci-metrics/github_data.py b/ci3/ci-metrics/github_data.py new file mode 100644 index 000000000000..8824d187cb81 --- /dev/null +++ b/ci3/ci-metrics/github_data.py @@ -0,0 +1,666 @@ +"""GitHub API polling with in-memory cache. + +Fetches PR lifecycle, deployment runs, branch lag, and merge queue stats via `gh` CLI. +Most data cached in memory with TTL. Merge queue stats persisted to SQLite daily. 
+""" +import json +import subprocess +import threading +import time +from datetime import datetime, timedelta, timezone + +REPO = 'AztecProtocol/aztec-packages' + +BRANCH_PAIRS = [ + ('next', 'staging-public'), + ('next', 'testnet'), + ('staging-public', 'testnet'), +] + +DEPLOY_WORKFLOWS = [ + 'deploy-staging-networks.yml', + 'deploy-network.yml', + 'deploy-next-net.yml', +] + +_CACHE_TTL = 3600 # 1 hour +_pr_cache = {'data': [], 'ts': 0} +_deploy_cache = {'data': [], 'ts': 0} +_lag_cache = {'data': [], 'ts': 0} +_pr_author_cache = {} # {pr_number: {'author': str, 'title': str, 'branch': str}} +_pr_lock = threading.Lock() +_deploy_lock = threading.Lock() +_lag_lock = threading.Lock() + + +def _gh(args: list[str]) -> str | None: + try: + result = subprocess.run( + ['gh'] + args, + capture_output=True, text=True, timeout=30 + ) + if result.returncode == 0: + return result.stdout.strip() + except (FileNotFoundError, subprocess.TimeoutExpired) as e: + print(f"[rk_github] gh error: {e}") + return None + + +# ---- PR lifecycle ---- + +def _fetch_and_process_prs() -> list[dict]: + out = _gh([ + 'pr', 'list', '--repo', REPO, '--state', 'merged', + '--limit', '500', + '--json', 'number,author,title,createdAt,mergedAt,closedAt,baseRefName,' + 'headRefName,additions,deletions,changedFiles,isDraft,reviewDecision,labels' + ]) + if not out: + return [] + try: + prs = json.loads(out) + except json.JSONDecodeError: + return [] + + for pr in prs: + author = pr.get('author', {}) + if isinstance(author, dict): + pr['author'] = author.get('login', 'unknown') + # Extract label names from label objects + labels = pr.get('labels', []) + if labels and isinstance(labels[0], dict): + pr['labels'] = [l.get('name', '') for l in labels] + created = pr.get('createdAt', '') + merged = pr.get('mergedAt') + if created and merged: + try: + c = datetime.fromisoformat(created.replace('Z', '+00:00')) + m = datetime.fromisoformat(merged.replace('Z', '+00:00')) + pr['merge_time_hrs'] = round((m - 
c).total_seconds() / 3600, 2) + except (ValueError, TypeError): + pr['merge_time_hrs'] = None + else: + pr['merge_time_hrs'] = None + pr['merged_date'] = merged[:10] if merged else None + pr['size'] = (pr.get('additions', 0) or 0) + (pr.get('deletions', 0) or 0) + return prs + + +def _ensure_prs(): + now = time.time() + if _pr_cache['data'] and now - _pr_cache['ts'] < _CACHE_TTL: + return + if not _pr_lock.acquire(blocking=False): + return + try: + prs = _fetch_and_process_prs() + if prs: + _pr_cache['data'] = prs + _pr_cache['ts'] = now + finally: + _pr_lock.release() + + +# ---- Deployments ---- + +def _fetch_all_deploys() -> list[dict]: + all_runs = [] + for workflow in DEPLOY_WORKFLOWS: + out = _gh([ + 'run', 'list', '--repo', REPO, + '--workflow', workflow, '--limit', '50', + '--json', 'databaseId,status,conclusion,createdAt,updatedAt,headBranch,name' + ]) + if not out: + continue + try: + runs = json.loads(out) + except json.JSONDecodeError: + continue + for run in runs: + started = run.get('createdAt', '') + completed = run.get('updatedAt') + duration = None + if started and completed: + try: + s = datetime.fromisoformat(started.replace('Z', '+00:00')) + c = datetime.fromisoformat(completed.replace('Z', '+00:00')) + duration = round((c - s).total_seconds(), 1) + except (ValueError, TypeError): + pass + all_runs.append({ + 'run_id': str(run.get('databaseId', '')), + 'workflow_name': workflow.replace('.yml', ''), + 'ref_name': run.get('headBranch', ''), + 'status': run.get('conclusion', run.get('status', 'unknown')), + 'started_at': started, + 'completed_at': completed, + 'duration_secs': duration, + 'started_date': started[:10] if started else None, + }) + return all_runs + + +def _ensure_deploys(): + now = time.time() + if _deploy_cache['data'] and now - _deploy_cache['ts'] < _CACHE_TTL: + return + if not _deploy_lock.acquire(blocking=False): + return + try: + deploys = _fetch_all_deploys() + if deploys: + _deploy_cache['data'] = deploys + 
_deploy_cache['ts'] = now + finally: + _deploy_lock.release() + + +# ---- Branch lag ---- + +def _fetch_branch_lag() -> list[dict]: + results = [] + today = datetime.now(timezone.utc).date().isoformat() + for source, target in BRANCH_PAIRS: + out = _gh([ + 'api', f'repos/{REPO}/compare/{target}...{source}', + '--jq', '.ahead_by' + ]) + if not out: + continue + try: + commits_behind = int(out) + except (ValueError, TypeError): + continue + + days_behind = None + out2 = _gh([ + 'api', f'repos/{REPO}/compare/{target}...{source}', + '--jq', '.commits[0].commit.committer.date' + ]) + if out2: + try: + oldest = datetime.fromisoformat(out2.replace('Z', '+00:00')) + days_behind = round((datetime.now(timezone.utc) - oldest).total_seconds() / 86400, 1) + except (ValueError, TypeError): + pass + + results.append({ + 'date': today, + 'source': source, + 'target': target, + 'commits_behind': commits_behind, + 'days_behind': days_behind, + }) + return results + + +def _ensure_lag(): + now = time.time() + if _lag_cache['data'] and now - _lag_cache['ts'] < _CACHE_TTL: + return + if not _lag_lock.acquire(blocking=False): + return + try: + lag = _fetch_branch_lag() + if lag: + _lag_cache['data'] = lag + _lag_cache['ts'] = now + finally: + _lag_lock.release() + + +# ---- Query functions for API endpoints ---- + +def get_deployment_speed(date_from: str, date_to: str, workflow: str = '') -> dict: + if not _deploy_cache['data']: + _ensure_deploys() + else: + threading.Thread(target=_ensure_deploys, daemon=True).start() + deploys = [d for d in _deploy_cache['data'] + if d.get('started_date') and date_from <= d['started_date'] <= date_to] + if workflow: + deploys = [d for d in deploys if d['workflow_name'] == workflow] + + # Group by date + by_date_map = {} + for d in deploys: + date = d['started_date'] + if date not in by_date_map: + by_date_map[date] = {'durations': [], 'success': 0, 'failure': 0, 'count': 0} + by_date_map[date]['count'] += 1 + if d['duration_secs'] is not None: + 
by_date_map[date]['durations'].append(d['duration_secs'] / 60.0) + if d['status'] == 'success': + by_date_map[date]['success'] += 1 + elif d['status'] == 'failure': + by_date_map[date]['failure'] += 1 + + by_date = [] + for date in sorted(by_date_map): + b = by_date_map[date] + durs = sorted(b['durations']) + by_date.append({ + 'date': date, + 'median_mins': round(durs[len(durs)//2], 1) if durs else None, + 'p95_mins': round(durs[int(len(durs)*0.95)], 1) if durs else None, + 'count': b['count'], + 'success': b['success'], + 'failure': b['failure'], + }) + + all_durs = sorted([d['duration_secs']/60.0 for d in deploys if d['duration_secs'] is not None]) + total = len(deploys) + success = sum(1 for d in deploys if d['status'] == 'success') + + recent = [{'run_id': d['run_id'], 'workflow_name': d['workflow_name'], + 'status': d['status'], 'duration_mins': round(d['duration_secs']/60.0, 1) if d['duration_secs'] else None, + 'started_at': d['started_at'], 'ref_name': d['ref_name']} + for d in sorted(deploys, key=lambda x: x['started_at'], reverse=True)[:50]] + + return { + 'by_date': by_date, + 'summary': { + 'median_mins': round(all_durs[len(all_durs)//2], 1) if all_durs else None, + 'p95_mins': round(all_durs[int(len(all_durs)*0.95)], 1) if all_durs else None, + 'success_rate': round(100.0 * success / max(total, 1), 1), + 'total': total, + }, + 'recent': recent, + } + + +def get_branch_lag(date_from: str, date_to: str) -> dict: + if not _lag_cache['data']: + _ensure_lag() + else: + threading.Thread(target=_ensure_lag, daemon=True).start() + pairs = [] + for source, target in BRANCH_PAIRS: + matching = [l for l in _lag_cache['data'] + if l['source'] == source and l['target'] == target] + current = matching[-1] if matching else {'commits_behind': 0, 'days_behind': 0} + pairs.append({ + 'source': source, + 'target': target, + 'current': {'commits_behind': current.get('commits_behind', 0), + 'days_behind': current.get('days_behind', 0)}, + 'history': [{'date': l['date'], 
'commits_behind': l['commits_behind'], + 'days_behind': l['days_behind']} for l in matching], + }) + return {'pairs': pairs} + + +def get_pr_author(pr_number) -> dict | None: + """Look up PR author/title by number. Results are cached permanently (PR data doesn't change).""" + pr_number = int(pr_number) if pr_number else None + if not pr_number: + return None + if pr_number in _pr_author_cache: + return _pr_author_cache[pr_number] + + # Check merged PR cache first (already fetched) + for pr in _pr_cache.get('data', []): + if pr.get('number') == pr_number: + info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), + 'branch': pr.get('headRefName', ''), + 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} + _pr_author_cache[pr_number] = info + return info + + # Fetch from GitHub API + out = _gh(['pr', 'view', str(pr_number), '--repo', REPO, + '--json', 'author,title,headRefName,additions,deletions']) + if out: + try: + data = json.loads(out) + author = data.get('author', {}) + if isinstance(author, dict): + author = author.get('login', 'unknown') + info = {'author': author, 'title': data.get('title', ''), + 'branch': data.get('headRefName', ''), + 'additions': data.get('additions', 0), 'deletions': data.get('deletions', 0)} + _pr_author_cache[pr_number] = info + return info + except (json.JSONDecodeError, KeyError): + pass + return None + + +def batch_get_pr_authors(pr_numbers: set) -> dict: + """Fetch authors for multiple PR numbers, using cache. 
Returns {pr_number: info}.""" + result = {} + to_fetch = [] + for prn in pr_numbers: + if not prn: + continue + prn = int(prn) + if prn in _pr_author_cache: + result[prn] = _pr_author_cache[prn] + else: + to_fetch.append(prn) + + # Check merged PR cache first + for pr in _pr_cache.get('data', []): + num = pr.get('number') + if num in to_fetch: + info = {'author': pr.get('author', 'unknown'), 'title': pr.get('title', ''), + 'branch': pr.get('headRefName', ''), + 'additions': pr.get('additions', 0), 'deletions': pr.get('deletions', 0)} + _pr_author_cache[num] = info + result[num] = info + to_fetch.remove(num) + + # Fetch remaining individually (with a cap to avoid API abuse) + for prn in to_fetch[:50]: + info = get_pr_author(prn) + if info: + result[prn] = info + + return result + + +def get_branch_pr_map() -> dict: + """Return {branch_name: pr_number} from the PR cache. Call _ensure_prs first.""" + if not _pr_cache['data']: + _ensure_prs() + else: + threading.Thread(target=_ensure_prs, daemon=True).start() + return {pr['headRefName']: pr['number'] + for pr in _pr_cache.get('data', []) + if pr.get('headRefName')} + + +def get_pr_metrics(date_from: str, date_to: str, author: str = '', + ci_runs: list = None) -> dict: + """Get PR metrics. 
ci_runs should be passed from the caller (read from Redis).""" + if not _pr_cache['data']: + _ensure_prs() + else: + threading.Thread(target=_ensure_prs, daemon=True).start() + + prs = [p for p in _pr_cache['data'] + if p.get('merged_date') and date_from <= p['merged_date'] <= date_to] + if author: + prs = [p for p in prs if p.get('author') == author] + + # Compute per-PR CI cost and duration from ci_runs + pr_costs = {} + pr_run_counts = {} + pr_ci_time = {} # total CI compute hours per PR + if ci_runs: + for run in ci_runs: + prn = run.get('pr_number') + if not prn: + continue + if run.get('cost_usd') is not None: + pr_costs[prn] = pr_costs.get(prn, 0) + run['cost_usd'] + pr_run_counts[prn] = pr_run_counts.get(prn, 0) + 1 + c = run.get('complete') + t = run.get('timestamp') + if c and t: + pr_ci_time[prn] = pr_ci_time.get(prn, 0) + (c - t) / 3_600_000 + + for pr in prs: + prn = pr.get('number') + pr['ci_cost_usd'] = round(pr_costs.get(prn, 0), 2) + pr['ci_runs_count'] = pr_run_counts.get(prn, 0) + pr['ci_time_hrs'] = round(pr_ci_time.get(prn, 0), 2) + + # Group by date + by_date_map = {} + for pr in prs: + date = pr['merged_date'] + if date not in by_date_map: + by_date_map[date] = {'costs': [], 'merge_times': [], 'ci_times': [], + 'run_counts': [], 'count': 0} + by_date_map[date]['count'] += 1 + by_date_map[date]['costs'].append(pr['ci_cost_usd']) + by_date_map[date]['ci_times'].append(pr.get('ci_time_hrs', 0)) + by_date_map[date]['run_counts'].append(pr.get('ci_runs_count', 0)) + if pr.get('merge_time_hrs') is not None: + by_date_map[date]['merge_times'].append(pr['merge_time_hrs']) + + def _median(vals): + s = sorted(vals) + n = len(s) + if n == 0: + return None + if n % 2 == 1: + return s[n // 2] + return (s[n // 2 - 1] + s[n // 2]) / 2 + + by_date = [] + for d, v in sorted(by_date_map.items()): + by_date.append({ + 'date': d, + 'pr_count': v['count'], + 'avg_cost': round(sum(v['costs']) / max(len(v['costs']), 1), 2), + 'median_merge_time_hrs': 
round(_median(v['merge_times']), 1) if v['merge_times'] else None, + 'avg_ci_time_hrs': round(sum(v['ci_times']) / max(len(v['ci_times']), 1), 2), + 'avg_runs': round(sum(v['run_counts']) / max(len(v['run_counts']), 1), 1), + }) + + # By author (all PRs in range, not filtered by author) + all_prs_in_range = [p for p in _pr_cache['data'] + if p.get('merged_date') and date_from <= p['merged_date'] <= date_to] + + author_map = {} + for pr in all_prs_in_range: + prn = pr.get('number') + a = pr.get('author', 'unknown') + if a not in author_map: + author_map[a] = {'total_cost': 0, 'pr_count': 0, 'merge_times': [], + 'total_ci_time': 0, 'total_runs': 0} + author_map[a]['total_cost'] += round(pr_costs.get(prn, 0), 2) + author_map[a]['pr_count'] += 1 + author_map[a]['total_ci_time'] += round(pr_ci_time.get(prn, 0), 2) + author_map[a]['total_runs'] += pr_run_counts.get(prn, 0) + if pr.get('merge_time_hrs') is not None: + author_map[a]['merge_times'].append(pr['merge_time_hrs']) + + by_author = [] + for a, v in sorted(author_map.items(), key=lambda x: -x[1]['total_cost'])[:20]: + by_author.append({ + 'author': a, + 'total_cost': round(v['total_cost'], 2), + 'pr_count': v['pr_count'], + 'avg_merge_time_hrs': round(_median(v['merge_times']), 1) if v['merge_times'] else None, + 'avg_ci_time_hrs': round(v['total_ci_time'] / max(v['pr_count'], 1), 2), + 'avg_runs_per_pr': round(v['total_runs'] / max(v['pr_count'], 1), 1), + }) + + all_costs = [p.get('ci_cost_usd', 0) for p in prs] + all_merge = [p['merge_time_hrs'] for p in prs if p.get('merge_time_hrs') is not None] + all_run_counts = [p.get('ci_runs_count', 0) for p in prs] + all_ci_times = [p.get('ci_time_hrs', 0) for p in prs] + + return { + 'by_date': by_date, + 'by_author': by_author, + 'summary': { + 'avg_cost_per_pr': round(sum(all_costs)/max(len(all_costs),1), 2) if all_costs else 0, + 'median_merge_time_hrs': round(_median(all_merge), 1) if all_merge else None, + 'total_prs': len(prs), + 'total_cost': 
round(sum(all_costs), 2), + 'avg_ci_runs_per_pr': round(sum(all_run_counts)/max(len(all_run_counts),1), 1) if all_run_counts else 0, + 'avg_ci_time_hrs': round(sum(all_ci_times)/max(len(all_ci_times),1), 2) if all_ci_times else 0, + }, + } + + +# ---- Merge queue failure rate ---- + +CI3_WORKFLOW = 'ci3.yml' + +def _fetch_merge_queue_runs(date_str: str) -> dict: + """Fetch merge_group workflow runs for a single date. Returns daily summary.""" + out = _gh([ + 'api', '--paginate', + f'repos/{REPO}/actions/workflows/{CI3_WORKFLOW}/runs' + f'?event=merge_group&created={date_str}&per_page=100', + '--jq', '.workflow_runs[] | [.conclusion, .status] | @tsv', + ]) + summary = {'date': date_str, 'total': 0, 'success': 0, 'failure': 0, + 'cancelled': 0, 'in_progress': 0} + if not out: + return summary + for line in out.strip().split('\n'): + if not line.strip(): + continue + parts = line.split('\t') + conclusion = parts[0] if parts[0] else '' + status = parts[1] if len(parts) > 1 else '' + summary['total'] += 1 + if conclusion == 'success': + summary['success'] += 1 + elif conclusion == 'failure': + summary['failure'] += 1 + elif conclusion == 'cancelled': + summary['cancelled'] += 1 + elif status in ('in_progress', 'queued', 'waiting'): + summary['in_progress'] += 1 + else: + summary['failure'] += 1 # treat unknown conclusions as failures + return summary + + +def _load_backfill_json(): + """Load seed data from merge-queue-backfill.json if SQLite is empty.""" + import db + from pathlib import Path + conn = db.get_db() + + count = conn.execute('SELECT COUNT(*) as c FROM merge_queue_daily').fetchone()['c'] + if count > 0: + return + + seed = Path(__file__).parent / 'merge-queue-backfill.json' + if not seed.exists(): + return + + import json + with seed.open() as f: + data = json.load(f) + + print(f"[rk_github] Loading {len(data)} days from merge-queue-backfill.json...") + for ds, summary in data.items(): + conn.execute( + 'INSERT OR REPLACE INTO merge_queue_daily (date, total, 
success, failure, cancelled, in_progress) ' + 'VALUES (?, ?, ?, ?, ?, ?)', + (ds, summary['total'], summary['success'], summary['failure'], + summary['cancelled'], summary['in_progress'])) + conn.commit() + + +def _backfill_merge_queue(): + """Backfill missing merge queue daily stats into SQLite.""" + import db + conn = db.get_db() + + # Load seed data on first run + _load_backfill_json() + + # Find which dates we already have + existing = {row['date'] for row in + conn.execute('SELECT date FROM merge_queue_daily').fetchall()} + + yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date() + # Backfill up to 365 days + start = yesterday - timedelta(days=365) + current = start + + missing = [] + while current <= yesterday: + ds = current.isoformat() + if ds not in existing: + missing.append(ds) + current += timedelta(days=1) + + if not missing: + return + + print(f"[rk_github] Backfilling {len(missing)} days of merge queue stats...") + for ds in missing: + summary = _fetch_merge_queue_runs(ds) + if summary['total'] == 0: + conn.execute( + 'INSERT OR REPLACE INTO merge_queue_daily (date, total, success, failure, cancelled, in_progress) ' + 'VALUES (?, 0, 0, 0, 0, 0)', (ds,)) + else: + conn.execute( + 'INSERT OR REPLACE INTO merge_queue_daily (date, total, success, failure, cancelled, in_progress) ' + 'VALUES (?, ?, ?, ?, ?, ?)', + (ds, summary['total'], summary['success'], summary['failure'], + summary['cancelled'], summary['in_progress'])) + conn.commit() + + +def refresh_merge_queue_today(): + """Refresh today's (and yesterday's) merge queue stats. 
Called periodically.""" + import db + conn = db.get_db() + today = datetime.now(timezone.utc).date().isoformat() + yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date().isoformat() + + for ds in [yesterday, today]: + summary = _fetch_merge_queue_runs(ds) + conn.execute( + 'INSERT OR REPLACE INTO merge_queue_daily (date, total, success, failure, cancelled, in_progress) ' + 'VALUES (?, ?, ?, ?, ?, ?)', + (ds, summary['total'], summary['success'], summary['failure'], + summary['cancelled'], summary['in_progress'])) + conn.commit() + + +_mq_backfill_lock = threading.Lock() +_mq_last_refresh = 0 +_MQ_REFRESH_TTL = 3600 # refresh today's data every hour + + +def ensure_merge_queue_data(): + """Ensure merge queue data is backfilled and today is fresh.""" + global _mq_last_refresh + now = time.time() + if now - _mq_last_refresh < _MQ_REFRESH_TTL: + return + if not _mq_backfill_lock.acquire(blocking=False): + return + try: + _backfill_merge_queue() + refresh_merge_queue_today() + _mq_last_refresh = now + finally: + _mq_backfill_lock.release() + + +def get_merge_queue_stats(date_from: str, date_to: str) -> dict: + """Get merge queue failure rate by day. Triggers backfill if needed.""" + # Ensure data is populated (async after first load) + import db + conn = db.get_db() + count = conn.execute('SELECT COUNT(*) as c FROM merge_queue_daily').fetchone()['c'] + if count == 0: + ensure_merge_queue_data() # block on first load + else: + threading.Thread(target=ensure_merge_queue_data, daemon=True).start() + + rows = db.query( + 'SELECT date, total, success, failure, cancelled, in_progress ' + 'FROM merge_queue_daily WHERE date >= ? AND date <= ? 
ORDER BY date', + (date_from, date_to)) + + total_runs = sum(r['total'] for r in rows) + total_fail = sum(r['failure'] for r in rows) + total_success = sum(r['success'] for r in rows) + + return { + 'by_date': rows, + 'summary': { + 'total_runs': total_runs, + 'total_success': total_success, + 'total_failure': total_fail, + 'failure_rate': round(total_fail / max(total_runs, 1) * 100, 1), + 'days': len([r for r in rows if r['total'] > 0]), + }, + } diff --git a/ci3/ci-metrics/merge-queue-backfill.json b/ci3/ci-metrics/merge-queue-backfill.json new file mode 100644 index 000000000000..079077590581 --- /dev/null +++ b/ci3/ci-metrics/merge-queue-backfill.json @@ -0,0 +1,2564 @@ +{ + "2025-02-10": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-11": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-12": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-13": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-14": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-15": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-16": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-17": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-18": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-19": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-20": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-21": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-22": { + "total": 0, + 
"success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-23": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-24": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-25": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-26": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-27": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-02-28": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-01": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-02": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-03": { + "total": 1, + "success": 0, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-04": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-05": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-06": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-07": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-08": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-09": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-10": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-11": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-12": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-13": { + 
"total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-14": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-15": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-16": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-17": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-18": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-19": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-20": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-21": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-22": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-23": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-24": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-25": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-26": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-27": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-28": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-29": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-30": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-03-31": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + 
"2025-04-01": { + "total": 3, + "success": 2, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-02": { + "total": 31, + "success": 19, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-03": { + "total": 113, + "success": 58, + "failure": 55, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-04": { + "total": 69, + "success": 50, + "failure": 19, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-05": { + "total": 4, + "success": 4, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-06": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-07": { + "total": 42, + "success": 32, + "failure": 10, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-08": { + "total": 27, + "success": 19, + "failure": 8, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-09": { + "total": 29, + "success": 26, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-10": { + "total": 42, + "success": 35, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-11": { + "total": 51, + "success": 36, + "failure": 15, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-12": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-13": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-14": { + "total": 24, + "success": 19, + "failure": 4, + "cancelled": 1, + "in_progress": 0 + }, + "2025-04-15": { + "total": 41, + "success": 22, + "failure": 19, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-16": { + "total": 26, + "success": 21, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-17": { + "total": 29, + "success": 28, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-18": { + "total": 10, + "success": 10, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-19": { + "total": 4, + "success": 4, + "failure": 
0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-20": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-21": { + "total": 5, + "success": 5, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-22": { + "total": 49, + "success": 33, + "failure": 15, + "cancelled": 1, + "in_progress": 0 + }, + "2025-04-23": { + "total": 32, + "success": 28, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-24": { + "total": 29, + "success": 26, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-25": { + "total": 28, + "success": 26, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-26": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-27": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-28": { + "total": 26, + "success": 20, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-29": { + "total": 60, + "success": 26, + "failure": 34, + "cancelled": 0, + "in_progress": 0 + }, + "2025-04-30": { + "total": 47, + "success": 33, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-01": { + "total": 31, + "success": 27, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-02": { + "total": 8, + "success": 8, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-03": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-04": { + "total": 7, + "success": 7, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-05": { + "total": 14, + "success": 11, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-06": { + "total": 18, + "success": 16, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-07": { + "total": 22, + "success": 20, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-08": { + 
"total": 18, + "success": 15, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-09": { + "total": 36, + "success": 27, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-10": { + "total": 2, + "success": 1, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-11": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-12": { + "total": 47, + "success": 30, + "failure": 17, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-13": { + "total": 134, + "success": 65, + "failure": 69, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-14": { + "total": 51, + "success": 34, + "failure": 17, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-15": { + "total": 22, + "success": 9, + "failure": 12, + "cancelled": 1, + "in_progress": 0 + }, + "2025-05-16": { + "total": 21, + "success": 15, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-17": { + "total": 2, + "success": 1, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-18": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-19": { + "total": 10, + "success": 9, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-20": { + "total": 30, + "success": 15, + "failure": 15, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-21": { + "total": 26, + "success": 12, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-22": { + "total": 51, + "success": 21, + "failure": 30, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-23": { + "total": 67, + "success": 13, + "failure": 53, + "cancelled": 1, + "in_progress": 0 + }, + "2025-05-24": { + "total": 5, + "success": 2, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-25": { + "total": 5, + "success": 0, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-26": { + "total": 10, + "success": 7, + "failure": 3, + "cancelled": 
0, + "in_progress": 0 + }, + "2025-05-27": { + "total": 61, + "success": 12, + "failure": 49, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-28": { + "total": 56, + "success": 15, + "failure": 41, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-29": { + "total": 77, + "success": 24, + "failure": 52, + "cancelled": 1, + "in_progress": 0 + }, + "2025-05-30": { + "total": 25, + "success": 15, + "failure": 10, + "cancelled": 0, + "in_progress": 0 + }, + "2025-05-31": { + "total": 6, + "success": 3, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-01": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-02": { + "total": 50, + "success": 20, + "failure": 29, + "cancelled": 1, + "in_progress": 0 + }, + "2025-06-03": { + "total": 57, + "success": 22, + "failure": 35, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-04": { + "total": 219, + "success": 22, + "failure": 196, + "cancelled": 1, + "in_progress": 0 + }, + "2025-06-05": { + "total": 166, + "success": 19, + "failure": 147, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-06": { + "total": 73, + "success": 27, + "failure": 45, + "cancelled": 1, + "in_progress": 0 + }, + "2025-06-07": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-08": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-09": { + "total": 124, + "success": 31, + "failure": 93, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-10": { + "total": 44, + "success": 29, + "failure": 15, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-11": { + "total": 19, + "success": 16, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-12": { + "total": 26, + "success": 14, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-13": { + "total": 29, + "success": 24, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-14": { + 
"total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-15": { + "total": 1, + "success": 0, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-16": { + "total": 44, + "success": 21, + "failure": 23, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-17": { + "total": 29, + "success": 15, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-18": { + "total": 38, + "success": 25, + "failure": 13, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-19": { + "total": 15, + "success": 11, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-20": { + "total": 27, + "success": 21, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-21": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-22": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-23": { + "total": 30, + "success": 14, + "failure": 16, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-24": { + "total": 26, + "success": 17, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-25": { + "total": 26, + "success": 20, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-26": { + "total": 44, + "success": 21, + "failure": 22, + "cancelled": 1, + "in_progress": 0 + }, + "2025-06-27": { + "total": 18, + "success": 13, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-28": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-29": { + "total": 3, + "success": 3, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-06-30": { + "total": 27, + "success": 17, + "failure": 10, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-01": { + "total": 26, + "success": 12, + "failure": 13, + "cancelled": 1, + "in_progress": 0 + }, + "2025-07-02": { + "total": 42, + "success": 25, + "failure": 17, + 
"cancelled": 0, + "in_progress": 0 + }, + "2025-07-03": { + "total": 17, + "success": 12, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-04": { + "total": 15, + "success": 12, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-05": { + "total": 4, + "success": 3, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-06": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-07": { + "total": 20, + "success": 14, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-08": { + "total": 33, + "success": 19, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-09": { + "total": 19, + "success": 13, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-10": { + "total": 22, + "success": 14, + "failure": 7, + "cancelled": 1, + "in_progress": 0 + }, + "2025-07-11": { + "total": 6, + "success": 6, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-12": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-13": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-14": { + "total": 29, + "success": 21, + "failure": 8, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-15": { + "total": 49, + "success": 22, + "failure": 27, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-16": { + "total": 47, + "success": 21, + "failure": 26, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-17": { + "total": 18, + "success": 10, + "failure": 8, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-18": { + "total": 13, + "success": 12, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-19": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-20": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-21": { + 
"total": 26, + "success": 22, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-22": { + "total": 25, + "success": 19, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-23": { + "total": 33, + "success": 16, + "failure": 15, + "cancelled": 2, + "in_progress": 0 + }, + "2025-07-24": { + "total": 61, + "success": 26, + "failure": 35, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-25": { + "total": 35, + "success": 17, + "failure": 16, + "cancelled": 2, + "in_progress": 0 + }, + "2025-07-26": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-27": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-28": { + "total": 23, + "success": 22, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-29": { + "total": 52, + "success": 21, + "failure": 31, + "cancelled": 0, + "in_progress": 0 + }, + "2025-07-30": { + "total": 30, + "success": 15, + "failure": 14, + "cancelled": 1, + "in_progress": 0 + }, + "2025-07-31": { + "total": 35, + "success": 23, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-01": { + "total": 13, + "success": 13, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-02": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-03": { + "total": 4, + "success": 4, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-04": { + "total": 16, + "success": 15, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-05": { + "total": 14, + "success": 10, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-06": { + "total": 23, + "success": 16, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-07": { + "total": 19, + "success": 7, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-08": { + "total": 24, + "success": 15, + "failure": 9, + 
"cancelled": 0, + "in_progress": 0 + }, + "2025-08-09": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-10": { + "total": 4, + "success": 2, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-11": { + "total": 13, + "success": 12, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-12": { + "total": 9, + "success": 9, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-13": { + "total": 14, + "success": 12, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-14": { + "total": 18, + "success": 16, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-15": { + "total": 38, + "success": 30, + "failure": 8, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-16": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-17": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-18": { + "total": 19, + "success": 12, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-19": { + "total": 11, + "success": 7, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-20": { + "total": 11, + "success": 9, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-21": { + "total": 19, + "success": 15, + "failure": 3, + "cancelled": 1, + "in_progress": 0 + }, + "2025-08-22": { + "total": 32, + "success": 24, + "failure": 8, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-23": { + "total": 6, + "success": 5, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-24": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-25": { + "total": 13, + "success": 11, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-26": { + "total": 17, + "success": 10, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-27": { + "total": 
20, + "success": 11, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-28": { + "total": 36, + "success": 18, + "failure": 17, + "cancelled": 1, + "in_progress": 0 + }, + "2025-08-29": { + "total": 39, + "success": 28, + "failure": 11, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-30": { + "total": 4, + "success": 2, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-08-31": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-01": { + "total": 20, + "success": 15, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-02": { + "total": 25, + "success": 16, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-03": { + "total": 30, + "success": 19, + "failure": 11, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-04": { + "total": 29, + "success": 15, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-05": { + "total": 32, + "success": 14, + "failure": 18, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-06": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-07": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-08": { + "total": 18, + "success": 12, + "failure": 5, + "cancelled": 1, + "in_progress": 0 + }, + "2025-09-09": { + "total": 25, + "success": 14, + "failure": 11, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-10": { + "total": 38, + "success": 23, + "failure": 15, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-11": { + "total": 39, + "success": 18, + "failure": 21, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-12": { + "total": 34, + "success": 21, + "failure": 13, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-13": { + "total": 1, + "success": 0, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-14": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + 
"in_progress": 0 + }, + "2025-09-15": { + "total": 22, + "success": 11, + "failure": 11, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-16": { + "total": 25, + "success": 15, + "failure": 10, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-17": { + "total": 24, + "success": 17, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-18": { + "total": 24, + "success": 17, + "failure": 6, + "cancelled": 1, + "in_progress": 0 + }, + "2025-09-19": { + "total": 16, + "success": 9, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-20": { + "total": 8, + "success": 3, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-21": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-22": { + "total": 45, + "success": 19, + "failure": 26, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-23": { + "total": 23, + "success": 17, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-24": { + "total": 17, + "success": 13, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-25": { + "total": 47, + "success": 26, + "failure": 21, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-26": { + "total": 22, + "success": 21, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-27": { + "total": 4, + "success": 3, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-28": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-29": { + "total": 20, + "success": 12, + "failure": 8, + "cancelled": 0, + "in_progress": 0 + }, + "2025-09-30": { + "total": 46, + "success": 21, + "failure": 25, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-01": { + "total": 23, + "success": 16, + "failure": 6, + "cancelled": 1, + "in_progress": 0 + }, + "2025-10-02": { + "total": 30, + "success": 17, + "failure": 13, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-03": { + "total": 10, + 
"success": 9, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-04": { + "total": 4, + "success": 4, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-05": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-06": { + "total": 25, + "success": 9, + "failure": 15, + "cancelled": 1, + "in_progress": 0 + }, + "2025-10-07": { + "total": 42, + "success": 12, + "failure": 29, + "cancelled": 1, + "in_progress": 0 + }, + "2025-10-08": { + "total": 21, + "success": 11, + "failure": 10, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-09": { + "total": 61, + "success": 2, + "failure": 59, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-10": { + "total": 47, + "success": 13, + "failure": 34, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-11": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-12": { + "total": 1, + "success": 0, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-13": { + "total": 32, + "success": 18, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-14": { + "total": 31, + "success": 16, + "failure": 15, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-15": { + "total": 33, + "success": 22, + "failure": 11, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-16": { + "total": 19, + "success": 12, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-17": { + "total": 20, + "success": 12, + "failure": 7, + "cancelled": 1, + "in_progress": 0 + }, + "2025-10-18": { + "total": 1, + "success": 0, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-19": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-20": { + "total": 37, + "success": 14, + "failure": 23, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-21": { + "total": 21, + "success": 12, + "failure": 9, + "cancelled": 0, + 
"in_progress": 0 + }, + "2025-10-22": { + "total": 24, + "success": 11, + "failure": 13, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-23": { + "total": 61, + "success": 17, + "failure": 44, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-24": { + "total": 30, + "success": 18, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-25": { + "total": 3, + "success": 3, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-26": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-27": { + "total": 9, + "success": 9, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-28": { + "total": 18, + "success": 16, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-29": { + "total": 19, + "success": 14, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-30": { + "total": 17, + "success": 16, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-10-31": { + "total": 15, + "success": 14, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-01": { + "total": 4, + "success": 1, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-02": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-03": { + "total": 14, + "success": 13, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-04": { + "total": 19, + "success": 16, + "failure": 1, + "cancelled": 2, + "in_progress": 0 + }, + "2025-11-05": { + "total": 13, + "success": 10, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-06": { + "total": 24, + "success": 11, + "failure": 13, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-07": { + "total": 19, + "success": 14, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-08": { + "total": 3, + "success": 2, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-09": { + "total": 2, + 
"success": 1, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-10": { + "total": 47, + "success": 13, + "failure": 33, + "cancelled": 1, + "in_progress": 0 + }, + "2025-11-11": { + "total": 15, + "success": 11, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-12": { + "total": 42, + "success": 22, + "failure": 20, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-13": { + "total": 17, + "success": 12, + "failure": 4, + "cancelled": 1, + "in_progress": 0 + }, + "2025-11-14": { + "total": 22, + "success": 15, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-15": { + "total": 3, + "success": 3, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-16": { + "total": 3, + "success": 3, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-17": { + "total": 9, + "success": 7, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-18": { + "total": 19, + "success": 12, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-19": { + "total": 18, + "success": 13, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-20": { + "total": 9, + "success": 8, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-21": { + "total": 16, + "success": 12, + "failure": 3, + "cancelled": 1, + "in_progress": 0 + }, + "2025-11-22": { + "total": 5, + "success": 2, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-23": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-24": { + "total": 8, + "success": 7, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-25": { + "total": 11, + "success": 10, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-26": { + "total": 17, + "success": 16, + "failure": 0, + "cancelled": 1, + "in_progress": 0 + }, + "2025-11-27": { + "total": 17, + "success": 15, + "failure": 2, + "cancelled": 0, + "in_progress": 0 
+ }, + "2025-11-28": { + "total": 11, + "success": 6, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-29": { + "total": 2, + "success": 2, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-11-30": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-01": { + "total": 13, + "success": 12, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-02": { + "total": 8, + "success": 8, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-03": { + "total": 17, + "success": 10, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-04": { + "total": 11, + "success": 8, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-05": { + "total": 12, + "success": 11, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-06": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-07": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-08": { + "total": 17, + "success": 14, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-09": { + "total": 23, + "success": 14, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-10": { + "total": 43, + "success": 21, + "failure": 20, + "cancelled": 2, + "in_progress": 0 + }, + "2025-12-11": { + "total": 28, + "success": 19, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-12": { + "total": 14, + "success": 12, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-13": { + "total": 2, + "success": 0, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-14": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-15": { + "total": 41, + "success": 15, + "failure": 26, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-16": { + "total": 25, + "success": 21, + "failure": 
4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-17": { + "total": 10, + "success": 8, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-18": { + "total": 20, + "success": 14, + "failure": 5, + "cancelled": 1, + "in_progress": 0 + }, + "2025-12-19": { + "total": 13, + "success": 11, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-20": { + "total": 7, + "success": 3, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-21": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-22": { + "total": 20, + "success": 16, + "failure": 3, + "cancelled": 1, + "in_progress": 0 + }, + "2025-12-23": { + "total": 28, + "success": 19, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-24": { + "total": 13, + "success": 8, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-25": { + "total": 3, + "success": 1, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-26": { + "total": 6, + "success": 3, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-27": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-28": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-29": { + "total": 4, + "success": 2, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-30": { + "total": 3, + "success": 1, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2025-12-31": { + "total": 2, + "success": 1, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-01": { + "total": 2, + "success": 1, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-02": { + "total": 12, + "success": 8, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-03": { + "total": 3, + "success": 1, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-04": { + "total": 3, + 
"success": 3, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-05": { + "total": 34, + "success": 27, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-06": { + "total": 45, + "success": 25, + "failure": 20, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-07": { + "total": 17, + "success": 13, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-08": { + "total": 36, + "success": 24, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-09": { + "total": 25, + "success": 17, + "failure": 7, + "cancelled": 1, + "in_progress": 0 + }, + "2026-01-10": { + "total": 5, + "success": 2, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-11": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-12": { + "total": 32, + "success": 17, + "failure": 15, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-13": { + "total": 44, + "success": 22, + "failure": 22, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-14": { + "total": 114, + "success": 32, + "failure": 82, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-15": { + "total": 54, + "success": 22, + "failure": 31, + "cancelled": 1, + "in_progress": 0 + }, + "2026-01-16": { + "total": 70, + "success": 27, + "failure": 40, + "cancelled": 3, + "in_progress": 0 + }, + "2026-01-17": { + "total": 6, + "success": 4, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-18": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-19": { + "total": 28, + "success": 25, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-20": { + "total": 42, + "success": 30, + "failure": 12, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-21": { + "total": 51, + "success": 31, + "failure": 20, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-22": { + "total": 32, + "success": 25, + "failure": 5, + "cancelled": 2, + 
"in_progress": 0 + }, + "2026-01-23": { + "total": 28, + "success": 25, + "failure": 3, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-24": { + "total": 6, + "success": 4, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-25": { + "total": 3, + "success": 2, + "failure": 1, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-26": { + "total": 89, + "success": 33, + "failure": 56, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-27": { + "total": 24, + "success": 21, + "failure": 2, + "cancelled": 1, + "in_progress": 0 + }, + "2026-01-28": { + "total": 48, + "success": 28, + "failure": 20, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-29": { + "total": 24, + "success": 18, + "failure": 6, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-30": { + "total": 31, + "success": 24, + "failure": 7, + "cancelled": 0, + "in_progress": 0 + }, + "2026-01-31": { + "total": 1, + "success": 1, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-01": { + "total": 0, + "success": 0, + "failure": 0, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-02": { + "total": 14, + "success": 12, + "failure": 2, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-03": { + "total": 27, + "success": 18, + "failure": 9, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-04": { + "total": 30, + "success": 16, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-05": { + "total": 33, + "success": 19, + "failure": 14, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-06": { + "total": 20, + "success": 15, + "failure": 5, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-07": { + "total": 8, + "success": 4, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-08": { + "total": 5, + "success": 2, + "failure": 2, + "cancelled": 1, + "in_progress": 0 + }, + "2026-02-09": { + "total": 15, + "success": 11, + "failure": 4, + "cancelled": 0, + "in_progress": 0 + }, + "2026-02-10": { + "total": 24, + 
"""CI metrics: direct Redis reads + test event listener.

Reads CI run data directly from Redis sorted sets on each request.
Test events stored in SQLite since they only arrive via pub/sub.
CI runs periodically synced from Redis to SQLite for flake correlation.
"""
import json
import re
import time
import threading
from datetime import datetime, timedelta, timezone

import db
import github_data
import ec2_pricing

# Dashboard sections. Each has a `ci-run-{section}` sorted set and a
# `failed_tests_{section}` list in Redis.
SECTIONS = ['next', 'prs', 'master', 'staging', 'releases', 'nightly', 'network', 'deflake', 'local']

_PR_RE = re.compile(r'(?:pr-|#)(\d+)', re.IGNORECASE)
# Matches ANSI SGR colour codes and OSC-8 hyperlink escape sequences.
_ANSI_RE = re.compile(r'\x1b\[[^m]*m|\x1b\]8;;[^\x07]*\x07')
_URL_PR_RE = re.compile(r'/pull/(\d+)')


def compute_run_cost(data: dict) -> float | None:
    """Estimate the USD cost of one CI run from its Redis record.

    Returns None when the run is incomplete (missing 'complete' or
    'timestamp'). Timestamps are epoch milliseconds; falls back to a
    per-vCPU rate when the instance type has no known hourly rate.
    """
    complete = data.get('complete')
    ts = data.get('timestamp')
    if not complete or not ts:
        return None
    # ms -> hours
    hours = (complete - ts) / 3_600_000
    instance_type = data.get('instance_type', 'unknown')
    is_spot = bool(data.get('spot'))
    rate = ec2_pricing.get_instance_rate(instance_type, is_spot)
    if not rate:
        # 192 vCPUs is the assumed default build-instance size — TODO confirm.
        vcpus = data.get('instance_vcpus', 192)
        rate = vcpus * ec2_pricing.get_fallback_vcpu_rate(is_spot)
    return round(hours * rate, 4)


def extract_pr_number(name: str) -> int | None:
    """Extract a PR number from a run/branch name, or None.

    Tries, in order: a 'pr-123'/'#123' token, a GitHub '/pull/123' URL
    (which may be hidden inside ANSI hyperlink escapes), and finally the
    'pr-123' pattern again after stripping ANSI codes.
    """
    m = _PR_RE.search(name)
    if m:
        return int(m.group(1))
    # Try matching GitHub PR URL in ANSI-encoded strings
    m = _URL_PR_RE.search(name)
    if m:
        return int(m.group(1))
    # Strip ANSI codes and retry
    clean = _ANSI_RE.sub('', name)
    m = _PR_RE.search(clean)
    return int(m.group(1)) if m else None


def _get_ci_runs_from_redis(redis_conn, date_from_ms=None, date_to_ms=None):
    """Read CI runs from Redis sorted sets.

    Scores are epoch-ms timestamps; when either bound is given a ranged
    read is used. Each entry is a JSON object; malformed entries are
    skipped (best-effort by design). Enriches each run with 'dashboard',
    'cost_usd' and a resolved 'pr_number'.
    """
    branch_pr_map = github_data.get_branch_pr_map()

    runs = []
    for section in SECTIONS:
        key = f'ci-run-{section}'
        try:
            if date_from_ms is not None or date_to_ms is not None:
                lo = date_from_ms if date_from_ms is not None else '-inf'
                hi = date_to_ms if date_to_ms is not None else '+inf'
                entries = redis_conn.zrangebyscore(key, lo, hi, withscores=True)
            else:
                entries = redis_conn.zrange(key, 0, -1, withscores=True)
            for entry_bytes, score in entries:
                try:
                    raw = entry_bytes.decode() if isinstance(entry_bytes, bytes) else entry_bytes
                    data = json.loads(raw)
                    data.setdefault('dashboard', section)
                    data['cost_usd'] = compute_run_cost(data)
                    # Resolution order: run name, commit msg, stored field,
                    # then the GitHub branch->PR map as a last resort.
                    data['pr_number'] = (
                        extract_pr_number(data.get('name', ''))
                        or extract_pr_number(data.get('msg', ''))
                        or (int(data['pr_number']) if data.get('pr_number') else None)
                        or branch_pr_map.get(data.get('name'))
                    )
                    runs.append(data)
                except Exception:
                    # Deliberate best-effort: one bad entry must not hide the rest.
                    continue
        except Exception as e:
            print(f"[rk_metrics] Error reading {key}: {e}")
    return runs


def _get_ci_runs_from_sqlite(date_from_ms=None, date_to_ms=None):
    """Read CI runs from SQLite (persistent store).

    Returns dicts shaped like the Redis run records so callers can merge
    both sources transparently.
    """
    conditions = []
    params = []
    if date_from_ms is not None:
        conditions.append('timestamp_ms >= ?')
        params.append(date_from_ms)
    if date_to_ms is not None:
        conditions.append('timestamp_ms <= ?')
        params.append(date_to_ms)
    where = ('WHERE ' + ' AND '.join(conditions)) if conditions else ''
    rows = db.query(f'SELECT * FROM ci_runs {where} ORDER BY timestamp_ms', params)
    runs = []
    for row in rows:
        # Normalize to a plain dict: sqlite3.Row supports row['col'] but has
        # no .get(), which the optional columns below rely on.
        row = dict(row)
        runs.append({
            'dashboard': row['dashboard'],
            'name': row['name'],
            'timestamp': row['timestamp_ms'],
            'complete': row['complete_ms'],
            'status': row['status'],
            'author': row['author'],
            'pr_number': row['pr_number'],
            'instance_type': row['instance_type'],
            'instance_vcpus': row.get('instance_vcpus'),
            'spot': bool(row['spot']),
            'cost_usd': row['cost_usd'],
            'job_id': row.get('job_id', ''),
            'arch': row.get('arch', ''),
        })
    return runs


def get_ci_runs(redis_conn, date_from_ms=None, date_to_ms=None):
    """Read CI runs from Redis, backfilled with SQLite for data that Redis has flushed."""
    redis_runs = _get_ci_runs_from_redis(redis_conn, date_from_ms, date_to_ms)

    # Find the earliest timestamp in Redis to know what SQLite needs to fill
    redis_keys = set()
    redis_min_ts = float('inf')
    for run in redis_runs:
        ts = run.get('timestamp', 0)
        redis_keys.add((run.get('dashboard', ''), ts, run.get('name', '')))
        if ts < redis_min_ts:
            redis_min_ts = ts

    # If requesting data older than what Redis has, backfill from SQLite
    sqlite_runs = []
    need_sqlite = (date_from_ms is not None and date_from_ms < redis_min_ts) or not redis_runs
    if need_sqlite:
        sqlite_to = int(redis_min_ts) if redis_runs else date_to_ms
        sqlite_runs = _get_ci_runs_from_sqlite(date_from_ms, sqlite_to)
        # Deduplicate: only include SQLite runs not already in Redis
        sqlite_runs = [r for r in sqlite_runs
                       if (r.get('dashboard', ''), r.get('timestamp', 0), r.get('name', ''))
                       not in redis_keys]

    return sqlite_runs + redis_runs


def _ts_to_date(ts_ms):
    """Epoch-ms -> 'YYYY-MM-DD' (UTC)."""
    return datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc).strftime('%Y-%m-%d')


# ---- Test event handling (only thing needing SQLite) ----

def _handle_test_event(channel: str, data: dict):
    """Persist one pub/sub test event (channel 'ci:test:<status>') to SQLite."""
    status = channel.split(':')[-1]
    # Handle field name mismatches: run_test_cmd publishes 'cmd' for failed/flaked
    # but 'test_cmd' for started events. Same for 'log_key' vs 'log_url'.
    test_cmd = data.get('test_cmd') or data.get('cmd', '')
    log_url = data.get('log_url') or data.get('log_key')
    if log_url and not log_url.startswith('http'):
        log_url = f'http://ci.aztec-labs.com/{log_url}'
    db.execute('''
        INSERT INTO test_events
        (status, test_cmd, log_url, ref_name, commit_hash, commit_author,
         commit_msg, exit_code, duration_secs, is_scenario, owners,
         flake_group_id, dashboard, timestamp)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        status,
        test_cmd,
        log_url,
        data.get('ref_name', ''),
        data.get('commit_hash'),
        data.get('commit_author'),
        data.get('commit_msg'),
        data.get('exit_code'),
        data.get('duration_seconds'),
        1 if data.get('is_scenario_test') else 0,
        # owners list is stored as a JSON string.
        json.dumps(data['owners']) if data.get('owners') else None,
        data.get('flake_group_id'),
        data.get('dashboard', ''),
        data.get('timestamp', datetime.now(timezone.utc).isoformat()),
    ))


def start_test_listener(redis_conn):
    """Subscribe to test event channels only. Reconnects on failure."""
    channels = [b'ci:test:started', b'ci:test:passed', b'ci:test:failed', b'ci:test:flaked']

    def listener():
        backoff = 1
        while True:
            try:
                pubsub = redis_conn.pubsub()
                pubsub.subscribe(*channels)
                backoff = 1  # reset on successful connection
                for message in pubsub.listen():
                    if message['type'] != 'message':
                        continue
                    channel = message['channel']
                    if isinstance(channel, bytes):
                        channel = channel.decode()
                    try:
                        payload = message['data']
                        if isinstance(payload, bytes):
                            payload = payload.decode()
                        _handle_test_event(channel, json.loads(payload))
                    except Exception as e:
                        print(f"[rk_metrics] Error parsing test event: {e}")
            except Exception as e:
                print(f"[rk_metrics] Test listener error (reconnecting in {backoff}s): {e}")
                time.sleep(backoff)
                backoff = min(backoff * 2, 60)

    t = threading.Thread(target=listener, daemon=True, name='test-listener')
    t.start()
    return t


# ---- Sync failed_tests_{section} lists from Redis into SQLite ----

# Same pattern as _ANSI_RE above — reuse the compiled regex rather than
# compiling it a second time.
_ANSI_STRIP = _ANSI_RE
_GRIND_CMD_RE = re.compile(r'/grind\?cmd=([^&\x07"]+)')
_LOG_KEY_RE = re.compile(r'ci\.aztec-labs\.com/([a-f0-9]{16})')
_INLINE_CMD_RE = re.compile(r'(?:grind\)|[0-9a-f]{16}\)):?\s+(.+?)\s+\(\d+s\)')
_DURATION_RE = re.compile(r'\((\d+)s\)')
_AUTHOR_MSG_RE = re.compile(r'\(code: \d+\)\s+\((.+?): (.+?)\)\s*$')
_FLAKE_GROUP_RE = re.compile(r'group:(\S+)')

_failed_tests_sync_ts = 0
_FAILED_TESTS_SYNC_TTL = 3600  # 1 hour


def _parse_failed_test_entry(raw: str, section: str) -> dict | None:
    """Parse an ANSI-formatted failed_tests_{section} entry into structured data.

    Returns None when the entry has no recognisable status or timestamp.
    """
    from urllib.parse import unquote
    clean = _ANSI_STRIP.sub('', raw)

    # Status
    if 'FLAKED' in clean:
        status = 'flaked'
    elif 'FAILED' in clean:
        status = 'failed'
    else:
        return None

    # Timestamp: "02-11 15:11:00: ..."
    ts_match = re.match(r'(\d{2}-\d{2} \d{2}:\d{2}:\d{2})', clean)
    if not ts_match:
        return None
    # Assume current year for MM-DD HH:MM:SS; handle year rollover
    now = datetime.now(timezone.utc)
    year = now.year
    ts_str = f'{year}-{ts_match.group(1)}'
    try:
        parsed_dt = datetime.strptime(ts_str, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
        # If parsed date is in the future, it's from the previous year
        if parsed_dt > now + timedelta(days=1):
            parsed_dt = parsed_dt.replace(year=year - 1)
        timestamp = parsed_dt.isoformat()
    except ValueError:
        return None

    # Log key
    log_key = None
    m = _LOG_KEY_RE.search(raw)
    if m:
        log_key = m.group(1)

    # Test command: try grind link first, then inline text
    test_cmd = ''
    m = _GRIND_CMD_RE.search(raw)
    if m:
        cmd_raw = unquote(m.group(1))
        # Format: "hash:KEY=VAL:KEY=VAL actual_command"
        # Strip the hash:KEY=VAL prefix to get the actual test command
        parts = cmd_raw.split(' ', 1)
        if len(parts) == 2 and ':' in parts[0]:
            test_cmd = parts[1].strip()
        else:
            test_cmd = cmd_raw
    else:
        # Fallback: extract from inline text after log key
        m = _INLINE_CMD_RE.search(clean)
        if m:
            test_cmd = m.group(1).strip()

    # Duration
    duration = None
    m = _DURATION_RE.search(clean)
    if m:
        duration = float(m.group(1))

    # Author and commit message
    author, msg = None, None
    m = _AUTHOR_MSG_RE.search(clean)
    if m:
        author = m.group(1)
        msg = m.group(2)

    # Flake group
    flake_group = None
    m = _FLAKE_GROUP_RE.search(clean)
    if m:
        flake_group = m.group(1)

    return {
        'status': status,
        'test_cmd': test_cmd,
        'log_url': f'http://ci.aztec-labs.com/{log_key}' if log_key else None,
        'log_key': log_key,
        'ref_name': section,  # section is the best ref we have from these lists
        'commit_author': author,
        'commit_msg': msg,
        'duration_secs': duration,
        'flake_group_id': flake_group,
        'timestamp': timestamp,
        'dashboard': section,
    }


def sync_failed_tests_to_sqlite(redis_conn):
    """Read failed_tests_{section} lists from Redis and insert into test_events.

    Rate-limited by _FAILED_TESTS_SYNC_TTL; a call within the TTL window is
    a no-op.
    """
    global _failed_tests_sync_ts
    now = time.time()
    if now - _failed_tests_sync_ts < _FAILED_TESTS_SYNC_TTL:
        return
    _failed_tests_sync_ts = now

    conn = db.get_db()
    # Track existing entries to avoid duplicates: log_url for entries that have one,
    # (test_cmd, timestamp, dashboard) composite key for entries without log_url
    existing_urls = {row['log_url'] for row in conn.execute(
        "SELECT DISTINCT log_url FROM test_events WHERE log_url IS NOT NULL"
    ).fetchall()}
    existing_keys = {(row['test_cmd'], row['timestamp'], row['dashboard']) for row in conn.execute(
        "SELECT test_cmd, timestamp, dashboard FROM test_events WHERE log_url IS NULL"
    ).fetchall()}

    total = 0
    for section in SECTIONS:
        key = f'failed_tests_{section}'
        try:
            entries = redis_conn.lrange(key, 0, -1)
        except Exception as e:
            print(f"[rk_metrics] Error reading {key}: {e}")
            continue

        for entry_bytes in entries:
            raw = entry_bytes.decode() if isinstance(entry_bytes, bytes) else entry_bytes
            parsed = _parse_failed_test_entry(raw, section)
            if not parsed:
                continue
            if parsed['log_url']:
                if parsed['log_url'] in existing_urls:
                    continue
                existing_urls.add(parsed['log_url'])
            else:
                composite = (parsed['test_cmd'], parsed['timestamp'], parsed['dashboard'])
                if composite in existing_keys:
                    continue
                existing_keys.add(composite)
            try:
                conn.execute('''
                    INSERT INTO test_events
                    (status, test_cmd, log_url, ref_name, commit_author,
                     commit_msg, duration_secs, flake_group_id, dashboard,
                     timestamp)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    parsed['status'], parsed['test_cmd'], parsed['log_url'],
                    parsed['ref_name'], parsed['commit_author'],
                    parsed['commit_msg'], parsed['duration_secs'],
                    parsed['flake_group_id'], parsed['dashboard'],
                    parsed['timestamp'],
                ))
                total += 1
            except Exception as e:
                print(f"[rk_metrics] Error inserting test event: {e}")
    conn.commit()
    if total:
        print(f"[rk_metrics] Synced {total} test events from Redis lists")


# ---- Seed loading ----

def _load_seed_data():
    """Load CI runs and test events from ci-run-seed.json.gz if SQLite is empty."""
    import gzip
    from pathlib import Path

    conn = db.get_db()
    ci_count = conn.execute('SELECT COUNT(*) as c FROM ci_runs').fetchone()['c']
    te_count = conn.execute('SELECT COUNT(*) as c FROM test_events').fetchone()['c']
    if ci_count > 0 and te_count > 0:
        return

    seed = Path(__file__).parent / 'ci-run-seed.json.gz'
    if not seed.exists():
        return

    with gzip.open(seed, 'rt') as f:
        data = json.load(f)

    now_iso = datetime.now(timezone.utc).isoformat()

    if ci_count == 0 and data.get('ci_runs'):
        runs = data['ci_runs']
        for run in runs:
            try:
                conn.execute('''
                    INSERT OR IGNORE INTO ci_runs
                    (dashboard, name, timestamp_ms, complete_ms, status, author,
                     pr_number, instance_type, instance_vcpus, spot, cost_usd,
                     job_id, arch, synced_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    run.get('dashboard', ''),
                    run.get('name', ''),
                    run.get('timestamp', 0),
                    run.get('complete'),
                    run.get('status'),
                    run.get('author'),
                    run.get('pr_number'),
                    run.get('instance_type'),
                    run.get('instance_vcpus'),
                    1 if run.get('spot') else 0,
                    run.get('cost_usd'),
                    run.get('job_id', ''),
                    run.get('arch', ''),
                    now_iso,
                ))
            except Exception:
                continue
        conn.commit()
        print(f"[rk_metrics] Loaded {len(runs)} CI runs from seed")

    if te_count == 0 and data.get('test_events'):
        events = data['test_events']
        for ev in events:
            try:
                conn.execute('''
                    INSERT OR IGNORE INTO test_events
                    (status, test_cmd, log_url, ref_name, commit_hash, commit_author,
                     commit_msg, exit_code, duration_secs, is_scenario, owners,
                     flake_group_id, dashboard, timestamp)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    ev.get('status', ''),
                    ev.get('test_cmd', ''),
                    ev.get('log_url'),
                    ev.get('ref_name', ''),
                    ev.get('commit_hash'),
                    ev.get('commit_author'),
                    ev.get('commit_msg'),
                    ev.get('exit_code'),
                    ev.get('duration_secs'),
                    ev.get('is_scenario', 0),
                    ev.get('owners'),
                    ev.get('flake_group_id'),
                    ev.get('dashboard', ''),
                    ev.get('timestamp', ''),
                ))
            except Exception:
                continue
        conn.commit()
        print(f"[rk_metrics] Loaded {len(events)} test events from seed")


# ---- CI run sync (Redis → SQLite) for flake correlation ----

_ci_sync_ts = 0
_CI_SYNC_TTL = 3600  # 1 hour


def sync_ci_runs_to_sqlite(redis_conn):
    """Sync all CI runs from Redis into SQLite for persistence.

    Rate-limited by _CI_SYNC_TTL; a call within the TTL window is a no-op.
    """
    global _ci_sync_ts
    now = time.time()
    if now - _ci_sync_ts < _CI_SYNC_TTL:
        return
    _ci_sync_ts = now

    # Sync everything Redis has (not just 30 days)
    runs = _get_ci_runs_from_redis(redis_conn)

    now_iso = datetime.now(timezone.utc).isoformat()
    conn = db.get_db()
    count = 0
    for run in runs:
        try:
            conn.execute('''
                INSERT OR REPLACE INTO ci_runs
                (dashboard, name, timestamp_ms, complete_ms, status, author,
                 pr_number, instance_type, instance_vcpus, spot, cost_usd,
                 job_id, arch, synced_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                run.get('dashboard', ''),
                run.get('name', ''),
                run.get('timestamp', 0),
                run.get('complete'),
                run.get('status'),
                run.get('author'),
                run.get('pr_number'),
                run.get('instance_type'),
                run.get('instance_vcpus'),
                1 if run.get('spot') else 0,
                run.get('cost_usd'),
                run.get('job_id', ''),
                run.get('arch', ''),
                now_iso,
            ))
            count += 1
        except Exception as e:
            print(f"[rk_metrics] Error syncing run: {e}")
    conn.commit()
    print(f"[rk_metrics] Synced {count} CI runs to SQLite")


def start_ci_run_sync(redis_conn):
    """Start periodic CI run + test event sync thread."""
    _load_seed_data()

    def loop():
        while True:
            try:
                sync_ci_runs_to_sqlite(redis_conn)
                sync_failed_tests_to_sqlite(redis_conn)
            except Exception as e:
                print(f"[rk_metrics] sync error: {e}")
            time.sleep(600)  # check every 10 min (TTL gates actual work)

    t = threading.Thread(target=loop, daemon=True, name='ci-run-sync')
    t.start()
    return t


def get_flakes_by_command(date_from, date_to, dashboard=''):
    """Get flake stats grouped by CI command type (dashboard/section).

    date_from/date_to are 'YYYY-MM-DD' strings; date_to is inclusive.
    An empty dashboard means "all sections with a non-empty dashboard".
    """
    end = date_to + 'T23:59:59'
    # The dashboard filter is either an exact match (parameterized) or the
    # literal "non-empty" predicate; building it once avoids duplicating
    # both queries below per branch.
    if dashboard:
        dash_cond = 'dashboard = ?'
        dash_params = [dashboard]
    else:
        dash_cond = "dashboard != ''"
        dash_params = []

    rows = db.query(f'''
        SELECT dashboard, test_cmd, COUNT(*) as count
        FROM test_events
        WHERE status = 'flaked' AND {dash_cond}
          AND timestamp >= ? AND timestamp < ?
        GROUP BY dashboard, test_cmd
        ORDER BY count DESC
    ''', (*dash_params, date_from, end))

    by_command = {}
    total_flakes = 0
    for row in rows:
        cmd = row['dashboard']
        if cmd not in by_command:
            by_command[cmd] = {'total': 0, 'tests': {}}
        by_command[cmd]['total'] += row['count']
        by_command[cmd]['tests'][row['test_cmd']] = row['count']
        total_flakes += row['count']

    failure_rows = db.query(f'''
        SELECT dashboard, COUNT(*) as count
        FROM test_events
        WHERE status = 'failed' AND {dash_cond}
          AND timestamp >= ? AND timestamp < ?
        GROUP BY dashboard
    ''', (*dash_params, date_from, end))
    failures_by_command = {r['dashboard']: r['count'] for r in failure_rows}

    result_list = []
    for cmd, data in sorted(by_command.items(), key=lambda x: -x[1]['total']):
        top_tests = sorted(data['tests'].items(), key=lambda x: -x[1])[:10]
        result_list.append({
            'command': cmd,
            'total_flakes': data['total'],
            'total_failures': failures_by_command.get(cmd, 0),
            'top_tests': [{'test_cmd': t, 'count': c} for t, c in top_tests],
        })

    return {
        'by_command': result_list,
        'summary': {
            'total_flakes': total_flakes,
            'total_failures': sum(failures_by_command.values()),
        },
    }
#!/usr/bin/env python3
"""Sync ephemeral Redis CI data to persistent SQLite.

Normally run automatically by the ci-metrics server's background sync thread.
Can also be run standalone for a one-off manual sync:

    cd ci3/ci-metrics && python3 sync_to_sqlite.py

Connects to Redis, reads all CI runs and failed test lists, writes to SQLite.
"""
import os
import sys
import time

# Ensure this script can import sibling modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import redis as redis_lib
import db
import metrics

REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))


def main():
    started = time.time()
    redis_conn = redis_lib.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=False)

    # Bail out early if Redis is unreachable.
    try:
        redis_conn.ping()
    except Exception as e:
        print(f"[sync] Cannot connect to Redis at {REDIS_HOST}:{REDIS_PORT}: {e}")
        sys.exit(1)

    # Ensure DB schema is up to date before writing anything.
    db.get_db()

    # Zero the TTL gates so both sync functions actually run instead of
    # short-circuiting on a recent timestamp.
    metrics._ci_sync_ts = 0
    metrics._failed_tests_sync_ts = 0

    print("[sync] Syncing CI runs from Redis to SQLite...")
    metrics.sync_ci_runs_to_sqlite(redis_conn)

    print("[sync] Syncing test events from Redis to SQLite...")
    metrics.sync_failed_tests_to_sqlite(redis_conn)

    # Summarise the resulting SQLite state.
    database = db.get_db()
    ci_count = database.execute('SELECT COUNT(*) as c FROM ci_runs').fetchone()['c']
    te_count = database.execute('SELECT COUNT(*) as c FROM test_events').fetchone()['c']
    elapsed = time.time() - started
    print(f"[sync] Done in {elapsed:.1f}s. SQLite: {ci_count} CI runs, {te_count} test events.")


if __name__ == '__main__':
    main()

ci insights

+ +
+ + + + | + + + | + + + + | + +
+ +
+ + + +
+
daily ci spend
--
+
cost / merge
--
+
mq success rate
--
+
flakes / day
--
+
prs merged / day
--
+
+ + +
+
+

daily ci cost + 7-day rolling cost per merge

+
+
+
+

merge queue: daily outcomes + success rate

+
+
+
+

flakes + test failures per day

+
+
+
+ + +
flakes by pipeline
+
+ + + +
+
+ + +
author ci profile
+
+ + + +
+
+ + + + + diff --git a/ci3/ci-metrics/views/cost-overview.html b/ci3/ci-metrics/views/cost-overview.html new file mode 100644 index 000000000000..53424a2d2d70 --- /dev/null +++ b/ci3/ci-metrics/views/cost-overview.html @@ -0,0 +1,905 @@ + + + + + ACI - Cost Overview + + + + + +

cost overview

+ +
+ + + + | + + + | + + + + | + +
+ +
+ +
+
Overview
+
Resource Details
+
CI Attribution
+
+ +
+
+ +
+
+

combined daily spend

+
+
+
+

service category breakdown

+
+
+
+

aws vs gcp split

+
+
+
+ + + + +
+
+ +
+
+
+
+ + + +
+
+
+ +
+
+ +
+
+
+

ci cost by run type (time series)

+
+
+
+

cost by user (AWS + GCP)

+
+
+
+

cost by run type

+
+
+
+

instances

+
+ + + +
+
+
+ + + + + diff --git a/ci3/ci-metrics/views/test-timings.html b/ci3/ci-metrics/views/test-timings.html new file mode 100644 index 000000000000..0bf6c7213bd6 --- /dev/null +++ b/ci3/ci-metrics/views/test-timings.html @@ -0,0 +1,289 @@ + + + + + ACI - Test Timings + + + + + +

test timings

+ +
+ + + + + | + + + | + + + | + + +
+ +
loading...
+ +
+ +
+
+

avg duration by day

+
+
+
+

test run count by day

+
+
+
+ +

tests by duration

+
+ + + + + + + + + + + + + + + + +
test commandrunsavg (s)min (s)max (s)total (h)pass %passedfailedflaked
+
+ +

slowest individual runs

+
+ + + + + + + + + + + + + +
test commandduration (s)statusdateauthorpipelinelog
+
+ + + + + diff --git a/ci3/dashboard/Dockerfile b/ci3/dashboard/Dockerfile index 2ca190fd9753..2da7805ffa83 100644 --- a/ci3/dashboard/Dockerfile +++ b/ci3/dashboard/Dockerfile @@ -16,7 +16,12 @@ RUN apt update && apt install -y \ WORKDIR /app COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt gunicorn + +# Install ci-metrics dependencies (ci-metrics runs as subprocess) +COPY ci-metrics/requirements.txt ci-metrics/requirements.txt +RUN pip install --no-cache-dir -r ci-metrics/requirements.txt + RUN git config --global --add safe.directory /aztec-packages COPY . . -EXPOSE 8080 +EXPOSE 8080 8081 CMD ["gunicorn", "-w", "100", "-b", "0.0.0.0:8080", "rk:app"] diff --git a/ci3/dashboard/deploy.sh b/ci3/dashboard/deploy.sh index cc417006d072..1d9e930e95a1 100755 --- a/ci3/dashboard/deploy.sh +++ b/ci3/dashboard/deploy.sh @@ -1,7 +1,13 @@ #!/bin/bash set -euo pipefail -rsync -avz --exclude='deploy.sh' -e "ssh -i ~/.ssh/build_instance_key" * ubuntu@ci.aztec-labs.com:rk +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Sync dashboard (rkapp) files +rsync -avz --exclude='deploy.sh' -e "ssh -i ~/.ssh/build_instance_key" "$SCRIPT_DIR"/* ubuntu@ci.aztec-labs.com:rk + +# Sync ci-metrics server (started as subprocess by rkapp) +rsync -avz -e "ssh -i ~/.ssh/build_instance_key" "$SCRIPT_DIR/../ci-metrics/" ubuntu@ci.aztec-labs.com:rk/ci-metrics/ ssh -i ~/.ssh/build_instance_key ubuntu@ci.aztec-labs.com " cd rk diff --git a/ci3/dashboard/rk.py b/ci3/dashboard/rk.py index 4e194cbc3a10..aedf35a824e2 100644 --- a/ci3/dashboard/rk.py +++ b/ci3/dashboard/rk.py @@ -18,13 +18,40 @@ YELLOW, BLUE, GREEN, RED, PURPLE, BOLD, RESET, hyperlink, r, get_section_data, get_list_as_string ) - LOGS_DISK_PATH = os.getenv('LOGS_DISK_PATH', '/logs-disk') DASHBOARD_PASSWORD = os.getenv('DASHBOARD_PASSWORD', 'password') +CI_METRICS_PORT = int(os.getenv('CI_METRICS_PORT', '8081')) +CI_METRICS_URL = os.getenv('CI_METRICS_URL', 
f'http://localhost:{CI_METRICS_PORT}') + app = Flask(__name__) Compress(app) auth = HTTPBasicAuth() +# Start the ci-metrics server as a subprocess +# Check sibling dir (repo layout) then subdirectory (Docker layout) +_ci_metrics_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'ci-metrics') +if not os.path.isdir(_ci_metrics_dir): + _ci_metrics_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ci-metrics') +if os.path.isdir(_ci_metrics_dir): + # Kill any stale process on the port (e.g. leftover from previous reload) + import signal + try: + out = subprocess.check_output( + ['lsof', '-ti', f':{CI_METRICS_PORT}'], stderr=subprocess.DEVNULL, text=True) + for pid in out.strip().split('\n'): + if pid: + os.kill(int(pid), signal.SIGTERM) + import time; time.sleep(0.5) + except (subprocess.CalledProcessError, OSError): + pass + _ci_metrics_env = {**os.environ, 'CI_METRICS_PORT': str(CI_METRICS_PORT)} + subprocess.Popen( + ['gunicorn', '-w', '4', '-b', f'0.0.0.0:{CI_METRICS_PORT}', '--timeout', '120', 'app:app'], + cwd=_ci_metrics_dir, + env=_ci_metrics_env, + ) + print(f"[rk.py] ci-metrics server started on port {CI_METRICS_PORT}") + def read_from_disk(key): """Read log from disk as fallback when Redis key not found.""" try: @@ -145,6 +172,14 @@ def root() -> str: f"{hyperlink('https://aztecprotocol.github.io/benchmark-page-data/bench?branch=next', 'next')}\n" f"{hyperlink('/chonk-breakdowns', 'chonk breakdowns')}\n" f"{RESET}" + f"\n" + f"CI Metrics:\n" + f"\n{YELLOW}" + f"{hyperlink('/cost-overview', 'cost overview (AWS + GCP)')}\n" + f"{hyperlink('/namespace-billing', 'namespace billing')}\n" + f"{hyperlink('/ci-insights', 'ci insights')}\n" + f"{hyperlink('/test-timings', 'test timings')}\n" + f"{RESET}" ) def section_view(section: str) -> str: @@ -487,6 +522,57 @@ def make_options(param_name, options, current_value, suffix=''): # Redirect to log view. 
return redirect(f'/{run_id}') + +# ---- Reverse proxy to ci-metrics server ---- + +_proxy_session = requests.Session() +_HOP_BY_HOP = frozenset([ + 'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', + 'te', 'trailers', 'transfer-encoding', 'upgrade', 'content-length', + # `requests` auto-decompresses gzip responses, so Content-Encoding is + # stale — strip it so the browser doesn't try to decompress plain content. + # Flask-Compress on rkapp handles browser compression. + 'content-encoding', +]) +# Don't forward Accept-Encoding — let `requests` negotiate with ci-metrics +# (it adds its own and auto-decompresses). +_STRIP_REQUEST_HEADERS = frozenset(['host', 'accept-encoding']) + +def _proxy(path): + """Forward request to ci-metrics, streaming the response back.""" + url = f'{CI_METRICS_URL}/{path.lstrip("/")}' + try: + resp = _proxy_session.request( + method=request.method, + url=url, + params=request.args, + data=request.get_data(), + headers={k: v for k, v in request.headers if k.lower() not in _STRIP_REQUEST_HEADERS}, + stream=True, + timeout=60, + ) + # Strip hop-by-hop headers + headers = {k: v for k, v in resp.headers.items() if k.lower() not in _HOP_BY_HOP} + return Response(resp.iter_content(chunk_size=8192), + status=resp.status_code, headers=headers) + except Exception as e: + return Response(json.dumps({'error': f'ci-metrics unavailable: {e}'}), + mimetype='application/json', status=502) + +@app.route('/namespace-billing') +@app.route('/ci-health') +@app.route('/ci-insights') +@app.route('/cost-overview') +@app.route('/test-timings') +@auth.login_required +def proxy_dashboard(): + return _proxy(request.path) + +@app.route('/api/', methods=['GET', 'POST', 'PUT', 'DELETE']) +@auth.login_required +def proxy_api(path): + return _proxy(f'/api/{path}') + @app.route('/') @auth.login_required def get_value(key): diff --git a/ci3/log_ci_run b/ci3/log_ci_run index 5c9567ae91dd..b52b93256edc 100755 --- a/ci3/log_ci_run +++ b/ci3/log_ci_run @@ 
-35,6 +35,14 @@ if [ -z "$key" ]; then author="$(git log -1 --pretty=format:"%an")" name=$REF_NAME [ "$(aws_get_meta_data instance-life-cycle)" == "spot" ] && spot=true || spot=false + instance_type=$(aws_get_meta_data instance-type 2>/dev/null || echo "unknown") + instance_vcpus=$(nproc 2>/dev/null || echo 0) + + # Extract PR number from branch name or merge queue ref + pr_number="" + if [[ "$REF_NAME" =~ [Pp][Rr]-?([0-9]+) ]]; then + pr_number="${BASH_REMATCH[1]}" + fi # If this is github merge queue, just keep the queue name. if [[ "$name" =~ ^gh-readonly-queue/([^/]+)/ ]]; then @@ -42,6 +50,7 @@ if [ -z "$key" ]; then fi msg=$(pr_link "$msg") + dashboard="${range_key#ci-run-}" json=$(jq -c -j -n \ --argjson timestamp "$key" \ @@ -53,7 +62,12 @@ if [ -z "$key" ]; then --arg author "$author" \ --arg arch "$(arch)" \ --argjson spot "$spot" \ - '{timestamp: $timestamp, run_id: $run_id, job_id: $job_id, status: $status, msg: $msg, name: $name, author: $author, arch: $arch, spot: $spot}') + --arg instance_type "$instance_type" \ + --argjson instance_vcpus "$instance_vcpus" \ + --arg pr_number "$pr_number" \ + --arg dashboard "$dashboard" \ + --arg github_actor "${GITHUB_ACTOR:-}" \ + '{timestamp: $timestamp, run_id: $run_id, job_id: $job_id, status: $status, msg: $msg, name: $name, author: $author, github_actor: $github_actor, arch: $arch, spot: $spot, instance_type: $instance_type, instance_vcpus: $instance_vcpus, pr_number: $pr_number, dashboard: $dashboard}') # echo "$json" >&2 redis_cli ZADD $range_key $key "$json" &>/dev/null redis_cli SETEX hb-$key 60 1 &>/dev/null diff --git a/ci3/run_test_cmd b/ci3/run_test_cmd index 66334e535f27..35c37c4d0c42 100755 --- a/ci3/run_test_cmd +++ b/ci3/run_test_cmd @@ -160,7 +160,8 @@ if [ "$publish" -eq 1 ]; then --arg commit_hash "$COMMIT_HASH" \ --arg commit_author "$COMMIT_AUTHOR" \ --arg commit_msg "$COMMIT_MSG" \ - '{status: $status, test_cmd: $test_cmd, log_id: $log_id, log_url: $log_url, ref_name: $ref_name, commit_hash: 
$commit_hash, commit_author: $commit_author, commit_msg: $commit_msg, timestamp: now | todate}') + --arg dashboard "${CI_DASHBOARD:-}" \ + '{status: $status, test_cmd: $test_cmd, log_id: $log_id, log_url: $log_url, ref_name: $ref_name, commit_hash: $commit_hash, commit_author: $commit_author, commit_msg: $commit_msg, dashboard: $dashboard, timestamp: now | todate}') redis_publish "ci:test:started" "$start_redis_data" fi @@ -228,15 +229,16 @@ function track_test_failed { function publish_redis { local redis_data=$(jq -n \ --arg status "$1" \ - --arg cmd "$cmd" \ - --arg log_key "$log_key" \ - --arg ref_name "$REF_NAME" \ + --arg test_cmd "$cmd" \ + --arg log_url "http://ci.aztec-labs.com/$log_key" \ + --arg ref_name "${TARGET_BRANCH:-$REF_NAME}" \ --arg commit_hash "$COMMIT_HASH" \ --arg commit_author "$COMMIT_AUTHOR" \ --arg commit_msg "$COMMIT_MSG" \ --argjson code "$code" \ --argjson duration "$SECONDS" \ - '{status: $status, cmd: $cmd, log_key: $log_key, ref_name: $ref_name, commit_hash: $commit_hash, commit_author: $commit_author, commit_msg: $commit_msg, exit_code: $code, duration_seconds: $duration, timestamp: now | todate}') + --arg dashboard "${CI_DASHBOARD:-}" \ + '{status: $status, test_cmd: $test_cmd, log_url: $log_url, ref_name: $ref_name, commit_hash: $commit_hash, commit_author: $commit_author, commit_msg: $commit_msg, exit_code: $code, duration_seconds: $duration, dashboard: $dashboard, timestamp: now | todate}') redis_publish "ci:test:$1" "$redis_data" } @@ -247,6 +249,8 @@ function pass { local line="${green}PASSED${reset}${log_info:-}: $test_cmd (${SECONDS}s)" echo -e "$line" + [ "$publish" -eq 1 ] && publish_redis "passed" + if [ "$track_test_history" -eq 1 ]; then local track_line="${green}PASSED${reset}${log_info:-} ${fail_links}: $test_cmd (${SECONDS}s) (${purple}$COMMIT_AUTHOR${reset}: $COMMIT_MSG)" track_test_history "$track_line"