Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
abb3506
fix: ci-metrics deployment fixes (race condition, caching, nav)
ludamad Feb 13, 2026
b9f99e3
feat: improve CI metrics data quality and dashboards
ludamad Feb 13, 2026
e5cf0c1
fix: improve CloudTrail correlation and drop spurious slack notify ch…
ludamad Feb 13, 2026
c540b09
fix: rewrite CloudTrail resolver for 90% instance type coverage
ludamad Feb 13, 2026
43f687e
fix: make SQLite the source of truth for CI runs
ludamad Feb 13, 2026
5e1395e
fix: add missing EC2 instance rates and recalculate costs
ludamad Feb 13, 2026
13b098e
fix: return unknown cost when instance type and vCPUs are both unknown
ludamad Feb 13, 2026
9283811
feat: consolidate CI Insights, Test Timings, and Attribution into one…
ludamad Feb 13, 2026
12700d6
fix: remove gunicorn --preload to prevent ci-metrics deadlock
ludamad Feb 13, 2026
a2b4b18
fix: improve test data pipeline for Test Details page
ludamad Feb 13, 2026
3c14176
fix: prevent double compression in ci-metrics proxy
ludamad Feb 14, 2026
88ea736
fix: pass through compression directly in ci-metrics proxy
ludamad Feb 14, 2026
b343a77
test: add proxy compression and view structure tests for ci-metrics
ludamad Feb 14, 2026
2d39c79
revert: remove lip-service ci-metrics tests
ludamad Feb 14, 2026
b9273e7
feat: add CI run duration charts and pre-aggregated stats table
ludamad Feb 19, 2026
8c7a746
feat: add CI phase timing instrumentation
ludamad Feb 19, 2026
4157226
feat: phase chart grouped by pipeline with totals on top
ludamad Feb 19, 2026
4811fd3
fix: phase chart shows total time per pipeline, per-circuit timing
ludamad Feb 19, 2026
8f1f870
feat: total CI time chart, skip 0-duration phases, bb cache bust
ludamad Feb 19, 2026
0dcc90c
fix: use EPOCHREALTIME for ms-precision phase timing, filter < 0.1s
ludamad Feb 20, 2026
f3f25bc
feat: P95 phase timing per run, drop cache transfer tracking
ludamad Feb 20, 2026
1018cf0
feat: CI health report presentation
ludamad Feb 20, 2026
f515a1f
feat: add /ci-health-report route
ludamad Feb 20, 2026
72b2858
feat: proxy /ci-health-report through rk dashboard
ludamad Feb 20, 2026
95922e7
fix: use awk instead of bc for phase timing (bc not on CI runners)
ludamad Feb 20, 2026
4846e72
chore: gitignore ci-health-report
ludamad Feb 23, 2026
2366940
feat: flake-prs page + one-time cost separation in cost overview
ludamad Feb 23, 2026
856d18a
feat: add /commits page — web port of scripts/commits
ludamad Feb 23, 2026
ce6d745
fix: add rk.py entry point for gunicorn rk:app
ludamad Feb 23, 2026
4912e6f
feat: commits page v2 — infinite scroll, path filters, rkapp Redis ca…
ludamad Feb 23, 2026
bc52de3
fix: commits page — 100/page with arrows, disk cache on /data, drop p…
ludamad Feb 23, 2026
00d0138
feat: SQLite response cache for all 6 ci-insights API endpoints
ludamad Feb 24, 2026
59c7f9f
feat: cache TTL logic + stashed view/rk.py changes
ludamad Feb 24, 2026
67292b1
perf: push GROUP BY into SQL for test_timings, add duration_secs DESC…
ludamad Feb 24, 2026
146b814
perf: restore SQLite response cache on all 6 ci-insights endpoints
ludamad Feb 24, 2026
5aeb571
feat: persist pr_cache to SQLite, duration_secs in daily stats, gunic…
ludamad Feb 24, 2026
c5d04c8
refactor: remove commits/history code from rk.py (ci-metrics concern)
ludamad Feb 24, 2026
5776067
fix: hash_str_orig uses byte length; cache_cleanup uses unixepoch
ludamad Feb 24, 2026
6b8076d
Delete .claude/skills/ci-logs/ci-logs.md
ludamad Feb 24, 2026
fa1f2c1
revert: drop spurious bb comment, log_ci_run and source_bootstrap cha…
ludamad Feb 24, 2026
9619d30
-
ludamad Feb 24, 2026
3c6f106
.
ludamad Feb 24, 2026
e4f633b
Delete ci3/dashboard/gunicorn.conf.py
ludamad Feb 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
643 changes: 472 additions & 171 deletions ci3/ci-metrics/app.py

Large diffs are not rendered by default.

43 changes: 39 additions & 4 deletions ci3/ci-metrics/billing/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
# Messaging
'Amazon Simple Notification Service': 'sns',
'Amazon Simple Queue Service': 'sqs',
# Savings Plans / Reserved Instances
'Savings Plans for AWS Compute usage': 'savings_plans',
# Other
'Tax': 'tax',
'AWS Support (Business)': 'support',
Expand All @@ -63,6 +65,16 @@

import re

# One-time contract payments: annual Savings Plan upfronts and monthly Reserved Instance charges.
# These appear as large single-day spikes but are not operational spend.
_ONE_TIME_CATEGORIES = frozenset({
'savings_plan_1yr_annual',
'savings_plan_3yr_annual',
'savings_plan_1yr_annual_partial',
'savings_plan_3yr_annual_partial',
'reserved_instance_monthly',
})

_cache = {'rows': [], 'ts': 0}
_cache_lock = threading.Lock()
_detail_cache = {'rows': [], 'ts': 0}
Expand Down Expand Up @@ -152,7 +164,10 @@ def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]:
TimePeriod={'Start': date_from, 'End': date_to},
Granularity='DAILY',
Metrics=['UnblendedCost'],
GroupBy=[{'Type': 'DIMENSION', 'Key': 'SERVICE'}],
GroupBy=[
{'Type': 'DIMENSION', 'Key': 'SERVICE'},
{'Type': 'DIMENSION', 'Key': 'USAGE_TYPE'},
],
)
if next_token:
kwargs['NextPageToken'] = next_token
Expand All @@ -163,12 +178,26 @@ def _fetch_aws_costs(date_from: str, date_to: str) -> list[dict]:
date = result['TimePeriod']['Start']
for group in result['Groups']:
service = group['Keys'][0]
usage_type = group['Keys'][1] if len(group['Keys']) > 1 else ''
amount = float(group['Metrics']['UnblendedCost']['Amount'])
if amount == 0:
continue
category = SERVICE_CATEGORY_MAP.get(service, 'other')
# Savings plans: ComputeSP:1yrAllUpfront, ComputeSP:3yrNoUpfront, etc.
if category == 'savings_plans':
m = re.match(r'ComputeSP:(\d+yr)(\w+)', usage_type)
if m:
term = m.group(1)
payment = m.group(2)
if payment == 'NoUpfront':
category = f'savings_plan_{term}_monthly'
elif 'Upfront' in payment:
category = f'savings_plan_{term}_annual'
# EC2 reserved instances: HeavyUsage:<type> billed monthly on 1st
elif category == 'ec2' and 'HeavyUsage:' in usage_type:
category = 'reserved_instance_monthly'
if category == 'other':
print(f"[rk_aws_costs] unmapped service: {service!r} (${amount:.2f})")
print(f"[rk_aws_costs] unmapped service: {service!r} / {usage_type!r} (${amount:.2f})")
rows.append({
'date': date,
'service': service,
Expand Down Expand Up @@ -322,26 +351,32 @@ def get_costs_overview(date_from: str, date_to: str) -> dict:
for r in aws_rows:
d = r['date']
if d not in by_date:
by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0}
by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0, 'aws_one_time': 0}
cat = r['category']
by_date[d]['aws'][cat] = by_date[d]['aws'].get(cat, 0) + r['amount_usd']
by_date[d]['aws_total'] += r['amount_usd']
if cat in _ONE_TIME_CATEGORIES:
by_date[d]['aws_one_time'] += r['amount_usd']

for d, cats in gcp_by_date.items():
if d not in by_date:
by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0}
by_date[d] = {'date': d, 'aws': {}, 'gcp': {}, 'aws_total': 0, 'gcp_total': 0, 'aws_one_time': 0}
by_date[d]['gcp'] = cats
by_date[d]['gcp_total'] = sum(cats.values())

sorted_dates = sorted(by_date.values(), key=lambda x: x['date'])
aws_total = sum(d['aws_total'] for d in sorted_dates)
aws_one_time = sum(d['aws_one_time'] for d in sorted_dates)
gcp_total = sum(d['gcp_total'] for d in sorted_dates)

return {
'by_date': sorted_dates,
'totals': {
'aws': round(aws_total, 2),
'aws_operational': round(aws_total - aws_one_time, 2),
'aws_one_time': round(aws_one_time, 2),
'gcp': round(gcp_total, 2),
'combined': round(aws_total + gcp_total, 2),
'combined_operational': round(aws_total - aws_one_time + gcp_total, 2),
}
}
122 changes: 121 additions & 1 deletion ci3/ci-metrics/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
Stores test events (from Redis pub/sub) and merge queue daily stats
(backfilled from GitHub API).
"""
import json
import os
import sqlite3
import threading
import time

_DB_PATH = os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db')
_DB_PATH = os.getenv('METRICS_DB_PATH',
os.path.join(os.getenv('LOGS_DISK_PATH', '/logs-disk'), 'metrics.db'))
_local = threading.local()

SCHEMA = """
Expand All @@ -34,6 +37,7 @@
CREATE INDEX IF NOT EXISTS idx_test_events_ts ON test_events(timestamp);
CREATE INDEX IF NOT EXISTS idx_test_events_cmd ON test_events(test_cmd);
CREATE INDEX IF NOT EXISTS idx_test_events_dashboard ON test_events(dashboard);
CREATE INDEX IF NOT EXISTS idx_test_events_status_ts ON test_events(status, timestamp);

CREATE TABLE IF NOT EXISTS merge_queue_daily (
date TEXT PRIMARY KEY,
Expand Down Expand Up @@ -64,6 +68,84 @@
CREATE INDEX IF NOT EXISTS idx_ci_runs_ts ON ci_runs(timestamp_ms);
CREATE INDEX IF NOT EXISTS idx_ci_runs_name ON ci_runs(name);
CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard);

CREATE TABLE IF NOT EXISTS test_daily_stats (
date TEXT NOT NULL,
test_cmd TEXT NOT NULL,
dashboard TEXT NOT NULL DEFAULT '',
passed INTEGER NOT NULL DEFAULT 0,
failed INTEGER NOT NULL DEFAULT 0,
flaked INTEGER NOT NULL DEFAULT 0,
total_secs REAL NOT NULL DEFAULT 0,
count_timed INTEGER NOT NULL DEFAULT 0,
min_secs REAL,
max_secs REAL,
PRIMARY KEY (date, test_cmd, dashboard)
);
CREATE INDEX IF NOT EXISTS idx_tds_date ON test_daily_stats(date);
CREATE INDEX IF NOT EXISTS idx_tds_dashboard ON test_daily_stats(dashboard);

CREATE TABLE IF NOT EXISTS merge_queue_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL,
depth INTEGER NOT NULL,
entries_json TEXT
);
CREATE INDEX IF NOT EXISTS idx_mqs_ts ON merge_queue_snapshots(timestamp);

CREATE TABLE IF NOT EXISTS ci_run_daily_stats (
date TEXT NOT NULL,
dashboard TEXT NOT NULL,
run_count INTEGER NOT NULL DEFAULT 0,
passed INTEGER NOT NULL DEFAULT 0,
failed INTEGER NOT NULL DEFAULT 0,
sum_duration REAL NOT NULL DEFAULT 0,
min_duration REAL,
max_duration REAL,
p50_duration REAL,
p95_duration REAL,
PRIMARY KEY (date, dashboard)
);
CREATE INDEX IF NOT EXISTS idx_crds_date ON ci_run_daily_stats(date);

CREATE TABLE IF NOT EXISTS ci_phases (
id INTEGER PRIMARY KEY AUTOINCREMENT,
phase TEXT NOT NULL,
duration_secs REAL NOT NULL,
exit_code INTEGER,
run_id TEXT,
job_id TEXT,
dashboard TEXT NOT NULL DEFAULT '',
ref_name TEXT,
commit_hash TEXT,
timestamp TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_ci_phases_run ON ci_phases(run_id);
CREATE INDEX IF NOT EXISTS idx_ci_phases_ts ON ci_phases(timestamp);
CREATE INDEX IF NOT EXISTS idx_ci_phases_phase ON ci_phases(phase);

CREATE TABLE IF NOT EXISTS pr_authors (
pr_number INTEGER PRIMARY KEY,
author TEXT NOT NULL,
title TEXT NOT NULL DEFAULT '',
branch TEXT NOT NULL DEFAULT '',
additions INTEGER DEFAULT 0,
deletions INTEGER DEFAULT 0,
fetched_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS api_cache (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
created_at REAL NOT NULL,
ttl_secs INTEGER NOT NULL DEFAULT 300
);

CREATE TABLE IF NOT EXISTS pr_cache (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at REAL NOT NULL
);
"""


Expand All @@ -73,6 +155,16 @@
"ALTER TABLE ci_runs ADD COLUMN job_id TEXT DEFAULT ''",
"ALTER TABLE ci_runs ADD COLUMN arch TEXT DEFAULT ''",
"CREATE INDEX IF NOT EXISTS idx_ci_runs_dashboard ON ci_runs(dashboard)",
"ALTER TABLE test_events ADD COLUMN test_hash TEXT",
"CREATE INDEX IF NOT EXISTS idx_test_events_hash ON test_events(test_hash)",
"ALTER TABLE merge_queue_daily ADD COLUMN avg_depth REAL",
"ALTER TABLE merge_queue_daily ADD COLUMN peak_depth INTEGER",
"CREATE INDEX IF NOT EXISTS idx_test_events_duration_ts ON test_events(timestamp) WHERE duration_secs IS NOT NULL AND duration_secs > 0",
"ALTER TABLE test_daily_stats ADD COLUMN total_secs REAL NOT NULL DEFAULT 0",
"ALTER TABLE test_daily_stats ADD COLUMN count_timed INTEGER NOT NULL DEFAULT 0",
"ALTER TABLE test_daily_stats ADD COLUMN min_secs REAL",
"ALTER TABLE test_daily_stats ADD COLUMN max_secs REAL",
"CREATE INDEX IF NOT EXISTS idx_test_events_duration ON test_events(duration_secs DESC) WHERE duration_secs IS NOT NULL AND duration_secs > 0",
]


Expand Down Expand Up @@ -105,3 +197,31 @@ def execute(sql: str, params=()):
conn = get_db()
conn.execute(sql, params)
conn.commit()


def cache_get(key: str):
    """Look up *key* in the api_cache table.

    Returns the JSON-decoded payload when a row exists and its TTL has not
    yet elapsed; otherwise returns None.  Expired rows are left in place —
    cache_cleanup() is responsible for purging them.
    """
    hit = query('SELECT value, created_at, ttl_secs FROM api_cache WHERE key = ?', (key,))
    if not hit:
        return None
    row = hit[0]
    age = time.time() - row['created_at']
    if age >= row['ttl_secs']:
        # Entry has expired; treat as a miss.
        return None
    return json.loads(row['value'])


def cache_set(key: str, data, ttl_secs: int = 300) -> None:
    """Serialize *data* to JSON and upsert it into api_cache with *ttl_secs*.

    default=str stringifies non-JSON-native values (e.g. datetimes) rather
    than raising; created_at is recorded as the current wall-clock time.
    """
    payload = json.dumps(data, default=str)
    stored_at = time.time()
    execute(
        'INSERT OR REPLACE INTO api_cache (key, value, created_at, ttl_secs) VALUES (?, ?, ?, ?)',
        (key, payload, stored_at, ttl_secs),
    )


def cache_invalidate_prefix(prefix: str) -> None:
    """Drop every api_cache row whose key begins with *prefix*."""
    pattern = prefix + '%'
    execute('DELETE FROM api_cache WHERE key LIKE ?', (pattern,))


def cache_cleanup() -> None:
    """Purge api_cache rows whose TTL has elapsed.

    Expiry is evaluated inside SQLite: a row is stale once
    created_at + ttl_secs is earlier than unixepoch('now').
    """
    sql = "DELETE FROM api_cache WHERE created_at + ttl_secs < unixepoch('now')"
    execute(sql)
31 changes: 25 additions & 6 deletions ci3/ci-metrics/ec2_pricing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,20 @@
# ---- Hardcoded fallback rates (us-east-2, USD/hr) ----

_HARDCODED_RATES = {
('m6a.48xlarge', True): 8.31, # spot
('m6a.48xlarge', False): 16.56, # on-demand
('m6a.32xlarge', True): 5.54,
('m6a.32xlarge', False): 11.04,
('m6a.xlarge', True): 0.07, # spot
('m6a.xlarge', False): 0.1728, # on-demand
('m6a.4xlarge', True): 0.28,
('m6a.4xlarge', False): 0.6912,
('m6a.8xlarge', True): 0.55,
('m6a.8xlarge', False): 1.3824,
('m6a.16xlarge', True): 2.77,
('m6a.16xlarge', False): 5.52,
('m6a.24xlarge', True): 1.66,
('m6a.24xlarge', False): 4.1472,
('m6a.32xlarge', True): 5.54,
('m6a.32xlarge', False): 11.04,
('m6a.48xlarge', True): 8.31,
('m6a.48xlarge', False): 16.56,
('m7a.48xlarge', True): 8.31,
('m7a.48xlarge', False): 16.56,
('m7a.16xlarge', True): 2.77,
Expand Down Expand Up @@ -145,8 +153,19 @@ def _fetch_all_spot(instance_types: list[str]) -> dict[str, float]:
# ---- Cache refresh ----

def _get_known_instance_types() -> list[str]:
    """Return the set of instance types we need pricing for (hardcoded + from DB)."""
    known = {instance_type for instance_type, _ in _HARDCODED_RATES}
    try:
        import db
        sql = (
            "SELECT DISTINCT instance_type FROM ci_runs "
            "WHERE instance_type IS NOT NULL AND instance_type != '' AND instance_type != 'unknown'"
        )
        for row in db.get_db().execute(sql).fetchall():
            known.add(row['instance_type'])
    except Exception:
        # Best-effort: if the metrics DB is unavailable, fall back to the
        # hardcoded set only.
        pass
    return sorted(known)


def _refresh_cache():
Expand Down
Loading
Loading