From d567ce6872180d27afaab5d63a32237852e8a597 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Mon, 10 Nov 2025 19:20:22 -0800
Subject: [PATCH 01/31] Add nightly test CI monitor workflow

Add dedicated monitoring for nightly test runs to track job failures and
durations over time (performance/accuracy metric parsing is left as a TODO):

- New nightly_monitor.py script that analyzes nightly test workflow runs
  - Tracks job success/failure rates
  - Calculates average duration per job
  - Detects high failure rates (>30%)
  - Flags jobs with 3 or more recent failures (reported as "consecutive failures")
  - Generates daily trend reports

- New nightly-monitor.yml workflow that runs daily at 8 AM UTC
  - Analyzes last 7 days of nightly test runs
  - Uploads detailed statistics as artifacts
  - Reports regressions via GitHub Actions output
  - Can be triggered manually with custom date range

- Updated ci_analyzer.py to include nightly-test-8-gpu-b200 job
  in tracking list for general CI monitoring
---
 .github/workflows/nightly-monitor.yml | 119 +++++++++
 scripts/ci_monitor/ci_analyzer.py     |   2 +
 scripts/ci_monitor/nightly_monitor.py | 357 ++++++++++++++++++++++++++
 3 files changed, 478 insertions(+)
 create mode 100644 .github/workflows/nightly-monitor.yml
 create mode 100644 scripts/ci_monitor/nightly_monitor.py

diff --git a/.github/workflows/nightly-monitor.yml b/.github/workflows/nightly-monitor.yml
new file mode 100644
index 000000000000..bdf9ed260481
--- /dev/null
+++ b/.github/workflows/nightly-monitor.yml
@@ -0,0 +1,119 @@
+name: Nightly Test Monitor
+
+on:
+  schedule:
+    - cron: '0 8 * * *' # Run daily at 8 AM UTC (after nightly tests typically complete)
+  workflow_dispatch:
+    inputs:
+      days:
+        description: 'Number of days to analyze'
+        required: false
+        default: '7'
+        type: string
+
+concurrency:
+  group: nightly-monitor-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: write
+  actions: read
+  issues: write
+
+jobs:
+  nightly-monitor:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests
+
+      - name: Run Nightly Test Monitor
+        id: monitor
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          PYTHONUNBUFFERED: 1
+          PYTHONIOENCODING: utf-8
+        run: |
+          cd scripts/ci_monitor
+          python nightly_monitor.py \
+            --token $GITHUB_TOKEN \
+            --days ${{ inputs.days || '7' }} \
+            --output nightly_monitor_$(date +%Y%m%d_%H%M%S).json
+        continue-on-error: true
+
+      - name: Upload Monitor Results
+        uses: actions/upload-artifact@v4
+        with:
+          name: nightly-monitor-results-${{ github.run_number }}
+          path: |
+            scripts/ci_monitor/nightly_monitor_*.json
+          retention-days: 90
+
+      - name: Comment on Issues if Regressions Detected
+        if: steps.monitor.outcome == 'failure'
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+
+            // Find the latest monitor output file
+            const files = fs.readdirSync('scripts/ci_monitor')
+              .filter(f => f.startsWith('nightly_monitor_') && f.endsWith('.json'))
+              .sort()
+              .reverse();
+
+            if (files.length === 0) {
+              console.log('No monitor output file found');
+              return;
+            }
+
+            const filePath = path.join('scripts/ci_monitor', files[0]);
+            const stats = JSON.parse(fs.readFileSync(filePath, 'utf8'));
+
+            // Create a summary of regressions
+            let regressionSummary = '## ⚠️ Nightly Test Regressions Detected\n\n';
+            regressionSummary += `**Report Date:** ${new Date().toISOString().split('T')[0]}\n`;
+            regressionSummary += `**Analysis Period:** Last ${{ inputs.days || '7' }} days\n\n`;
+            regressionSummary += `### Summary\n`;
+            regressionSummary += `- Total Runs: ${stats.total_runs}\n`;
+            regressionSummary += `- Failed Runs: ${stats.failed_runs} (${(stats.failed_runs/Math.max(1,stats.total_runs)*100).toFixed(1)}%)\n\n`;
+
+            regressionSummary += `### Jobs with High Failure Rates (>30%)\n\n`;
+            regressionSummary += `| Job Name | Failure Rate | Failures/Total |\n`;
+            regressionSummary += `|----------|--------------|----------------|\n`;
+
+            let hasRegressions = false;
+            for (const [jobName, jobStat] of Object.entries(stats.job_stats)) {
+              if (jobStat.total > 0) {
+                const failureRate = (jobStat.failure / jobStat.total) * 100;
+                if (failureRate > 30) {
+                  hasRegressions = true;
+                  regressionSummary += `| ${jobName} | ${failureRate.toFixed(1)}% | ${jobStat.failure}/${jobStat.total} |\n`;
+                }
+              }
+            }
+
+            if (!hasRegressions) {
+              console.log('No high failure rates found');
+              return;
+            }
+
+            regressionSummary += `\n[View detailed monitor results](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})`;
+
+            // Post as a comment on a tracking issue or create a new one
+            // For now, just log it (you can modify to create/update issues)
+            console.log(regressionSummary);
+            core.setOutput('regression_summary', regressionSummary);
diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 8f7dc7e2d247..08d894ed24a3 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -108,6 +108,7 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                 "nightly-test-8-gpu-h200",
                 "nightly-test-8-gpu-h20",
                 "nightly-test-4-gpu-b200",
+                "nightly-test-8-gpu-b200",
             ],
             "integration": [
                 "run-all-notebooks",
@@ -199,6 +200,7 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                     "nightly-test-8-gpu-h200",
                     "nightly-test-8-gpu-h20",
                     "nightly-test-4-gpu-b200",
+                    "nightly-test-8-gpu-b200",
                 ]
 
                 if job_name in target_jobs:
diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
new file mode 100644
index 000000000000..7e6a2e0d6c6a
--- /dev/null
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -0,0 +1,357 @@
+#!/usr/bin/env python3
+"""
+Nightly Test Monitor
+
+Monitors nightly test runs for performance and accuracy regressions.
+Analyzes metrics from GitHub summaries and tracks trends over time.
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from collections import defaultdict
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional
+
+import requests
+
+
+class NightlyTestMonitor:
+    def __init__(self, token: str):
+        self.token = token
+        self.base_url = "https://api.github.com"
+        self.repo = "sgl-project/sglang"
+        self.headers = {
+            "Authorization": f"token {token}",
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "SGLang-Nightly-Monitor/1.0",
+        }
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+
+        # Nightly test jobs to monitor
+        self.nightly_jobs = [
+            "nightly-test-eval-text-models",
+            "nightly-test-perf-text-models",
+            "nightly-test-eval-vlms",
+            "nightly-test-perf-vlms",
+            "nightly-test-1-gpu",
+            "nightly-test-4-gpu",
+            "nightly-test-8-gpu-h200",
+            "nightly-test-8-gpu-h20",
+            "nightly-test-4-gpu-b200",
+            "nightly-test-8-gpu-b200",
+        ]
+
+    def get_nightly_runs(self, days: int = 7) -> List[Dict]:
+        """Get nightly test workflow runs from the last N days"""
+        print(f"Fetching nightly test runs from the last {days} days...")
+
+        since_date = (datetime.now() - timedelta(days=days)).isoformat()
+
+        all_runs = []
+        page = 1
+        per_page = 100
+
+        while True:
+            url = f"{self.base_url}/repos/{self.repo}/actions/runs"
+            params = {
+                "workflow_id": "nightly-test.yml",
+                "per_page": per_page,
+                "page": page,
+                "created": f">={since_date}",
+            }
+
+            try:
+                response = self.session.get(url, params=params)
+                response.raise_for_status()
+                data = response.json()
+
+                if not data.get("workflow_runs"):
+                    break
+
+                runs = data["workflow_runs"]
+                all_runs.extend(runs)
+                print(f"Fetched {len(all_runs)} nightly runs so far...")
+
+                if len(runs) < per_page:
+                    break
+
+                page += 1
+                time.sleep(0.1)
+
+            except requests.exceptions.RequestException as e:
+                print(f"Error fetching nightly test data: {e}")
+                break
+
+        print(f"Total nightly runs fetched: {len(all_runs)}")
+        return all_runs
+
+    def get_job_details(self, run_id: int) -> List[Dict]:
+        """Get job details for a specific run"""
+        url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs"
+        try:
+            response = self.session.get(url)
+            response.raise_for_status()
+            return response.json().get("jobs", [])
+        except:
+            return []
+
+    def parse_metrics_from_summary(self, run_id: int, job_id: int) -> List[Dict]:
+        """
+        Parse metrics from GitHub step summary.
+        This would ideally download the summary artifact and parse JSON metrics.
+        For now, we'll track basic job success/failure and timing.
+        """
+        # TODO: Implement actual metric parsing from step summary artifacts
+        # This would use the MetricReport JSON format we set up
+        return []
+
+    def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
+        """Analyze nightly test runs for failures and performance"""
+        print("Analyzing nightly test data...")
+
+        stats = {
+            "total_runs": len(runs),
+            "successful_runs": 0,
+            "failed_runs": 0,
+            "cancelled_runs": 0,
+            "job_stats": defaultdict(lambda: {
+                "total": 0,
+                "success": 0,
+                "failure": 0,
+                "recent_failures": [],
+                "avg_duration_minutes": 0,
+                "durations": [],
+            }),
+            "daily_stats": defaultdict(lambda: {
+                "total": 0,
+                "success": 0,
+                "failure": 0,
+            }),
+        }
+
+        for i, run in enumerate(runs, 1):
+            if i % 10 == 0:
+                print(f"Processed {i}/{len(runs)} runs...")
+
+            run_status = run.get("conclusion", "unknown")
+            run_id = run.get("id")
+            run_number = run.get("run_number")
+            created_at = run.get("created_at")
+            run_url = f"https://github.com/{self.repo}/actions/runs/{run_id}"
+
+            # Track daily stats
+            date_str = created_at.split("T")[0] if created_at else "unknown"
+            stats["daily_stats"][date_str]["total"] += 1
+
+            if run_status == "success":
+                stats["successful_runs"] += 1
+                stats["daily_stats"][date_str]["success"] += 1
+            elif run_status == "failure":
+                stats["failed_runs"] += 1
+                stats["daily_stats"][date_str]["failure"] += 1
+            elif run_status == "cancelled":
+                stats["cancelled_runs"] += 1
+
+            # Analyze individual jobs
+            jobs = self.get_job_details(run_id)
+            for job in jobs:
+                job_name = job.get("name", "Unknown")
+                job_conclusion = job.get("conclusion", "unknown")
+                started_at = job.get("started_at")
+                completed_at = job.get("completed_at")
+
+                # Only track our nightly test jobs
+                if job_name not in self.nightly_jobs:
+                    continue
+
+                job_stat = stats["job_stats"][job_name]
+                job_stat["total"] += 1
+
+                if job_conclusion == "success":
+                    job_stat["success"] += 1
+                elif job_conclusion == "failure":
+                    job_stat["failure"] += 1
+
+                    # Store recent failures (up to 5)
+                    if len(job_stat["recent_failures"]) < 5:
+                        job_stat["recent_failures"].append({
+                            "run_url": run_url,
+                            "run_number": run_number,
+                            "created_at": created_at,
+                            "job_url": job.get("html_url"),
+                        })
+
+                # Track duration
+                if started_at and completed_at:
+                    try:
+                        start = datetime.fromisoformat(started_at.replace("Z", "+00:00"))
+                        end = datetime.fromisoformat(completed_at.replace("Z", "+00:00"))
+                        duration_minutes = (end - start).total_seconds() / 60
+                        job_stat["durations"].append(duration_minutes)
+                    except:
+                        pass
+
+            time.sleep(0.1)
+
+        # Calculate average durations
+        for job_name, job_stat in stats["job_stats"].items():
+            if job_stat["durations"]:
+                job_stat["avg_duration_minutes"] = sum(job_stat["durations"]) / len(job_stat["durations"])
+                del job_stat["durations"]  # Remove raw data to reduce size
+
+        return stats
+
+    def generate_report(self, stats: Dict, output_file: str = None):
+        """Generate a human-readable report"""
+        print("\n" + "=" * 80)
+        print("NIGHTLY TEST MONITOR REPORT")
+        print("=" * 80)
+        print(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        print(f"Total Runs Analyzed: {stats['total_runs']}")
+        print(f"Successful: {stats['successful_runs']} ({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)")
+        print(f"Failed: {stats['failed_runs']} ({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)")
+        print(f"Cancelled: {stats['cancelled_runs']}")
+        print("=" * 80)
+
+        # Daily trend
+        print("\nDAILY TRENDS:")
+        print("-" * 80)
+        daily_stats = sorted(stats["daily_stats"].items(), reverse=True)[:7]
+        for date, day_stats in daily_stats:
+            success_rate = (day_stats["success"] / max(1, day_stats["total"])) * 100
+            print(f"{date}: {day_stats['total']} runs, {day_stats['success']} success ({success_rate:.1f}%), {day_stats['failure']} failed")
+
+        # Job statistics
+        print("\nJOB STATISTICS:")
+        print("-" * 80)
+        print(f"{'Job Name':<40} {'Total':<8} {'Success':<8} {'Failed':<8} {'Rate':<8} {'Avg Duration'}")
+        print("-" * 80)
+
+        job_stats_sorted = sorted(
+            stats["job_stats"].items(),
+            key=lambda x: x[1]["failure"],
+            reverse=True
+        )
+
+        for job_name, job_stat in job_stats_sorted:
+            total = job_stat["total"]
+            success = job_stat["success"]
+            failure = job_stat["failure"]
+            success_rate = (success / max(1, total)) * 100
+            avg_duration = job_stat["avg_duration_minutes"]
+
+            print(f"{job_name:<40} {total:<8} {success:<8} {failure:<8} {success_rate:>6.1f}% {avg_duration:>7.1f}m")
+
+            # Show recent failures
+            if job_stat["recent_failures"]:
+                print(f"  Recent failures:")
+                for failure in job_stat["recent_failures"][:3]:
+                    print(f"    - Run #{failure['run_number']}: {failure['run_url']}")
+
+        print("=" * 80)
+
+        # Save to file if requested
+        if output_file:
+            with open(output_file, "w") as f:
+                json.dump(stats, f, indent=2, default=str)
+            print(f"\nDetailed stats saved to: {output_file}")
+
+    def detect_regressions(self, stats: Dict) -> List[Dict]:
+        """Detect potential regressions in nightly tests"""
+        regressions = []
+
+        for job_name, job_stat in stats["job_stats"].items():
+            total = job_stat["total"]
+            failure = job_stat["failure"]
+
+            if total > 0:
+                failure_rate = (failure / total) * 100
+
+                # Flag jobs with high failure rates
+                if failure_rate > 30:
+                    regressions.append({
+                        "job_name": job_name,
+                        "type": "high_failure_rate",
+                        "failure_rate": failure_rate,
+                        "total_runs": total,
+                        "failures": failure,
+                    })
+
+                # Flag jobs with recent consecutive failures
+                recent_failures = len(job_stat["recent_failures"])
+                if recent_failures >= 3:
+                    regressions.append({
+                        "job_name": job_name,
+                        "type": "consecutive_failures",
+                        "recent_failure_count": recent_failures,
+                    })
+
+        if regressions:
+            print("\n" + "⚠" * 40)
+            print("POTENTIAL REGRESSIONS DETECTED:")
+            print("⚠" * 40)
+            for regression in regressions:
+                print(f"\nJob: {regression['job_name']}")
+                if regression["type"] == "high_failure_rate":
+                    print(f"  High failure rate: {regression['failure_rate']:.1f}% ({regression['failures']}/{regression['total_runs']})")
+                elif regression["type"] == "consecutive_failures":
+                    print(f"  {regression['recent_failure_count']} recent consecutive failures")
+            print("⚠" * 40)
+
+        return regressions
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Monitor nightly test runs for regressions"
+    )
+    parser.add_argument(
+        "--token",
+        required=True,
+        help="GitHub personal access token"
+    )
+    parser.add_argument(
+        "--days",
+        type=int,
+        default=7,
+        help="Number of days to analyze (default: 7)"
+    )
+    parser.add_argument(
+        "--output",
+        help="Output file for detailed stats (JSON)"
+    )
+
+    args = parser.parse_args()
+
+    monitor = NightlyTestMonitor(args.token)
+
+    # Get nightly runs
+    runs = monitor.get_nightly_runs(days=args.days)
+
+    if not runs:
+        print("No nightly test runs found in the specified time period.")
+        sys.exit(1)
+
+    # Analyze runs
+    stats = monitor.analyze_nightly_tests(runs)
+
+    # Generate report
+    monitor.generate_report(stats, args.output)
+
+    # Detect regressions
+    regressions = monitor.detect_regressions(stats)
+
+    # Exit with error code if regressions detected
+    if regressions:
+        sys.exit(1)
+    else:
+        print("\n✓ No significant regressions detected")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()

From 10f05cf6abb03643acddb416399916149937c0b7 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Mon, 10 Nov 2025 19:26:47 -0800
Subject: [PATCH 02/31] Fix lint errors in nightly_monitor.py

---
 scripts/ci_monitor/nightly_monitor.py | 53 +++++++++++++--------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 7e6a2e0d6c6a..45ac97dad46b 100644
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -273,22 +273,26 @@ def detect_regressions(self, stats: Dict) -> List[Dict]:
 
                 # Flag jobs with high failure rates
                 if failure_rate > 30:
-                    regressions.append({
-                        "job_name": job_name,
-                        "type": "high_failure_rate",
-                        "failure_rate": failure_rate,
-                        "total_runs": total,
-                        "failures": failure,
-                    })
+                    regressions.append(
+                        {
+                            "job_name": job_name,
+                            "type": "high_failure_rate",
+                            "failure_rate": failure_rate,
+                            "total_runs": total,
+                            "failures": failure,
+                        }
+                    )
 
                 # Flag jobs with recent consecutive failures
                 recent_failures = len(job_stat["recent_failures"])
                 if recent_failures >= 3:
-                    regressions.append({
-                        "job_name": job_name,
-                        "type": "consecutive_failures",
-                        "recent_failure_count": recent_failures,
-                    })
+                    regressions.append(
+                        {
+                            "job_name": job_name,
+                            "type": "consecutive_failures",
+                            "recent_failure_count": recent_failures,
+                        }
+                    )
 
         if regressions:
             print("\n" + "⚠" * 40)
@@ -297,9 +301,14 @@ def detect_regressions(self, stats: Dict) -> List[Dict]:
             for regression in regressions:
                 print(f"\nJob: {regression['job_name']}")
                 if regression["type"] == "high_failure_rate":
-                    print(f"  High failure rate: {regression['failure_rate']:.1f}% ({regression['failures']}/{regression['total_runs']})")
+                    print(
+                        f"  High failure rate: {regression['failure_rate']:.1f}% "
+                        f"({regression['failures']}/{regression['total_runs']})"
+                    )
                 elif regression["type"] == "consecutive_failures":
-                    print(f"  {regression['recent_failure_count']} recent consecutive failures")
+                    print(
+                        f"  {regression['recent_failure_count']} recent consecutive failures"
+                    )
             print("⚠" * 40)
 
         return regressions
@@ -309,21 +318,11 @@ def main():
     parser = argparse.ArgumentParser(
         description="Monitor nightly test runs for regressions"
     )
+    parser.add_argument("--token", required=True, help="GitHub personal access token")
     parser.add_argument(
-        "--token",
-        required=True,
-        help="GitHub personal access token"
-    )
-    parser.add_argument(
-        "--days",
-        type=int,
-        default=7,
-        help="Number of days to analyze (default: 7)"
-    )
-    parser.add_argument(
-        "--output",
-        help="Output file for detailed stats (JSON)"
+        "--days", type=int, default=7, help="Number of days to analyze (default: 7)"
     )
+    parser.add_argument("--output", help="Output file for detailed stats (JSON)")
 
     args = parser.parse_args()
 

From ba5f85986e9f0a87026afba451891b7823b2e81b Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Mon, 10 Nov 2025 19:59:52 -0800
Subject: [PATCH 03/31] Fix additional lint errors (line length)

---
 scripts/ci_monitor/nightly_monitor.py | 37 +++++++++++++++++++--------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 45ac97dad46b..ee0e666fb2b1 100644
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -188,7 +188,9 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
                 # Track duration
                 if started_at and completed_at:
                     try:
-                        start = datetime.fromisoformat(started_at.replace("Z", "+00:00"))
+                        start = datetime.fromisoformat(
+                            started_at.replace("Z", "+00:00")
+                        )
                         end = datetime.fromisoformat(completed_at.replace("Z", "+00:00"))
                         duration_minutes = (end - start).total_seconds() / 60
                         job_stat["durations"].append(duration_minutes)
@@ -200,7 +202,9 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
         # Calculate average durations
         for job_name, job_stat in stats["job_stats"].items():
             if job_stat["durations"]:
-                job_stat["avg_duration_minutes"] = sum(job_stat["durations"]) / len(job_stat["durations"])
+                job_stat["avg_duration_minutes"] = sum(job_stat["durations"]) / len(
+                    job_stat["durations"]
+                )
                 del job_stat["durations"]  # Remove raw data to reduce size
 
         return stats
@@ -212,8 +216,14 @@ def generate_report(self, stats: Dict, output_file: str = None):
         print("=" * 80)
         print(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
         print(f"Total Runs Analyzed: {stats['total_runs']}")
-        print(f"Successful: {stats['successful_runs']} ({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)")
-        print(f"Failed: {stats['failed_runs']} ({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)")
+        print(
+            f"Successful: {stats['successful_runs']} "
+            f"({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)"
+        )
+        print(
+            f"Failed: {stats['failed_runs']} "
+            f"({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)"
+        )
         print(f"Cancelled: {stats['cancelled_runs']}")
         print("=" * 80)
 
@@ -223,18 +233,22 @@ def generate_report(self, stats: Dict, output_file: str = None):
         daily_stats = sorted(stats["daily_stats"].items(), reverse=True)[:7]
         for date, day_stats in daily_stats:
             success_rate = (day_stats["success"] / max(1, day_stats["total"])) * 100
-            print(f"{date}: {day_stats['total']} runs, {day_stats['success']} success ({success_rate:.1f}%), {day_stats['failure']} failed")
+            print(
+                f"{date}: {day_stats['total']} runs, {day_stats['success']} success "
+                f"({success_rate:.1f}%), {day_stats['failure']} failed"
+            )
 
         # Job statistics
         print("\nJOB STATISTICS:")
         print("-" * 80)
-        print(f"{'Job Name':<40} {'Total':<8} {'Success':<8} {'Failed':<8} {'Rate':<8} {'Avg Duration'}")
+        print(
+            f"{'Job Name':<40} {'Total':<8} {'Success':<8} {'Failed':<8} "
+            f"{'Rate':<8} {'Avg Duration'}"
+        )
         print("-" * 80)
 
         job_stats_sorted = sorted(
-            stats["job_stats"].items(),
-            key=lambda x: x[1]["failure"],
-            reverse=True
+            stats["job_stats"].items(), key=lambda x: x[1]["failure"], reverse=True
         )
 
         for job_name, job_stat in job_stats_sorted:
@@ -244,7 +258,10 @@ def generate_report(self, stats: Dict, output_file: str = None):
             success_rate = (success / max(1, total)) * 100
             avg_duration = job_stat["avg_duration_minutes"]
 
-            print(f"{job_name:<40} {total:<8} {success:<8} {failure:<8} {success_rate:>6.1f}% {avg_duration:>7.1f}m")
+            print(
+                f"{job_name:<40} {total:<8} {success:<8} {failure:<8} "
+                f"{success_rate:>6.1f}% {avg_duration:>7.1f}m"
+            )
 
             # Show recent failures
             if job_stat["recent_failures"]:

From 4f564b570d358ae2e30f4d5639ac58ef3179ff20 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Mon, 10 Nov 2025 22:35:49 -0800
Subject: [PATCH 04/31] Fix remaining lint formatting issues

---
 scripts/ci_monitor/nightly_monitor.py | 48 ++++++++++++++++-----------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index ee0e666fb2b1..ec4b9f7b78a0 100644
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -118,19 +118,23 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
             "successful_runs": 0,
             "failed_runs": 0,
             "cancelled_runs": 0,
-            "job_stats": defaultdict(lambda: {
-                "total": 0,
-                "success": 0,
-                "failure": 0,
-                "recent_failures": [],
-                "avg_duration_minutes": 0,
-                "durations": [],
-            }),
-            "daily_stats": defaultdict(lambda: {
-                "total": 0,
-                "success": 0,
-                "failure": 0,
-            }),
+            "job_stats": defaultdict(
+                lambda: {
+                    "total": 0,
+                    "success": 0,
+                    "failure": 0,
+                    "recent_failures": [],
+                    "avg_duration_minutes": 0,
+                    "durations": [],
+                }
+            ),
+            "daily_stats": defaultdict(
+                lambda: {
+                    "total": 0,
+                    "success": 0,
+                    "failure": 0,
+                }
+            ),
         }
 
         for i, run in enumerate(runs, 1):
@@ -178,12 +182,14 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
 
                     # Store recent failures (up to 5)
                     if len(job_stat["recent_failures"]) < 5:
-                        job_stat["recent_failures"].append({
-                            "run_url": run_url,
-                            "run_number": run_number,
-                            "created_at": created_at,
-                            "job_url": job.get("html_url"),
-                        })
+                        job_stat["recent_failures"].append(
+                            {
+                                "run_url": run_url,
+                                "run_number": run_number,
+                                "created_at": created_at,
+                                "job_url": job.get("html_url"),
+                            }
+                        )
 
                 # Track duration
                 if started_at and completed_at:
@@ -191,7 +197,9 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
                         start = datetime.fromisoformat(
                             started_at.replace("Z", "+00:00")
                         )
-                        end = datetime.fromisoformat(completed_at.replace("Z", "+00:00"))
+                        end = datetime.fromisoformat(
+                            completed_at.replace("Z", "+00:00")
+                        )
                         duration_minutes = (end - start).total_seconds() / 60
                         job_stat["durations"].append(duration_minutes)
                     except:

From 17f9b1d94ab02b4b535abc2502db208375bcddea Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Mon, 10 Nov 2025 23:17:57 -0800
Subject: [PATCH 05/31] Make nightly_monitor.py executable

---
 scripts/ci_monitor/nightly_monitor.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/ci_monitor/nightly_monitor.py

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
old mode 100644
new mode 100755

From c5a62b13a7bf107ec2f6bb70b86c39f8e3b106ad Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 15:29:46 -0800
Subject: [PATCH 06/31] Add performance metric tracking to nightly monitor
 (Step 1)

- Add regex patterns for parsing performance metrics from logs
- Add get_job_logs() method to fetch job logs from GitHub API
- Add parse_metrics_from_logs() to extract metrics using regex
- Track performance metrics (throughput, latency, TTFT, accept length,
  accuracy, GSM8K score) with timestamps and run IDs
- Display average metrics in report output
- Only fetch metrics from successful perf/eval jobs

This is Step 1 of incremental enhancement to add day-to-day performance
comparison and anomaly detection.
---
 scripts/ci_monitor/nightly_monitor.py | 94 ++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 8 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index ec4b9f7b78a0..7d087fa6a69c 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -9,11 +9,12 @@
 import argparse
 import json
 import os
+import re
 import sys
 import time
 from collections import defaultdict
 from datetime import datetime, timedelta
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import requests
 
@@ -45,6 +46,21 @@ def __init__(self, token: str):
             "nightly-test-8-gpu-b200",
         ]
 
+        # Performance metric patterns for parsing logs
+        self.perf_patterns = {
+            "output_throughput": re.compile(
+                r"Output token throughput \(tok/s\):\s*([\d.]+)"
+            ),
+            "input_throughput": re.compile(
+                r"Input token throughput \(tok/s\):\s*([\d.]+)"
+            ),
+            "latency": re.compile(r"Median E2E Latency \(ms\):\s*([\d.]+)"),
+            "ttft": re.compile(r"Median TTFT \(ms\):\s*([\d.]+)"),
+            "accept_length": re.compile(r"Accept length:\s*([\d.]+)"),
+            "accuracy": re.compile(r"Accuracy:\s*([\d.]+)"),
+            "gsm8k_score": re.compile(r"GSM8K Score:\s*([\d.]+)"),
+        }
+
     def get_nightly_runs(self, days: int = 7) -> List[Dict]:
         """Get nightly test workflow runs from the last N days"""
         print(f"Fetching nightly test runs from the last {days} days...")
@@ -99,15 +115,45 @@ def get_job_details(self, run_id: int) -> List[Dict]:
         except:
             return []
 
-    def parse_metrics_from_summary(self, run_id: int, job_id: int) -> List[Dict]:
+    def get_job_logs(self, job_id: int) -> Optional[str]:
+        """Get logs for a specific job"""
+        url = f"{self.base_url}/repos/{self.repo}/actions/jobs/{job_id}/logs"
+        try:
+            response = self.session.get(url)
+            response.raise_for_status()
+            return response.text
+        except requests.exceptions.RequestException as e:
+            print(f"  Warning: Could not fetch logs for job {job_id}: {e}")
+            return None
+
+    def parse_metrics_from_logs(self, logs: str, job_name: str) -> Dict[str, List[float]]:
         """
-        Parse metrics from GitHub step summary.
-        This would ideally download the summary artifact and parse JSON metrics.
-        For now, we'll track basic job success/failure and timing.
+        Parse performance metrics from job logs.
+
+        Args:
+            logs: Raw log text from the job
+            job_name: Name of the job (to determine which metrics to look for)
+
+        Returns:
+            Dictionary mapping metric names to lists of values found
         """
-        # TODO: Implement actual metric parsing from step summary artifacts
-        # This would use the MetricReport JSON format we set up
-        return []
+        metrics = defaultdict(list)
+
+        if not logs:
+            return metrics
+
+        # Parse each line for matching patterns
+        for line in logs.split("\n"):
+            for metric_name, pattern in self.perf_patterns.items():
+                match = pattern.search(line)
+                if match:
+                    try:
+                        value = float(match.group(1))
+                        metrics[metric_name].append(value)
+                    except (ValueError, IndexError):
+                        continue
+
+        return dict(metrics)
 
     def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
         """Analyze nightly test runs for failures and performance"""
@@ -126,6 +172,7 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
                     "recent_failures": [],
                     "avg_duration_minutes": 0,
                     "durations": [],
+                    "performance_metrics": defaultdict(list),  # New: track perf metrics
                 }
             ),
             "daily_stats": defaultdict(
@@ -165,6 +212,7 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
             for job in jobs:
                 job_name = job.get("name", "Unknown")
                 job_conclusion = job.get("conclusion", "unknown")
+                job_id = job.get("id")
                 started_at = job.get("started_at")
                 completed_at = job.get("completed_at")
 
@@ -177,6 +225,26 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
 
                 if job_conclusion == "success":
                     job_stat["success"] += 1
+
+                    # For successful performance jobs, fetch metrics
+                    if "perf" in job_name.lower() or "eval" in job_name.lower():
+                        logs = self.get_job_logs(job_id)
+                        if logs:
+                            metrics = self.parse_metrics_from_logs(logs, job_name)
+                            # Store metrics with timestamp
+                            for metric_name, values in metrics.items():
+                                if values:  # Only store if we found values
+                                    job_stat["performance_metrics"][metric_name].extend(
+                                        [
+                                            {
+                                                "value": v,
+                                                "timestamp": created_at,
+                                                "run_id": run_id,
+                                            }
+                                            for v in values
+                                        ]
+                                    )
+
                 elif job_conclusion == "failure":
                     job_stat["failure"] += 1
 
@@ -271,6 +339,16 @@ def generate_report(self, stats: Dict, output_file: str = None):
                 f"{success_rate:>6.1f}% {avg_duration:>7.1f}m"
             )
 
+            # Show performance metrics if available
+            if job_stat.get("performance_metrics"):
+                perf_metrics = job_stat["performance_metrics"]
+                print(f"  Performance metrics collected:")
+                for metric_name, metric_data in perf_metrics.items():
+                    if metric_data:
+                        values = [m["value"] for m in metric_data]
+                        avg_value = sum(values) / len(values)
+                        print(f"    - {metric_name}: {avg_value:.2f} (avg, n={len(values)})")
+
             # Show recent failures
             if job_stat["recent_failures"]:
                 print(f"  Recent failures:")

From a0eb43487c12c9c944592440c2076ffc20155bfb Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 15:33:31 -0800
Subject: [PATCH 07/31] Add historical data fetching from
 sglang-bot/sglang-ci-data (Step 2)

- Add data_repo and data_branch configuration for sglang-bot/sglang-ci-data
- Add get_historical_data_paths() to list available historical data files
- Add fetch_historical_data() to fetch and decode specific data files from repo
- Add get_recent_historical_metrics() to retrieve metrics for a job over time
- Fetch up to 14 recent historical files to compare with current metrics
- Handle base64 decoding and JSON parsing with error handling

This enables day-to-day metric comparison in the next step.
---
 scripts/ci_monitor/nightly_monitor.py | 102 ++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 7d087fa6a69c..a18b2ae2e1b0 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -7,6 +7,7 @@
 """
 
 import argparse
+import base64
 import json
 import os
 import re
@@ -61,6 +62,10 @@ def __init__(self, token: str):
             "gsm8k_score": re.compile(r"GSM8K Score:\s*([\d.]+)"),
         }
 
+        # Historical data repository
+        self.data_repo = "sglang-bot/sglang-ci-data"
+        self.data_branch = "main"
+
     def get_nightly_runs(self, days: int = 7) -> List[Dict]:
         """Get nightly test workflow runs from the last N days"""
         print(f"Fetching nightly test runs from the last {days} days...")
@@ -155,6 +160,103 @@ def parse_metrics_from_logs(self, logs: str, job_name: str) -> Dict[str, List[fl
 
         return dict(metrics)
 
+    def get_historical_data_paths(self) -> List[str]:
+        """
+        Get list of available nightly monitor data files from the data repository.
+
+        Returns:
+            List of file paths in the repository
+        """
+        url = f"{self.base_url}/repos/{self.data_repo}/contents/nightly_monitor"
+        try:
+            response = self.session.get(url)
+            response.raise_for_status()
+            contents = response.json()
+
+            # Filter for JSON files
+            json_files = [
+                item["path"]
+                for item in contents
+                if item["type"] == "file" and item["name"].endswith(".json")
+            ]
+            return sorted(json_files, reverse=True)  # Most recent first
+        except requests.exceptions.RequestException as e:
+            print(f"Warning: Could not fetch historical data paths: {e}")
+            return []
+
+    def fetch_historical_data(self, file_path: str) -> Optional[Dict]:
+        """
+        Fetch a specific historical data file from the repository.
+
+        Args:
+            file_path: Path to the file in the repository
+
+        Returns:
+            Dictionary with historical data, or None if fetch failed
+        """
+        url = f"{self.base_url}/repos/{self.data_repo}/contents/{file_path}"
+        try:
+            response = self.session.get(url)
+            response.raise_for_status()
+            data = response.json()
+
+            # Decode base64 content
+            content = base64.b64decode(data["content"]).decode("utf-8")
+            return json.loads(content)
+        except (requests.exceptions.RequestException, json.JSONDecodeError, KeyError) as e:
+            print(f"Warning: Could not fetch historical data from {file_path}: {e}")
+            return None
+
+    def get_recent_historical_metrics(
+        self, job_name: str, metric_name: str, days: int = 7
+    ) -> List[Dict]:
+        """
+        Get recent historical metrics for a specific job and metric.
+
+        Args:
+            job_name: Name of the job
+            metric_name: Name of the metric (e.g., 'output_throughput')
+            days: Number of days to look back
+
+        Returns:
+            List of metric data points with timestamps
+        """
+        print(
+            f"  Fetching historical {metric_name} data for {job_name} (last {days} days)..."
+        )
+
+        historical_paths = self.get_historical_data_paths()
+        if not historical_paths:
+            return []
+
+        cutoff_date = datetime.now() - timedelta(days=days)
+        historical_metrics = []
+
+        # Fetch recent files (limit to avoid too many API calls)
+        for file_path in historical_paths[:min(days * 2, 14)]:  # Max 14 files
+            historical_data = self.fetch_historical_data(file_path)
+            if not historical_data:
+                continue
+
+            # Check if this file has data for our job
+            job_stats = historical_data.get("job_stats", {}).get(job_name, {})
+            if not job_stats:
+                continue
+
+            # Extract metrics
+            perf_metrics = job_stats.get("performance_metrics", {}).get(metric_name, [])
+            for metric_entry in perf_metrics:
+                try:
+                    timestamp = datetime.fromisoformat(
+                        metric_entry["timestamp"].replace("Z", "+00:00")
+                    ).replace(tzinfo=None)
+                    if timestamp >= cutoff_date:
+                        historical_metrics.append(metric_entry)
+                except (ValueError, KeyError):
+                    continue
+
+        return sorted(historical_metrics, key=lambda x: x["timestamp"])
+
     def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
         """Analyze nightly test runs for failures and performance"""
         print("Analyzing nightly test data...")

From 0eb18b4801e68c9bf705f94d5a7d0a00a5edfad7 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 15:38:36 -0800
Subject: [PATCH 08/31] Apply black formatting to nightly_monitor.py

---
 scripts/ci_monitor/nightly_monitor.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index a18b2ae2e1b0..15950e7ea653 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -131,7 +131,9 @@ def get_job_logs(self, job_id: int) -> Optional[str]:
             print(f"  Warning: Could not fetch logs for job {job_id}: {e}")
             return None
 
-    def parse_metrics_from_logs(self, logs: str, job_name: str) -> Dict[str, List[float]]:
+    def parse_metrics_from_logs(
+        self, logs: str, job_name: str
+    ) -> Dict[str, List[float]]:
         """
         Parse performance metrics from job logs.
 
@@ -203,7 +205,11 @@ def fetch_historical_data(self, file_path: str) -> Optional[Dict]:
             # Decode base64 content
             content = base64.b64decode(data["content"]).decode("utf-8")
             return json.loads(content)
-        except (requests.exceptions.RequestException, json.JSONDecodeError, KeyError) as e:
+        except (
+            requests.exceptions.RequestException,
+            json.JSONDecodeError,
+            KeyError,
+        ) as e:
             print(f"Warning: Could not fetch historical data from {file_path}: {e}")
             return None
 
@@ -233,7 +239,7 @@ def get_recent_historical_metrics(
         historical_metrics = []
 
         # Fetch recent files (limit to avoid too many API calls)
-        for file_path in historical_paths[:min(days * 2, 14)]:  # Max 14 files
+        for file_path in historical_paths[: min(days * 2, 14)]:  # Max 14 files
             historical_data = self.fetch_historical_data(file_path)
             if not historical_data:
                 continue
@@ -449,7 +455,9 @@ def generate_report(self, stats: Dict, output_file: str = None):
                     if metric_data:
                         values = [m["value"] for m in metric_data]
                         avg_value = sum(values) / len(values)
-                        print(f"    - {metric_name}: {avg_value:.2f} (avg, n={len(values)})")
+                        print(
+                            f"    - {metric_name}: {avg_value:.2f} (avg, n={len(values)})"
+                        )
 
             # Show recent failures
             if job_stat["recent_failures"]:

From 8c1e19ba4759de38b0add46a4c7436f79c0bb40d Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 15:46:49 -0800
Subject: [PATCH 09/31] Add day-to-day metric comparison (Step 3)

- Compare current metrics with 7-day historical average
- Calculate percentage changes and classify as stable (<5%), minor
  (5-10%), or significant (>=10%)
- Display metrics with change indicators and percentages in report
---
 scripts/ci_monitor/nightly_monitor.py | 103 ++++++++++++++++++++++++--
 1 file changed, 97 insertions(+), 6 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 15950e7ea653..18e99c339e45 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -263,6 +263,76 @@ def get_recent_historical_metrics(
 
         return sorted(historical_metrics, key=lambda x: x["timestamp"])
 
+    def compare_with_historical(
+        self, current_metrics: Dict[str, List[Dict]], days: int = 7
+    ) -> Dict[str, Dict]:
+        """
+        Compare current metrics with historical data to detect changes.
+
+        Args:
+            current_metrics: Dictionary of metric_name -> list of metric data points
+            days: Number of days to look back for comparison
+
+        Returns:
+            Dictionary with comparison results including percentage changes
+        """
+        comparisons = {}
+
+        for metric_name, current_data in current_metrics.items():
+            if not current_data:
+                continue
+
+            # Calculate current average
+            current_values = [d["value"] for d in current_data]
+            current_avg = sum(current_values) / len(current_values)
+
+            # Get the job name from the first data point
+            # (assumes all data points are from the same job)
+            job_name = current_data[0].get("job_name", "unknown")
+
+            # Fetch historical data
+            historical_data = self.get_recent_historical_metrics(
+                job_name, metric_name, days
+            )
+
+            if not historical_data:
+                comparisons[metric_name] = {
+                    "current_avg": current_avg,
+                    "historical_avg": None,
+                    "percent_change": None,
+                    "status": "no_history",
+                }
+                continue
+
+            # Calculate historical average
+            historical_values = [d["value"] for d in historical_data]
+            historical_avg = sum(historical_values) / len(historical_values)
+
+            # Calculate percentage change
+            if historical_avg > 0:
+                percent_change = ((current_avg - historical_avg) / historical_avg) * 100
+            else:
+                percent_change = 0
+
+            # Determine status based on change
+            if abs(percent_change) < 5:
+                status = "stable"
+            elif abs(percent_change) < 10:
+                status = "minor_change"
+            else:
+                status = "significant_change"
+
+            comparisons[metric_name] = {
+                "current_avg": current_avg,
+                "historical_avg": historical_avg,
+                "percent_change": percent_change,
+                "status": status,
+                "current_count": len(current_values),
+                "historical_count": len(historical_values),
+            }
+
+        return comparisons
+
     def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
         """Analyze nightly test runs for failures and performance"""
         print("Analyzing nightly test data...")
@@ -339,7 +409,7 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
                         logs = self.get_job_logs(job_id)
                         if logs:
                             metrics = self.parse_metrics_from_logs(logs, job_name)
-                            # Store metrics with timestamp
+                            # Store metrics with timestamp and job name
                             for metric_name, values in metrics.items():
                                 if values:  # Only store if we found values
                                     job_stat["performance_metrics"][metric_name].extend(
@@ -348,6 +418,7 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
                                                 "value": v,
                                                 "timestamp": created_at,
                                                 "run_id": run_id,
+                                                "job_name": job_name,
                                             }
                                             for v in values
                                         ]
@@ -447,17 +518,37 @@ def generate_report(self, stats: Dict, output_file: str = None):
                 f"{success_rate:>6.1f}% {avg_duration:>7.1f}m"
             )
 
-            # Show performance metrics if available
+            # Show performance metrics with day-to-day comparison if available
             if job_stat.get("performance_metrics"):
                 perf_metrics = job_stat["performance_metrics"]
-                print(f"  Performance metrics collected:")
+                print(f"  Performance metrics:")
+
+                # Compare with historical data
+                comparisons = self.compare_with_historical(perf_metrics, days=7)
+
                 for metric_name, metric_data in perf_metrics.items():
                     if metric_data:
                         values = [m["value"] for m in metric_data]
                         avg_value = sum(values) / len(values)
-                        print(
-                            f"    - {metric_name}: {avg_value:.2f} (avg, n={len(values)})"
-                        )
+
+                        # Get comparison data
+                        comparison = comparisons.get(metric_name, {})
+                        percent_change = comparison.get("percent_change")
+
+                        if percent_change is not None:
+                            change_indicator = "📈" if percent_change > 0 else "📉"
+                            if abs(percent_change) < 1:
+                                change_indicator = "➡️"
+
+                            print(
+                                f"    - {metric_name}: {avg_value:.2f} "
+                                f"(n={len(values)}) {change_indicator} "
+                                f"{percent_change:+.1f}% vs 7d avg"
+                            )
+                        else:
+                            print(
+                                f"    - {metric_name}: {avg_value:.2f} (n={len(values)}) [no history]"
+                            )
 
             # Show recent failures
             if job_stat["recent_failures"]:

From f7031c2bab396901d8b27325910c4e0f46455d21 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 15:50:48 -0800
Subject: [PATCH 10/31] Add >10% anomaly detection (Step 4)

- Detect performance regressions exceeding 10% change
- Flag throughput decreases >10% and latency/TTFT increases >10%
- Add performance_regression type to regression reports
- Display regression details with current vs 7-day average
---
 scripts/ci_monitor/nightly_monitor.py | 51 ++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 18e99c339e45..18e417d890a0 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -598,10 +598,46 @@ def detect_regressions(self, stats: Dict) -> List[Dict]:
                         }
                     )
 
+            # Check for performance regressions >10%
+            if job_stat.get("performance_metrics"):
+                perf_metrics = job_stat["performance_metrics"]
+                comparisons = self.compare_with_historical(perf_metrics, days=7)
+
+                for metric_name, comparison in comparisons.items():
+                    percent_change = comparison.get("percent_change")
+                    if percent_change is None:
+                        continue
+
+                    # Flag performance regressions >10%
+                    # For throughput metrics, negative change is bad
+                    # For latency/ttft metrics, positive change is bad
+                    is_regression = False
+                    if "throughput" in metric_name.lower():
+                        if percent_change < -10:  # >10% decrease in throughput
+                            is_regression = True
+                    elif (
+                        "latency" in metric_name.lower()
+                        or "ttft" in metric_name.lower()
+                    ):
+                        if percent_change > 10:  # >10% increase in latency
+                            is_regression = True
+
+                    if is_regression:
+                        regressions.append(
+                            {
+                                "job_name": job_name,
+                                "type": "performance_regression",
+                                "metric_name": metric_name,
+                                "percent_change": percent_change,
+                                "current_avg": comparison["current_avg"],
+                                "historical_avg": comparison["historical_avg"],
+                            }
+                        )
+
         if regressions:
-            print("\n" + "⚠" * 40)
-            print("POTENTIAL REGRESSIONS DETECTED:")
-            print("⚠" * 40)
+            print("\n" + "=" * 80)
+            print("REGRESSIONS DETECTED:")
+            print("=" * 80)
             for regression in regressions:
                 print(f"\nJob: {regression['job_name']}")
                 if regression["type"] == "high_failure_rate":
@@ -613,7 +649,14 @@ def detect_regressions(self, stats: Dict) -> List[Dict]:
                     print(
                         f"  {regression['recent_failure_count']} recent consecutive failures"
                     )
-            print("⚠" * 40)
+                elif regression["type"] == "performance_regression":
+                    print(f"  Performance regression: {regression['metric_name']}")
+                    print(
+                        f"    Change: {regression['percent_change']:+.1f}% "
+                        f"(current: {regression['current_avg']:.2f}, "
+                        f"7d avg: {regression['historical_avg']:.2f})"
+                    )
+            print("=" * 80)
 
         return regressions
 

From 57722b060d4b4ead8551a75ca0044c423fa1878a Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 15:53:04 -0800
Subject: [PATCH 11/31] Add GitHub summary and artifact generation (Steps 5&6)

- Generate GitHub Actions step summary with regression alerts
- Display performance metrics table with percentage changes
- Highlight detected regressions in a dedicated "Regressions Detected" section
- The JSON artifact containing all tracked data is already created by the workflow
---
 scripts/ci_monitor/nightly_monitor.py | 60 +++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 18e417d890a0..5821891a18ff 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -464,6 +464,63 @@ def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
 
         return stats
 
+    def generate_github_summary(self, stats: Dict, regressions: List[Dict]):
+        """Generate GitHub Actions step summary"""
+        github_summary = os.environ.get("GITHUB_STEP_SUMMARY")
+        if not github_summary:
+            return
+
+        with open(github_summary, "a") as f:
+            f.write("# Nightly Test Monitor Report\n\n")
+            f.write(
+                f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+            )
+
+            # Summary stats
+            f.write("## Summary\n\n")
+            f.write(f"- Total Runs: {stats['total_runs']}\n")
+            f.write(
+                f"- Successful: {stats['successful_runs']} "
+                f"({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)\n"
+            )
+            f.write(
+                f"- Failed: {stats['failed_runs']} "
+                f"({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)\n\n"
+            )
+
+            # Regressions
+            if regressions:
+                f.write("## Regressions Detected\n\n")
+                for reg in regressions:
+                    if reg["type"] == "performance_regression":
+                        f.write(
+                            f"- **{reg['job_name']}**: {reg['metric_name']} "
+                            f"({reg['percent_change']:+.1f}%)\n"
+                        )
+
+            # Performance metrics table
+            f.write("\n## Performance Metrics\n\n")
+            f.write("| Job | Metric | Current | Change |\n")
+            f.write("|-----|--------|---------|--------|\n")
+
+            for job_name, job_stat in stats["job_stats"].items():
+                if job_stat.get("performance_metrics"):
+                    perf_metrics = job_stat["performance_metrics"]
+                    comparisons = self.compare_with_historical(perf_metrics, days=7)
+
+                    for metric_name, metric_data in perf_metrics.items():
+                        if metric_data:
+                            values = [m["value"] for m in metric_data]
+                            avg_value = sum(values) / len(values)
+                            comparison = comparisons.get(metric_name, {})
+                            percent_change = comparison.get("percent_change")
+
+                            if percent_change is not None:
+                                f.write(
+                                    f"| {job_name} | {metric_name} | {avg_value:.2f} | "
+                                    f"{percent_change:+.1f}% |\n"
+                                )
+
     def generate_report(self, stats: Dict, output_file: str = None):
         """Generate a human-readable report"""
         print("\n" + "=" * 80)
@@ -691,6 +748,9 @@ def main():
     # Detect regressions
     regressions = monitor.detect_regressions(stats)
 
+    # Generate GitHub Actions summary
+    monitor.generate_github_summary(stats, regressions)
+
     # Exit with error code if regressions detected
     if regressions:
         sys.exit(1)

From f3b2379f2f7ab8b03ee389bf00c0db519348d342 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 16:32:52 -0800
Subject: [PATCH 12/31] Add temporary test workflow for nightly monitor

---
 .github/workflows/test-nightly-monitor.yml | 57 ++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 .github/workflows/test-nightly-monitor.yml

diff --git a/.github/workflows/test-nightly-monitor.yml b/.github/workflows/test-nightly-monitor.yml
new file mode 100644
index 000000000000..2fba7eb785f5
--- /dev/null
+++ b/.github/workflows/test-nightly-monitor.yml
@@ -0,0 +1,57 @@
+name: Test Nightly Monitor
+
+on:
+  workflow_dispatch:
+    inputs:
+      days:
+        description: 'Number of days to analyze'
+        required: false
+        default: '7'
+        type: string
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'scripts/ci_monitor/nightly_monitor.py'
+      - '.github/workflows/test-nightly-monitor.yml'
+
+permissions:
+  contents: write
+  actions: read
+
+jobs:
+  test-monitor:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests
+
+      - name: Run Nightly Test Monitor
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          PYTHONUNBUFFERED: 1
+          PYTHONIOENCODING: utf-8
+        run: |
+          cd scripts/ci_monitor
+          python nightly_monitor.py \
+            --token $GITHUB_TOKEN \
+            --days ${{ inputs.days || '7' }} \
+            --output test_monitor_$(date +%Y%m%d_%H%M%S).json
+
+      - name: Upload Test Results
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-monitor-results-${{ github.run_number }}
+          path: |
+            scripts/ci_monitor/test_monitor_*.json
+          retention-days: 7

From c98bd6f52040a7bdf1d949bfe40b4b7e2d6e74ce Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 16:40:40 -0800
Subject: [PATCH 13/31] Trigger test workflow


From 45adfafc06ff82e2d30eb914e8693b9e95222297 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 17:04:40 -0800
Subject: [PATCH 14/31] Update CI monitors to support new nightly workflow
 structure

Support tracking of new hardware-specific nightly workflows (NVIDIA, AMD, Intel) alongside existing workflows.

Changes:
- Add new NVIDIA job names from nightly-test-nvidia.yml
- Add AMD job names from nightly-test-amd.yml
- Update nightly_monitor.py to fetch from multiple workflow files instead of just nightly-test.yml
- Maintain backward compatibility with old job names

This enables the CI monitor to track failures from all new workflows, making it safe to eventually disable the old nightly-test.yml and nightly-test-b200.yml workflows.
---
 scripts/ci_monitor/ci_analyzer.py     | 28 +++++++++
 scripts/ci_monitor/nightly_monitor.py | 85 ++++++++++++++++++---------
 2 files changed, 85 insertions(+), 28 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 08d894ed24a3..dc49324eb7c1 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -101,6 +101,7 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                 "per-commit-8-gpu-h20",
             ],
             "nightly": [
+                # Old job names (nightly-test.yml)
                 "nightly-test-perf-text-models",
                 "nightly-test-eval-text-models",
                 "nightly-test-1-gpu",
@@ -109,6 +110,19 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                 "nightly-test-8-gpu-h20",
                 "nightly-test-4-gpu-b200",
                 "nightly-test-8-gpu-b200",
+                # New NVIDIA job names (nightly-test-nvidia.yml)
+                "nightly-test-general-1-gpu-runner",
+                "nightly-test-general-4-gpu-h100",
+                "nightly-test-general-8-gpu-h200",
+                "nightly-test-general-8-gpu-h20",
+                "nightly-test-text-accuracy-2-gpu-runner",
+                "nightly-test-text-perf-2-gpu-runner",
+                "nightly-test-vlm-accuracy-2-gpu-runner",
+                "nightly-test-vlm-perf-2-gpu-runner",
+                "nightly-test-perf-4-gpu-b200",
+                "nightly-test-perf-8-gpu-b200",
+                # AMD job names (nightly-test-amd.yml)
+                "nightly-test",  # AMD uses this generic name with matrix
             ],
             "integration": [
                 "run-all-notebooks",
@@ -191,6 +205,7 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                     "unit-test-backend-4-gpu-b200",
                     "unit-test-backend-4-gpu-gb200",
                     "quantization-test",
+                    # Old nightly job names (nightly-test.yml)
                     "nightly-test-eval-text-models",
                     "nightly-test-perf-text-models",
                     "nightly-test-eval-vlms",
@@ -201,6 +216,19 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                     "nightly-test-8-gpu-h20",
                     "nightly-test-4-gpu-b200",
                     "nightly-test-8-gpu-b200",
+                    # New NVIDIA job names (nightly-test-nvidia.yml)
+                    "nightly-test-general-1-gpu-runner",
+                    "nightly-test-general-4-gpu-h100",
+                    "nightly-test-general-8-gpu-h200",
+                    "nightly-test-general-8-gpu-h20",
+                    "nightly-test-text-accuracy-2-gpu-runner",
+                    "nightly-test-text-perf-2-gpu-runner",
+                    "nightly-test-vlm-accuracy-2-gpu-runner",
+                    "nightly-test-vlm-perf-2-gpu-runner",
+                    "nightly-test-perf-4-gpu-b200",
+                    "nightly-test-perf-8-gpu-b200",
+                    # AMD job names (nightly-test-amd.yml)
+                    "nightly-test",
                 ]
 
                 if job_name in target_jobs:
diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
index 5821891a18ff..b60d18bbd0b9 100755
--- a/scripts/ci_monitor/nightly_monitor.py
+++ b/scripts/ci_monitor/nightly_monitor.py
@@ -35,6 +35,7 @@ def __init__(self, token: str):
 
         # Nightly test jobs to monitor
         self.nightly_jobs = [
+            # Old job names (nightly-test.yml)
             "nightly-test-eval-text-models",
             "nightly-test-perf-text-models",
             "nightly-test-eval-vlms",
@@ -45,6 +46,27 @@ def __init__(self, token: str):
             "nightly-test-8-gpu-h20",
             "nightly-test-4-gpu-b200",
             "nightly-test-8-gpu-b200",
+            # New NVIDIA job names (nightly-test-nvidia.yml)
+            "nightly-test-general-1-gpu-runner",
+            "nightly-test-general-4-gpu-h100",
+            "nightly-test-general-8-gpu-h200",
+            "nightly-test-general-8-gpu-h20",
+            "nightly-test-text-accuracy-2-gpu-runner",
+            "nightly-test-text-perf-2-gpu-runner",
+            "nightly-test-vlm-accuracy-2-gpu-runner",
+            "nightly-test-vlm-perf-2-gpu-runner",
+            "nightly-test-perf-4-gpu-b200",
+            "nightly-test-perf-8-gpu-b200",
+            # AMD job names (nightly-test-amd.yml)
+            "nightly-test",
+        ]
+
+        # Nightly workflow files to monitor
+        self.nightly_workflows = [
+            "nightly-test.yml",
+            "nightly-test-nvidia.yml",
+            "nightly-test-amd.yml",
+            "nightly-test-intel.yml",
         ]
 
         # Performance metric patterns for parsing logs
@@ -67,45 +89,52 @@ def __init__(self, token: str):
         self.data_branch = "main"
 
     def get_nightly_runs(self, days: int = 7) -> List[Dict]:
-        """Get nightly test workflow runs from the last N days"""
+        """Get nightly test workflow runs from the last N days from multiple workflows"""
         print(f"Fetching nightly test runs from the last {days} days...")
 
         since_date = (datetime.now() - timedelta(days=days)).isoformat()
 
         all_runs = []
-        page = 1
-        per_page = 100
-
-        while True:
-            url = f"{self.base_url}/repos/{self.repo}/actions/runs"
-            params = {
-                "workflow_id": "nightly-test.yml",
-                "per_page": per_page,
-                "page": page,
-                "created": f">={since_date}",
-            }
 
-            try:
-                response = self.session.get(url, params=params)
-                response.raise_for_status()
-                data = response.json()
+        # Fetch runs from each nightly workflow
+        for workflow_file in self.nightly_workflows:
+            print(f"  Fetching from {workflow_file}...")
+            page = 1
+            per_page = 100
+            workflow_runs = []
+
+            while True:
+                url = f"{self.base_url}/repos/{self.repo}/actions/runs"
+                params = {
+                    "workflow_id": workflow_file,
+                    "per_page": per_page,
+                    "page": page,
+                    "created": f">={since_date}",
+                }
 
-                if not data.get("workflow_runs"):
-                    break
+                try:
+                    response = self.session.get(url, params=params)
+                    response.raise_for_status()
+                    data = response.json()
 
-                runs = data["workflow_runs"]
-                all_runs.extend(runs)
-                print(f"Fetched {len(all_runs)} nightly runs so far...")
+                    if not data.get("workflow_runs"):
+                        break
 
-                if len(runs) < per_page:
-                    break
+                    runs = data["workflow_runs"]
+                    workflow_runs.extend(runs)
+
+                    if len(runs) < per_page:
+                        break
 
-                page += 1
-                time.sleep(0.1)
+                    page += 1
+                    time.sleep(0.1)
+
+                except requests.exceptions.RequestException as e:
+                    print(f"    Warning: Error fetching from {workflow_file}: {e}")
+                    break
 
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching nightly test data: {e}")
-                break
+            print(f"    Fetched {len(workflow_runs)} runs from {workflow_file}")
+            all_runs.extend(workflow_runs)
 
         print(f"Total nightly runs fetched: {len(all_runs)}")
         return all_runs

From 94cf58a92f6bd248d17b7b4fe63f14a5fc82b0d1 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 14 Nov 2025 17:48:55 -0800
Subject: [PATCH 15/31] Use built-in GITHUB_TOKEN for test workflow

The GH_PAT_FOR_NIGHTLY_CI_DATA secret may not be available in PR/branch contexts, so use the built-in GITHUB_TOKEN instead for testing purposes.
---
 .github/workflows/test-nightly-monitor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-nightly-monitor.yml b/.github/workflows/test-nightly-monitor.yml
index 2fba7eb785f5..ca1256141944 100644
--- a/.github/workflows/test-nightly-monitor.yml
+++ b/.github/workflows/test-nightly-monitor.yml
@@ -38,7 +38,7 @@ jobs:
 
       - name: Run Nightly Test Monitor
         env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |

From 5f773faca924edb99bd5b0e1e8db096cd09f75cb Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Mon, 17 Nov 2025 20:15:24 -0800
Subject: [PATCH 16/31] Use GH_PAT_FOR_NIGHTLY_CI_DATA token in
 test-nightly-monitor

---
 .github/workflows/test-nightly-monitor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-nightly-monitor.yml b/.github/workflows/test-nightly-monitor.yml
index ca1256141944..2fba7eb785f5 100644
--- a/.github/workflows/test-nightly-monitor.yml
+++ b/.github/workflows/test-nightly-monitor.yml
@@ -38,7 +38,7 @@ jobs:
 
       - name: Run Nightly Test Monitor
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |

From 8896dd15c5de384cd70f59cdcc302039dec8be20 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 13:00:12 -0800
Subject: [PATCH 17/31] Consolidate nightly monitoring into ci_analyzer.py

- Integrate nightly monitoring functionality into ci_analyzer.py with --mode flag
- Update nightly job names for NVIDIA, AMD, and Intel workflows
- Remove standalone nightly_monitor.py file
- Update workflow files to use ci_analyzer.py --mode nightly
---
 .github/workflows/nightly-monitor.yml      |   3 +-
 .github/workflows/test-nightly-monitor.yml |   3 +-
 scripts/ci_monitor/ci_analyzer.py          | 460 +++++++++++-
 scripts/ci_monitor/nightly_monitor.py      | 792 ---------------------
 4 files changed, 447 insertions(+), 811 deletions(-)
 delete mode 100755 scripts/ci_monitor/nightly_monitor.py

diff --git a/.github/workflows/nightly-monitor.yml b/.github/workflows/nightly-monitor.yml
index bdf9ed260481..19ebea60d333 100644
--- a/.github/workflows/nightly-monitor.yml
+++ b/.github/workflows/nightly-monitor.yml
@@ -46,8 +46,9 @@ jobs:
           PYTHONIOENCODING: utf-8
         run: |
           cd scripts/ci_monitor
-          python nightly_monitor.py \
+          python ci_analyzer.py \
             --token $GITHUB_TOKEN \
+            --mode nightly \
             --days ${{ inputs.days || '7' }} \
             --output nightly_monitor_$(date +%Y%m%d_%H%M%S).json
         continue-on-error: true
diff --git a/.github/workflows/test-nightly-monitor.yml b/.github/workflows/test-nightly-monitor.yml
index 2fba7eb785f5..10f3c3c1bdd3 100644
--- a/.github/workflows/test-nightly-monitor.yml
+++ b/.github/workflows/test-nightly-monitor.yml
@@ -43,8 +43,9 @@ jobs:
           PYTHONIOENCODING: utf-8
         run: |
           cd scripts/ci_monitor
-          python nightly_monitor.py \
+          python ci_analyzer.py \
             --token $GITHUB_TOKEN \
+            --mode nightly \
             --days ${{ inputs.days || '7' }} \
             --output test_monitor_$(date +%Y%m%d_%H%M%S).json
 
diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index dc49324eb7c1..0db04b86894d 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 
 import argparse
+import base64
 import json
 import os
+import re
 import sys
 import time
 from collections import Counter, defaultdict
-from datetime import datetime
-from typing import Dict, List
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional
 
 import requests
 
@@ -26,6 +28,33 @@ def __init__(self, token: str):
         self.session = requests.Session()
         self.session.headers.update(self.headers)
 
+        # Nightly workflow files to monitor
+        self.nightly_workflows = [
+            "nightly-test.yml",
+            "nightly-test-nvidia.yml",
+            "nightly-test-amd.yml",
+            "nightly-test-intel.yml",
+        ]
+
+        # Performance metric patterns for parsing logs
+        self.perf_patterns = {
+            "output_throughput": re.compile(
+                r"Output token throughput \(tok/s\):\s*([\d.]+)"
+            ),
+            "input_throughput": re.compile(
+                r"Input token throughput \(tok/s\):\s*([\d.]+)"
+            ),
+            "latency": re.compile(r"Median E2E Latency \(ms\):\s*([\d.]+)"),
+            "ttft": re.compile(r"Median TTFT \(ms\):\s*([\d.]+)"),
+            "accept_length": re.compile(r"Accept length:\s*([\d.]+)"),
+            "accuracy": re.compile(r"Accuracy:\s*([\d.]+)"),
+            "gsm8k_score": re.compile(r"GSM8K Score:\s*([\d.]+)"),
+        }
+
+        # Historical data repository
+        self.data_repo = "sglang-bot/sglang-ci-data"
+        self.data_branch = "main"
+
     def get_recent_runs(self, limit: int = 100, branch: str = None) -> List[Dict]:
         branch_info = f" from branch '{branch}'" if branch else ""
         print(f"Fetching {limit} recent CI runs{branch_info}...")
@@ -542,20 +571,397 @@ def generate_github_summary(self, stats: Dict):
         except Exception as e:
             print(f"Failed to generate GitHub Actions summary: {e}")
 
+    def get_nightly_runs(self, days: int = 7) -> List[Dict]:
+        """Get nightly test workflow runs from the last N days"""
+        print(f"Fetching nightly test runs from the last {days} days...")
+
+        since_date = (datetime.now() - timedelta(days=days)).isoformat()
+        all_runs = []
+
+        for workflow_file in self.nightly_workflows:
+            print(f"  Fetching from {workflow_file}...")
+            page = 1
+            per_page = 100
+            workflow_runs = []
+
+            while True:
+                url = f"{self.base_url}/repos/{self.repo}/actions/runs"
+                params = {
+                    "workflow_id": workflow_file,
+                    "per_page": per_page,
+                    "page": page,
+                    "created": f">={since_date}",
+                }
+
+                try:
+                    response = self.session.get(url, params=params)
+                    response.raise_for_status()
+                    data = response.json()
+
+                    if not data.get("workflow_runs"):
+                        break
+
+                    runs = data["workflow_runs"]
+                    workflow_runs.extend(runs)
+
+                    if len(runs) < per_page:
+                        break
+
+                    page += 1
+                    time.sleep(0.1)
+
+                except requests.exceptions.RequestException as e:
+                    print(f"    Warning: Error fetching from {workflow_file}: {e}")
+                    break
+
+            print(f"    Fetched {len(workflow_runs)} runs from {workflow_file}")
+            all_runs.extend(workflow_runs)
+
+        print(f"Total nightly runs fetched: {len(all_runs)}")
+        return all_runs
+
+    def get_job_logs(self, job_id: int) -> Optional[str]:
+        """Get logs for a specific job"""
+        url = f"{self.base_url}/repos/{self.repo}/actions/jobs/{job_id}/logs"
+        try:
+            response = self.session.get(url)
+            response.raise_for_status()
+            return response.text
+        except requests.exceptions.RequestException as e:
+            print(f"  Warning: Could not fetch logs for job {job_id}: {e}")
+            return None
+
+    def parse_metrics_from_logs(self, logs: str, job_name: str) -> Dict[str, List[float]]:
+        """Parse performance metrics from job logs"""
+        metrics = defaultdict(list)
+
+        if not logs:
+            return metrics
+
+        for line in logs.split("\n"):
+            for metric_name, pattern in self.perf_patterns.items():
+                match = pattern.search(line)
+                if match:
+                    try:
+                        value = float(match.group(1))
+                        metrics[metric_name].append(value)
+                    except (ValueError, IndexError):
+                        continue
+
+        return dict(metrics)
+
+    def analyze_nightly_with_metrics(self, runs: List[Dict]) -> Dict:
+        """Analyze nightly test runs including performance metrics"""
+        print("Analyzing nightly test data with performance metrics...")
+
+        # Get nightly job names from the existing job categories
+        nightly_jobs = [
+            # New NVIDIA job names (nightly-test-nvidia.yml)
+            "nightly-test-general-1-gpu-runner",
+            "nightly-test-general-4-gpu-h100",
+            "nightly-test-general-8-gpu-h200",
+            "nightly-test-general-8-gpu-h20",
+            "nightly-test-text-accuracy-2-gpu-runner",
+            "nightly-test-text-perf-2-gpu-runner",
+            "nightly-test-vlm-accuracy-2-gpu-runner",
+            "nightly-test-vlm-perf-2-gpu-runner",
+            "nightly-test-perf-4-gpu-b200",
+            "nightly-test-perf-8-gpu-b200",
+            # AMD job names (nightly-test-amd.yml)
+            "nightly-test",
+            # Intel job names (nightly-test-intel.yml)
+            "placeholder",
+            # Old job names (kept for backwards compatibility)
+            "nightly-test-eval-text-models",
+            "nightly-test-perf-text-models",
+            "nightly-test-eval-vlms",
+            "nightly-test-perf-vlms",
+            "nightly-test-1-gpu",
+            "nightly-test-4-gpu",
+            "nightly-test-8-gpu-h200",
+            "nightly-test-8-gpu-h20",
+            "nightly-test-4-gpu-b200",
+            "nightly-test-8-gpu-b200",
+        ]
+
+        stats = {
+            "total_runs": len(runs),
+            "successful_runs": 0,
+            "failed_runs": 0,
+            "cancelled_runs": 0,
+            "job_stats": defaultdict(
+                lambda: {
+                    "total": 0,
+                    "success": 0,
+                    "failure": 0,
+                    "recent_failures": [],
+                    "avg_duration_minutes": 0,
+                    "durations": [],
+                    "performance_metrics": defaultdict(list),
+                }
+            ),
+            "daily_stats": defaultdict(
+                lambda: {
+                    "total": 0,
+                    "success": 0,
+                    "failure": 0,
+                }
+            ),
+        }
+
+        for i, run in enumerate(runs, 1):
+            if i % 10 == 0:
+                print(f"Processed {i}/{len(runs)} runs...")
+
+            run_status = run.get("conclusion", "unknown")
+            run_id = run.get("id")
+            run_number = run.get("run_number")
+            created_at = run.get("created_at")
+            run_url = f"https://github.com/{self.repo}/actions/runs/{run_id}"
+
+            # Track daily stats
+            date_str = created_at.split("T")[0] if created_at else "unknown"
+            stats["daily_stats"][date_str]["total"] += 1
+
+            if run_status == "success":
+                stats["successful_runs"] += 1
+                stats["daily_stats"][date_str]["success"] += 1
+            elif run_status == "failure":
+                stats["failed_runs"] += 1
+                stats["daily_stats"][date_str]["failure"] += 1
+            elif run_status == "cancelled":
+                stats["cancelled_runs"] += 1
+
+            # Analyze individual jobs
+            jobs = self._get_job_details(run_id)
+            for job in jobs:
+                job_name = job.get("name", "Unknown")
+                job_conclusion = job.get("conclusion", "unknown")
+                job_id = job.get("id")
+                started_at = job.get("started_at")
+                completed_at = job.get("completed_at")
+
+                # Only track nightly test jobs
+                if job_name not in nightly_jobs:
+                    continue
+
+                job_stat = stats["job_stats"][job_name]
+                job_stat["total"] += 1
+
+                if job_conclusion == "success":
+                    job_stat["success"] += 1
+
+                    # For successful performance/accuracy jobs, fetch metrics
+                    if "perf" in job_name.lower() or "accuracy" in job_name.lower() or "eval" in job_name.lower():
+                        logs = self.get_job_logs(job_id)
+                        if logs:
+                            metrics = self.parse_metrics_from_logs(logs, job_name)
+                            for metric_name, values in metrics.items():
+                                if values:
+                                    job_stat["performance_metrics"][metric_name].extend(
+                                        [
+                                            {
+                                                "value": v,
+                                                "timestamp": created_at,
+                                                "run_id": run_id,
+                                                "job_name": job_name,
+                                            }
+                                            for v in values
+                                        ]
+                                    )
+
+                elif job_conclusion == "failure":
+                    job_stat["failure"] += 1
+
+                    if len(job_stat["recent_failures"]) < 5:
+                        job_stat["recent_failures"].append(
+                            {
+                                "run_url": run_url,
+                                "run_number": run_number,
+                                "created_at": created_at,
+                                "job_url": job.get("html_url"),
+                            }
+                        )
+
+                # Track duration
+                if started_at and completed_at:
+                    try:
+                        start = datetime.fromisoformat(started_at.replace("Z", "+00:00"))
+                        end = datetime.fromisoformat(completed_at.replace("Z", "+00:00"))
+                        duration_minutes = (end - start).total_seconds() / 60
+                        job_stat["durations"].append(duration_minutes)
+                    except:
+                        pass
+
+            time.sleep(0.1)
+
+        # Calculate average durations
+        for job_name, job_stat in stats["job_stats"].items():
+            if job_stat["durations"]:
+                job_stat["avg_duration_minutes"] = sum(job_stat["durations"]) / len(
+                    job_stat["durations"]
+                )
+                del job_stat["durations"]
+
+        return stats
+
+    def generate_nightly_report(self, stats: Dict, output_file: str = None):
+        """Generate a report for nightly test analysis"""
+        print("\n" + "=" * 80)
+        print("NIGHTLY TEST MONITOR REPORT")
+        print("=" * 80)
+        print(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        print(f"Total Runs Analyzed: {stats['total_runs']}")
+        print(
+            f"Successful: {stats['successful_runs']} "
+            f"({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)"
+        )
+        print(
+            f"Failed: {stats['failed_runs']} "
+            f"({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)"
+        )
+        print(f"Cancelled: {stats['cancelled_runs']}")
+        print("=" * 80)
+
+        # Daily trend
+        print("\nDAILY TRENDS:")
+        print("-" * 80)
+        daily_stats = sorted(stats["daily_stats"].items(), reverse=True)[:7]
+        for date, day_stats in daily_stats:
+            success_rate = (day_stats["success"] / max(1, day_stats["total"])) * 100
+            print(
+                f"{date}: {day_stats['total']} runs, {day_stats['success']} success "
+                f"({success_rate:.1f}%), {day_stats['failure']} failed"
+            )
+
+        # Job statistics
+        print("\nJOB STATISTICS:")
+        print("-" * 80)
+        print(
+            f"{'Job Name':<50} {'Total':<8} {'Success':<8} {'Failed':<8} "
+            f"{'Rate':<8} {'Avg Duration'}"
+        )
+        print("-" * 80)
+
+        job_stats_sorted = sorted(
+            stats["job_stats"].items(), key=lambda x: x[1]["failure"], reverse=True
+        )
+
+        for job_name, job_stat in job_stats_sorted:
+            total = job_stat["total"]
+            success = job_stat["success"]
+            failure = job_stat["failure"]
+            success_rate = (success / max(1, total)) * 100
+            avg_duration = job_stat["avg_duration_minutes"]
+
+            print(
+                f"{job_name:<50} {total:<8} {success:<8} {failure:<8} "
+                f"{success_rate:>6.1f}% {avg_duration:>7.1f}m"
+            )
+
+            # Show performance metrics if available
+            if job_stat.get("performance_metrics"):
+                perf_metrics = job_stat["performance_metrics"]
+                print(f"  Performance metrics:")
+
+                for metric_name, metric_data in perf_metrics.items():
+                    if metric_data:
+                        values = [m["value"] for m in metric_data]
+                        avg_value = sum(values) / len(values)
+                        print(f"    - {metric_name}: {avg_value:.2f} (n={len(values)})")
+
+            # Show recent failures
+            if job_stat["recent_failures"]:
+                print(f"  Recent failures:")
+                for failure in job_stat["recent_failures"][:3]:
+                    print(f"    - Run #{failure['run_number']}: {failure['run_url']}")
+
+        print("=" * 80)
+
+        # Save to file if requested
+        if output_file:
+            with open(output_file, "w") as f:
+                json.dump(stats, f, indent=2, default=str)
+            print(f"\nDetailed stats saved to: {output_file}")
+
+    def detect_nightly_regressions(self, stats: Dict) -> List[Dict]:
+        """Detect regressions in nightly tests"""
+        regressions = []
+
+        for job_name, job_stat in stats["job_stats"].items():
+            total = job_stat["total"]
+            failure = job_stat["failure"]
+
+            if total > 0:
+                failure_rate = (failure / total) * 100
+
+                # Flag jobs with high failure rates
+                if failure_rate > 30:
+                    regressions.append(
+                        {
+                            "job_name": job_name,
+                            "type": "high_failure_rate",
+                            "failure_rate": failure_rate,
+                            "total_runs": total,
+                            "failures": failure,
+                        }
+                    )
+
+                # Flag jobs with recent consecutive failures
+                recent_failures = len(job_stat["recent_failures"])
+                if recent_failures >= 3:
+                    regressions.append(
+                        {
+                            "job_name": job_name,
+                            "type": "consecutive_failures",
+                            "recent_failure_count": recent_failures,
+                        }
+                    )
+
+        if regressions:
+            print("\n" + "=" * 80)
+            print("REGRESSIONS DETECTED:")
+            print("=" * 80)
+            for regression in regressions:
+                print(f"\nJob: {regression['job_name']}")
+                if regression["type"] == "high_failure_rate":
+                    print(
+                        f"  High failure rate: {regression['failure_rate']:.1f}% "
+                        f"({regression['failures']}/{regression['total_runs']})"
+                    )
+                elif regression["type"] == "consecutive_failures":
+                    print(
+                        f"  {regression['recent_failure_count']} recent consecutive failures"
+                    )
+            print("=" * 80)
+
+        return regressions
+
 
 def main():
     parser = argparse.ArgumentParser(description="SGLang CI Analyzer")
     parser.add_argument("--token", required=True, help="GitHub Personal Access Token")
+    parser.add_argument(
+        "--mode",
+        choices=["ci", "nightly"],
+        default="ci",
+        help="Analysis mode: 'ci' for general CI analysis, 'nightly' for nightly test monitoring (default: ci)",
+    )
     parser.add_argument(
         "--limit",
         type=int,
         default=100,
-        help="Number of runs to analyze (default: 100)",
+        help="Number of runs to analyze (for ci mode, default: 100)",
+    )
+    parser.add_argument(
+        "--days",
+        type=int,
+        default=7,
+        help="Number of days to analyze (for nightly mode, default: 7)",
     )
     parser.add_argument(
         "--output",
-        default="ci_analysis.json",
-        help="Output file (default: ci_analysis.json)",
+        help="Output file for detailed stats (JSON)",
     )
     parser.add_argument(
         "--branch",
@@ -568,20 +974,40 @@ def main():
     analyzer = SGLangCIAnalyzer(args.token)
 
     try:
-        branch = args.branch if args.branch else None
-        runs = analyzer.get_recent_runs(args.limit, branch)
-
-        if not runs:
-            print("No CI run data found")
-            return
-
-        stats = analyzer.analyze_ci_failures(runs)
-
-        analyzer.generate_report(stats)
+        if args.mode == "nightly":
+            # Nightly test monitoring mode
+            runs = analyzer.get_nightly_runs(days=args.days)
+
+            if not runs:
+                print("No nightly test runs found in the specified time period.")
+                sys.exit(1)
+
+            stats = analyzer.analyze_nightly_with_metrics(runs)
+            analyzer.generate_nightly_report(stats, args.output)
+            regressions = analyzer.detect_nightly_regressions(stats)
+
+            # Exit with error code if regressions detected
+            if regressions:
+                sys.exit(1)
+            else:
+                print("\n✓ No significant regressions detected")
+                sys.exit(0)
+
+        else:
+            # Regular CI analysis mode
+            branch = args.branch if args.branch else None
+            runs = analyzer.get_recent_runs(args.limit, branch)
+
+            if not runs:
+                print("No CI run data found")
+                return
 
-        analyzer.save_detailed_report(stats, args.output)
+            stats = analyzer.analyze_ci_failures(runs)
+            analyzer.generate_report(stats)
 
-        analyzer.generate_github_summary(stats)
+            output_file = args.output or "ci_analysis.json"
+            analyzer.save_detailed_report(stats, output_file)
+            analyzer.generate_github_summary(stats)
 
     except Exception as e:
         print(f"Error during analysis: {e}")
diff --git a/scripts/ci_monitor/nightly_monitor.py b/scripts/ci_monitor/nightly_monitor.py
deleted file mode 100755
index b60d18bbd0b9..000000000000
--- a/scripts/ci_monitor/nightly_monitor.py
+++ /dev/null
@@ -1,792 +0,0 @@
-#!/usr/bin/env python3
-"""
-Nightly Test Monitor
-
-Monitors nightly test runs for performance and accuracy regressions.
-Analyzes metrics from GitHub summaries and tracks trends over time.
-"""
-
-import argparse
-import base64
-import json
-import os
-import re
-import sys
-import time
-from collections import defaultdict
-from datetime import datetime, timedelta
-from typing import Dict, List, Optional, Tuple
-
-import requests
-
-
-class NightlyTestMonitor:
-    def __init__(self, token: str):
-        self.token = token
-        self.base_url = "https://api.github.com"
-        self.repo = "sgl-project/sglang"
-        self.headers = {
-            "Authorization": f"token {token}",
-            "Accept": "application/vnd.github.v3+json",
-            "User-Agent": "SGLang-Nightly-Monitor/1.0",
-        }
-        self.session = requests.Session()
-        self.session.headers.update(self.headers)
-
-        # Nightly test jobs to monitor
-        self.nightly_jobs = [
-            # Old job names (nightly-test.yml)
-            "nightly-test-eval-text-models",
-            "nightly-test-perf-text-models",
-            "nightly-test-eval-vlms",
-            "nightly-test-perf-vlms",
-            "nightly-test-1-gpu",
-            "nightly-test-4-gpu",
-            "nightly-test-8-gpu-h200",
-            "nightly-test-8-gpu-h20",
-            "nightly-test-4-gpu-b200",
-            "nightly-test-8-gpu-b200",
-            # New NVIDIA job names (nightly-test-nvidia.yml)
-            "nightly-test-general-1-gpu-runner",
-            "nightly-test-general-4-gpu-h100",
-            "nightly-test-general-8-gpu-h200",
-            "nightly-test-general-8-gpu-h20",
-            "nightly-test-text-accuracy-2-gpu-runner",
-            "nightly-test-text-perf-2-gpu-runner",
-            "nightly-test-vlm-accuracy-2-gpu-runner",
-            "nightly-test-vlm-perf-2-gpu-runner",
-            "nightly-test-perf-4-gpu-b200",
-            "nightly-test-perf-8-gpu-b200",
-            # AMD job names (nightly-test-amd.yml)
-            "nightly-test",
-        ]
-
-        # Nightly workflow files to monitor
-        self.nightly_workflows = [
-            "nightly-test.yml",
-            "nightly-test-nvidia.yml",
-            "nightly-test-amd.yml",
-            "nightly-test-intel.yml",
-        ]
-
-        # Performance metric patterns for parsing logs
-        self.perf_patterns = {
-            "output_throughput": re.compile(
-                r"Output token throughput \(tok/s\):\s*([\d.]+)"
-            ),
-            "input_throughput": re.compile(
-                r"Input token throughput \(tok/s\):\s*([\d.]+)"
-            ),
-            "latency": re.compile(r"Median E2E Latency \(ms\):\s*([\d.]+)"),
-            "ttft": re.compile(r"Median TTFT \(ms\):\s*([\d.]+)"),
-            "accept_length": re.compile(r"Accept length:\s*([\d.]+)"),
-            "accuracy": re.compile(r"Accuracy:\s*([\d.]+)"),
-            "gsm8k_score": re.compile(r"GSM8K Score:\s*([\d.]+)"),
-        }
-
-        # Historical data repository
-        self.data_repo = "sglang-bot/sglang-ci-data"
-        self.data_branch = "main"
-
-    def get_nightly_runs(self, days: int = 7) -> List[Dict]:
-        """Get nightly test workflow runs from the last N days from multiple workflows"""
-        print(f"Fetching nightly test runs from the last {days} days...")
-
-        since_date = (datetime.now() - timedelta(days=days)).isoformat()
-
-        all_runs = []
-
-        # Fetch runs from each nightly workflow
-        for workflow_file in self.nightly_workflows:
-            print(f"  Fetching from {workflow_file}...")
-            page = 1
-            per_page = 100
-            workflow_runs = []
-
-            while True:
-                url = f"{self.base_url}/repos/{self.repo}/actions/runs"
-                params = {
-                    "workflow_id": workflow_file,
-                    "per_page": per_page,
-                    "page": page,
-                    "created": f">={since_date}",
-                }
-
-                try:
-                    response = self.session.get(url, params=params)
-                    response.raise_for_status()
-                    data = response.json()
-
-                    if not data.get("workflow_runs"):
-                        break
-
-                    runs = data["workflow_runs"]
-                    workflow_runs.extend(runs)
-
-                    if len(runs) < per_page:
-                        break
-
-                    page += 1
-                    time.sleep(0.1)
-
-                except requests.exceptions.RequestException as e:
-                    print(f"    Warning: Error fetching from {workflow_file}: {e}")
-                    break
-
-            print(f"    Fetched {len(workflow_runs)} runs from {workflow_file}")
-            all_runs.extend(workflow_runs)
-
-        print(f"Total nightly runs fetched: {len(all_runs)}")
-        return all_runs
-
-    def get_job_details(self, run_id: int) -> List[Dict]:
-        """Get job details for a specific run"""
-        url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs"
-        try:
-            response = self.session.get(url)
-            response.raise_for_status()
-            return response.json().get("jobs", [])
-        except:
-            return []
-
-    def get_job_logs(self, job_id: int) -> Optional[str]:
-        """Get logs for a specific job"""
-        url = f"{self.base_url}/repos/{self.repo}/actions/jobs/{job_id}/logs"
-        try:
-            response = self.session.get(url)
-            response.raise_for_status()
-            return response.text
-        except requests.exceptions.RequestException as e:
-            print(f"  Warning: Could not fetch logs for job {job_id}: {e}")
-            return None
-
-    def parse_metrics_from_logs(
-        self, logs: str, job_name: str
-    ) -> Dict[str, List[float]]:
-        """
-        Parse performance metrics from job logs.
-
-        Args:
-            logs: Raw log text from the job
-            job_name: Name of the job (to determine which metrics to look for)
-
-        Returns:
-            Dictionary mapping metric names to lists of values found
-        """
-        metrics = defaultdict(list)
-
-        if not logs:
-            return metrics
-
-        # Parse each line for matching patterns
-        for line in logs.split("\n"):
-            for metric_name, pattern in self.perf_patterns.items():
-                match = pattern.search(line)
-                if match:
-                    try:
-                        value = float(match.group(1))
-                        metrics[metric_name].append(value)
-                    except (ValueError, IndexError):
-                        continue
-
-        return dict(metrics)
-
-    def get_historical_data_paths(self) -> List[str]:
-        """
-        Get list of available nightly monitor data files from the data repository.
-
-        Returns:
-            List of file paths in the repository
-        """
-        url = f"{self.base_url}/repos/{self.data_repo}/contents/nightly_monitor"
-        try:
-            response = self.session.get(url)
-            response.raise_for_status()
-            contents = response.json()
-
-            # Filter for JSON files
-            json_files = [
-                item["path"]
-                for item in contents
-                if item["type"] == "file" and item["name"].endswith(".json")
-            ]
-            return sorted(json_files, reverse=True)  # Most recent first
-        except requests.exceptions.RequestException as e:
-            print(f"Warning: Could not fetch historical data paths: {e}")
-            return []
-
-    def fetch_historical_data(self, file_path: str) -> Optional[Dict]:
-        """
-        Fetch a specific historical data file from the repository.
-
-        Args:
-            file_path: Path to the file in the repository
-
-        Returns:
-            Dictionary with historical data, or None if fetch failed
-        """
-        url = f"{self.base_url}/repos/{self.data_repo}/contents/{file_path}"
-        try:
-            response = self.session.get(url)
-            response.raise_for_status()
-            data = response.json()
-
-            # Decode base64 content
-            content = base64.b64decode(data["content"]).decode("utf-8")
-            return json.loads(content)
-        except (
-            requests.exceptions.RequestException,
-            json.JSONDecodeError,
-            KeyError,
-        ) as e:
-            print(f"Warning: Could not fetch historical data from {file_path}: {e}")
-            return None
-
-    def get_recent_historical_metrics(
-        self, job_name: str, metric_name: str, days: int = 7
-    ) -> List[Dict]:
-        """
-        Get recent historical metrics for a specific job and metric.
-
-        Args:
-            job_name: Name of the job
-            metric_name: Name of the metric (e.g., 'output_throughput')
-            days: Number of days to look back
-
-        Returns:
-            List of metric data points with timestamps
-        """
-        print(
-            f"  Fetching historical {metric_name} data for {job_name} (last {days} days)..."
-        )
-
-        historical_paths = self.get_historical_data_paths()
-        if not historical_paths:
-            return []
-
-        cutoff_date = datetime.now() - timedelta(days=days)
-        historical_metrics = []
-
-        # Fetch recent files (limit to avoid too many API calls)
-        for file_path in historical_paths[: min(days * 2, 14)]:  # Max 14 files
-            historical_data = self.fetch_historical_data(file_path)
-            if not historical_data:
-                continue
-
-            # Check if this file has data for our job
-            job_stats = historical_data.get("job_stats", {}).get(job_name, {})
-            if not job_stats:
-                continue
-
-            # Extract metrics
-            perf_metrics = job_stats.get("performance_metrics", {}).get(metric_name, [])
-            for metric_entry in perf_metrics:
-                try:
-                    timestamp = datetime.fromisoformat(
-                        metric_entry["timestamp"].replace("Z", "+00:00")
-                    ).replace(tzinfo=None)
-                    if timestamp >= cutoff_date:
-                        historical_metrics.append(metric_entry)
-                except (ValueError, KeyError):
-                    continue
-
-        return sorted(historical_metrics, key=lambda x: x["timestamp"])
-
-    def compare_with_historical(
-        self, current_metrics: Dict[str, List[Dict]], days: int = 7
-    ) -> Dict[str, Dict]:
-        """
-        Compare current metrics with historical data to detect changes.
-
-        Args:
-            current_metrics: Dictionary of metric_name -> list of metric data points
-            days: Number of days to look back for comparison
-
-        Returns:
-            Dictionary with comparison results including percentage changes
-        """
-        comparisons = {}
-
-        for metric_name, current_data in current_metrics.items():
-            if not current_data:
-                continue
-
-            # Calculate current average
-            current_values = [d["value"] for d in current_data]
-            current_avg = sum(current_values) / len(current_values)
-
-            # Get the job name from the first data point
-            # (assumes all data points are from the same job)
-            job_name = current_data[0].get("job_name", "unknown")
-
-            # Fetch historical data
-            historical_data = self.get_recent_historical_metrics(
-                job_name, metric_name, days
-            )
-
-            if not historical_data:
-                comparisons[metric_name] = {
-                    "current_avg": current_avg,
-                    "historical_avg": None,
-                    "percent_change": None,
-                    "status": "no_history",
-                }
-                continue
-
-            # Calculate historical average
-            historical_values = [d["value"] for d in historical_data]
-            historical_avg = sum(historical_values) / len(historical_values)
-
-            # Calculate percentage change
-            if historical_avg > 0:
-                percent_change = ((current_avg - historical_avg) / historical_avg) * 100
-            else:
-                percent_change = 0
-
-            # Determine status based on change
-            if abs(percent_change) < 5:
-                status = "stable"
-            elif abs(percent_change) < 10:
-                status = "minor_change"
-            else:
-                status = "significant_change"
-
-            comparisons[metric_name] = {
-                "current_avg": current_avg,
-                "historical_avg": historical_avg,
-                "percent_change": percent_change,
-                "status": status,
-                "current_count": len(current_values),
-                "historical_count": len(historical_values),
-            }
-
-        return comparisons
-
-    def analyze_nightly_tests(self, runs: List[Dict]) -> Dict:
-        """Analyze nightly test runs for failures and performance"""
-        print("Analyzing nightly test data...")
-
-        stats = {
-            "total_runs": len(runs),
-            "successful_runs": 0,
-            "failed_runs": 0,
-            "cancelled_runs": 0,
-            "job_stats": defaultdict(
-                lambda: {
-                    "total": 0,
-                    "success": 0,
-                    "failure": 0,
-                    "recent_failures": [],
-                    "avg_duration_minutes": 0,
-                    "durations": [],
-                    "performance_metrics": defaultdict(list),  # New: track perf metrics
-                }
-            ),
-            "daily_stats": defaultdict(
-                lambda: {
-                    "total": 0,
-                    "success": 0,
-                    "failure": 0,
-                }
-            ),
-        }
-
-        for i, run in enumerate(runs, 1):
-            if i % 10 == 0:
-                print(f"Processed {i}/{len(runs)} runs...")
-
-            run_status = run.get("conclusion", "unknown")
-            run_id = run.get("id")
-            run_number = run.get("run_number")
-            created_at = run.get("created_at")
-            run_url = f"https://github.com/{self.repo}/actions/runs/{run_id}"
-
-            # Track daily stats
-            date_str = created_at.split("T")[0] if created_at else "unknown"
-            stats["daily_stats"][date_str]["total"] += 1
-
-            if run_status == "success":
-                stats["successful_runs"] += 1
-                stats["daily_stats"][date_str]["success"] += 1
-            elif run_status == "failure":
-                stats["failed_runs"] += 1
-                stats["daily_stats"][date_str]["failure"] += 1
-            elif run_status == "cancelled":
-                stats["cancelled_runs"] += 1
-
-            # Analyze individual jobs
-            jobs = self.get_job_details(run_id)
-            for job in jobs:
-                job_name = job.get("name", "Unknown")
-                job_conclusion = job.get("conclusion", "unknown")
-                job_id = job.get("id")
-                started_at = job.get("started_at")
-                completed_at = job.get("completed_at")
-
-                # Only track our nightly test jobs
-                if job_name not in self.nightly_jobs:
-                    continue
-
-                job_stat = stats["job_stats"][job_name]
-                job_stat["total"] += 1
-
-                if job_conclusion == "success":
-                    job_stat["success"] += 1
-
-                    # For successful performance jobs, fetch metrics
-                    if "perf" in job_name.lower() or "eval" in job_name.lower():
-                        logs = self.get_job_logs(job_id)
-                        if logs:
-                            metrics = self.parse_metrics_from_logs(logs, job_name)
-                            # Store metrics with timestamp and job name
-                            for metric_name, values in metrics.items():
-                                if values:  # Only store if we found values
-                                    job_stat["performance_metrics"][metric_name].extend(
-                                        [
-                                            {
-                                                "value": v,
-                                                "timestamp": created_at,
-                                                "run_id": run_id,
-                                                "job_name": job_name,
-                                            }
-                                            for v in values
-                                        ]
-                                    )
-
-                elif job_conclusion == "failure":
-                    job_stat["failure"] += 1
-
-                    # Store recent failures (up to 5)
-                    if len(job_stat["recent_failures"]) < 5:
-                        job_stat["recent_failures"].append(
-                            {
-                                "run_url": run_url,
-                                "run_number": run_number,
-                                "created_at": created_at,
-                                "job_url": job.get("html_url"),
-                            }
-                        )
-
-                # Track duration
-                if started_at and completed_at:
-                    try:
-                        start = datetime.fromisoformat(
-                            started_at.replace("Z", "+00:00")
-                        )
-                        end = datetime.fromisoformat(
-                            completed_at.replace("Z", "+00:00")
-                        )
-                        duration_minutes = (end - start).total_seconds() / 60
-                        job_stat["durations"].append(duration_minutes)
-                    except:
-                        pass
-
-            time.sleep(0.1)
-
-        # Calculate average durations
-        for job_name, job_stat in stats["job_stats"].items():
-            if job_stat["durations"]:
-                job_stat["avg_duration_minutes"] = sum(job_stat["durations"]) / len(
-                    job_stat["durations"]
-                )
-                del job_stat["durations"]  # Remove raw data to reduce size
-
-        return stats
-
-    def generate_github_summary(self, stats: Dict, regressions: List[Dict]):
-        """Generate GitHub Actions step summary"""
-        github_summary = os.environ.get("GITHUB_STEP_SUMMARY")
-        if not github_summary:
-            return
-
-        with open(github_summary, "a") as f:
-            f.write("# Nightly Test Monitor Report\n\n")
-            f.write(
-                f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-            )
-
-            # Summary stats
-            f.write("## Summary\n\n")
-            f.write(f"- Total Runs: {stats['total_runs']}\n")
-            f.write(
-                f"- Successful: {stats['successful_runs']} "
-                f"({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)\n"
-            )
-            f.write(
-                f"- Failed: {stats['failed_runs']} "
-                f"({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)\n\n"
-            )
-
-            # Regressions
-            if regressions:
-                f.write("## Regressions Detected\n\n")
-                for reg in regressions:
-                    if reg["type"] == "performance_regression":
-                        f.write(
-                            f"- **{reg['job_name']}**: {reg['metric_name']} "
-                            f"({reg['percent_change']:+.1f}%)\n"
-                        )
-
-            # Performance metrics table
-            f.write("\n## Performance Metrics\n\n")
-            f.write("| Job | Metric | Current | Change |\n")
-            f.write("|-----|--------|---------|--------|\n")
-
-            for job_name, job_stat in stats["job_stats"].items():
-                if job_stat.get("performance_metrics"):
-                    perf_metrics = job_stat["performance_metrics"]
-                    comparisons = self.compare_with_historical(perf_metrics, days=7)
-
-                    for metric_name, metric_data in perf_metrics.items():
-                        if metric_data:
-                            values = [m["value"] for m in metric_data]
-                            avg_value = sum(values) / len(values)
-                            comparison = comparisons.get(metric_name, {})
-                            percent_change = comparison.get("percent_change")
-
-                            if percent_change is not None:
-                                f.write(
-                                    f"| {job_name} | {metric_name} | {avg_value:.2f} | "
-                                    f"{percent_change:+.1f}% |\n"
-                                )
-
-    def generate_report(self, stats: Dict, output_file: str = None):
-        """Generate a human-readable report"""
-        print("\n" + "=" * 80)
-        print("NIGHTLY TEST MONITOR REPORT")
-        print("=" * 80)
-        print(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-        print(f"Total Runs Analyzed: {stats['total_runs']}")
-        print(
-            f"Successful: {stats['successful_runs']} "
-            f"({stats['successful_runs']/max(1, stats['total_runs'])*100:.1f}%)"
-        )
-        print(
-            f"Failed: {stats['failed_runs']} "
-            f"({stats['failed_runs']/max(1, stats['total_runs'])*100:.1f}%)"
-        )
-        print(f"Cancelled: {stats['cancelled_runs']}")
-        print("=" * 80)
-
-        # Daily trend
-        print("\nDAILY TRENDS:")
-        print("-" * 80)
-        daily_stats = sorted(stats["daily_stats"].items(), reverse=True)[:7]
-        for date, day_stats in daily_stats:
-            success_rate = (day_stats["success"] / max(1, day_stats["total"])) * 100
-            print(
-                f"{date}: {day_stats['total']} runs, {day_stats['success']} success "
-                f"({success_rate:.1f}%), {day_stats['failure']} failed"
-            )
-
-        # Job statistics
-        print("\nJOB STATISTICS:")
-        print("-" * 80)
-        print(
-            f"{'Job Name':<40} {'Total':<8} {'Success':<8} {'Failed':<8} "
-            f"{'Rate':<8} {'Avg Duration'}"
-        )
-        print("-" * 80)
-
-        job_stats_sorted = sorted(
-            stats["job_stats"].items(), key=lambda x: x[1]["failure"], reverse=True
-        )
-
-        for job_name, job_stat in job_stats_sorted:
-            total = job_stat["total"]
-            success = job_stat["success"]
-            failure = job_stat["failure"]
-            success_rate = (success / max(1, total)) * 100
-            avg_duration = job_stat["avg_duration_minutes"]
-
-            print(
-                f"{job_name:<40} {total:<8} {success:<8} {failure:<8} "
-                f"{success_rate:>6.1f}% {avg_duration:>7.1f}m"
-            )
-
-            # Show performance metrics with day-to-day comparison if available
-            if job_stat.get("performance_metrics"):
-                perf_metrics = job_stat["performance_metrics"]
-                print(f"  Performance metrics:")
-
-                # Compare with historical data
-                comparisons = self.compare_with_historical(perf_metrics, days=7)
-
-                for metric_name, metric_data in perf_metrics.items():
-                    if metric_data:
-                        values = [m["value"] for m in metric_data]
-                        avg_value = sum(values) / len(values)
-
-                        # Get comparison data
-                        comparison = comparisons.get(metric_name, {})
-                        percent_change = comparison.get("percent_change")
-
-                        if percent_change is not None:
-                            change_indicator = "📈" if percent_change > 0 else "📉"
-                            if abs(percent_change) < 1:
-                                change_indicator = "➡️"
-
-                            print(
-                                f"    - {metric_name}: {avg_value:.2f} "
-                                f"(n={len(values)}) {change_indicator} "
-                                f"{percent_change:+.1f}% vs 7d avg"
-                            )
-                        else:
-                            print(
-                                f"    - {metric_name}: {avg_value:.2f} (n={len(values)}) [no history]"
-                            )
-
-            # Show recent failures
-            if job_stat["recent_failures"]:
-                print(f"  Recent failures:")
-                for failure in job_stat["recent_failures"][:3]:
-                    print(f"    - Run #{failure['run_number']}: {failure['run_url']}")
-
-        print("=" * 80)
-
-        # Save to file if requested
-        if output_file:
-            with open(output_file, "w") as f:
-                json.dump(stats, f, indent=2, default=str)
-            print(f"\nDetailed stats saved to: {output_file}")
-
-    def detect_regressions(self, stats: Dict) -> List[Dict]:
-        """Detect potential regressions in nightly tests"""
-        regressions = []
-
-        for job_name, job_stat in stats["job_stats"].items():
-            total = job_stat["total"]
-            failure = job_stat["failure"]
-
-            if total > 0:
-                failure_rate = (failure / total) * 100
-
-                # Flag jobs with high failure rates
-                if failure_rate > 30:
-                    regressions.append(
-                        {
-                            "job_name": job_name,
-                            "type": "high_failure_rate",
-                            "failure_rate": failure_rate,
-                            "total_runs": total,
-                            "failures": failure,
-                        }
-                    )
-
-                # Flag jobs with recent consecutive failures
-                recent_failures = len(job_stat["recent_failures"])
-                if recent_failures >= 3:
-                    regressions.append(
-                        {
-                            "job_name": job_name,
-                            "type": "consecutive_failures",
-                            "recent_failure_count": recent_failures,
-                        }
-                    )
-
-            # Check for performance regressions >10%
-            if job_stat.get("performance_metrics"):
-                perf_metrics = job_stat["performance_metrics"]
-                comparisons = self.compare_with_historical(perf_metrics, days=7)
-
-                for metric_name, comparison in comparisons.items():
-                    percent_change = comparison.get("percent_change")
-                    if percent_change is None:
-                        continue
-
-                    # Flag performance regressions >10%
-                    # For throughput metrics, negative change is bad
-                    # For latency/ttft metrics, positive change is bad
-                    is_regression = False
-                    if "throughput" in metric_name.lower():
-                        if percent_change < -10:  # >10% decrease in throughput
-                            is_regression = True
-                    elif (
-                        "latency" in metric_name.lower()
-                        or "ttft" in metric_name.lower()
-                    ):
-                        if percent_change > 10:  # >10% increase in latency
-                            is_regression = True
-
-                    if is_regression:
-                        regressions.append(
-                            {
-                                "job_name": job_name,
-                                "type": "performance_regression",
-                                "metric_name": metric_name,
-                                "percent_change": percent_change,
-                                "current_avg": comparison["current_avg"],
-                                "historical_avg": comparison["historical_avg"],
-                            }
-                        )
-
-        if regressions:
-            print("\n" + "=" * 80)
-            print("REGRESSIONS DETECTED:")
-            print("=" * 80)
-            for regression in regressions:
-                print(f"\nJob: {regression['job_name']}")
-                if regression["type"] == "high_failure_rate":
-                    print(
-                        f"  High failure rate: {regression['failure_rate']:.1f}% "
-                        f"({regression['failures']}/{regression['total_runs']})"
-                    )
-                elif regression["type"] == "consecutive_failures":
-                    print(
-                        f"  {regression['recent_failure_count']} recent consecutive failures"
-                    )
-                elif regression["type"] == "performance_regression":
-                    print(f"  Performance regression: {regression['metric_name']}")
-                    print(
-                        f"    Change: {regression['percent_change']:+.1f}% "
-                        f"(current: {regression['current_avg']:.2f}, "
-                        f"7d avg: {regression['historical_avg']:.2f})"
-                    )
-            print("=" * 80)
-
-        return regressions
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Monitor nightly test runs for regressions"
-    )
-    parser.add_argument("--token", required=True, help="GitHub personal access token")
-    parser.add_argument(
-        "--days", type=int, default=7, help="Number of days to analyze (default: 7)"
-    )
-    parser.add_argument("--output", help="Output file for detailed stats (JSON)")
-
-    args = parser.parse_args()
-
-    monitor = NightlyTestMonitor(args.token)
-
-    # Get nightly runs
-    runs = monitor.get_nightly_runs(days=args.days)
-
-    if not runs:
-        print("No nightly test runs found in the specified time period.")
-        sys.exit(1)
-
-    # Analyze runs
-    stats = monitor.analyze_nightly_tests(runs)
-
-    # Generate report
-    monitor.generate_report(stats, args.output)
-
-    # Detect regressions
-    regressions = monitor.detect_regressions(stats)
-
-    # Generate GitHub Actions summary
-    monitor.generate_github_summary(stats, regressions)
-
-    # Exit with error code if regressions detected
-    if regressions:
-        sys.exit(1)
-    else:
-        print("\n✓ No significant regressions detected")
-        sys.exit(0)
-
-
-if __name__ == "__main__":
-    main()

From 0198944420305656ede4248073c3aa9c947159e8 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 14:20:58 -0800
Subject: [PATCH 18/31] Apply black formatting to ci_analyzer.py

---
 scripts/ci_monitor/ci_analyzer.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 0db04b86894d..80505da590ab 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -631,7 +631,9 @@ def get_job_logs(self, job_id: int) -> Optional[str]:
             print(f"  Warning: Could not fetch logs for job {job_id}: {e}")
             return None
 
-    def parse_metrics_from_logs(self, logs: str, job_name: str) -> Dict[str, List[float]]:
+    def parse_metrics_from_logs(
+        self, logs: str, job_name: str
+    ) -> Dict[str, List[float]]:
         """Parse performance metrics from job logs"""
         metrics = defaultdict(list)
 
@@ -752,7 +754,11 @@ def analyze_nightly_with_metrics(self, runs: List[Dict]) -> Dict:
                     job_stat["success"] += 1
 
                     # For successful performance/accuracy jobs, fetch metrics
-                    if "perf" in job_name.lower() or "accuracy" in job_name.lower() or "eval" in job_name.lower():
+                    if (
+                        "perf" in job_name.lower()
+                        or "accuracy" in job_name.lower()
+                        or "eval" in job_name.lower()
+                    ):
                         logs = self.get_job_logs(job_id)
                         if logs:
                             metrics = self.parse_metrics_from_logs(logs, job_name)
@@ -786,8 +792,12 @@ def analyze_nightly_with_metrics(self, runs: List[Dict]) -> Dict:
                 # Track duration
                 if started_at and completed_at:
                     try:
-                        start = datetime.fromisoformat(started_at.replace("Z", "+00:00"))
-                        end = datetime.fromisoformat(completed_at.replace("Z", "+00:00"))
+                        start = datetime.fromisoformat(
+                            started_at.replace("Z", "+00:00")
+                        )
+                        end = datetime.fromisoformat(
+                            completed_at.replace("Z", "+00:00")
+                        )
                         duration_minutes = (end - start).total_seconds() / 60
                         job_stat["durations"].append(duration_minutes)
                     except:

From 79dbf80d180e40fed7dad9b36c7753dcc4346191 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 15:32:02 -0800
Subject: [PATCH 19/31] Remove separate nightly monitor workflow files

Nightly monitoring is now integrated into ci_analyzer.py via the --mode flag.
---
 .github/workflows/nightly-monitor.yml      | 120 ---------------------
 .github/workflows/test-nightly-monitor.yml |  58 ----------
 2 files changed, 178 deletions(-)
 delete mode 100644 .github/workflows/nightly-monitor.yml
 delete mode 100644 .github/workflows/test-nightly-monitor.yml

diff --git a/.github/workflows/nightly-monitor.yml b/.github/workflows/nightly-monitor.yml
deleted file mode 100644
index 19ebea60d333..000000000000
--- a/.github/workflows/nightly-monitor.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-name: Nightly Test Monitor
-
-on:
-  schedule:
-    - cron: '0 8 * * *' # Run daily at 8 AM UTC (after nightly tests typically complete)
-  workflow_dispatch:
-    inputs:
-      days:
-        description: 'Number of days to analyze'
-        required: false
-        default: '7'
-        type: string
-
-concurrency:
-  group: nightly-monitor-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: write
-  actions: read
-  issues: write
-
-jobs:
-  nightly-monitor:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests
-
-      - name: Run Nightly Test Monitor
-        id: monitor
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          PYTHONUNBUFFERED: 1
-          PYTHONIOENCODING: utf-8
-        run: |
-          cd scripts/ci_monitor
-          python ci_analyzer.py \
-            --token $GITHUB_TOKEN \
-            --mode nightly \
-            --days ${{ inputs.days || '7' }} \
-            --output nightly_monitor_$(date +%Y%m%d_%H%M%S).json
-        continue-on-error: true
-
-      - name: Upload Monitor Results
-        uses: actions/upload-artifact@v4
-        with:
-          name: nightly-monitor-results-${{ github.run_number }}
-          path: |
-            scripts/ci_monitor/nightly_monitor_*.json
-          retention-days: 90
-
-      - name: Comment on Issues if Regressions Detected
-        if: steps.monitor.outcome == 'failure'
-        uses: actions/github-script@v7
-        with:
-          github-token: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          script: |
-            const fs = require('fs');
-            const path = require('path');
-
-            // Find the latest monitor output file
-            const files = fs.readdirSync('scripts/ci_monitor')
-              .filter(f => f.startsWith('nightly_monitor_') && f.endsWith('.json'))
-              .sort()
-              .reverse();
-
-            if (files.length === 0) {
-              console.log('No monitor output file found');
-              return;
-            }
-
-            const filePath = path.join('scripts/ci_monitor', files[0]);
-            const stats = JSON.parse(fs.readFileSync(filePath, 'utf8'));
-
-            // Create a summary of regressions
-            let regressionSummary = '## ⚠️ Nightly Test Regressions Detected\n\n';
-            regressionSummary += `**Report Date:** ${new Date().toISOString().split('T')[0]}\n`;
-            regressionSummary += `**Analysis Period:** Last ${{ inputs.days || '7' }} days\n\n`;
-            regressionSummary += `### Summary\n`;
-            regressionSummary += `- Total Runs: ${stats.total_runs}\n`;
-            regressionSummary += `- Failed Runs: ${stats.failed_runs} (${(stats.failed_runs/Math.max(1,stats.total_runs)*100).toFixed(1)}%)\n\n`;
-
-            regressionSummary += `### Jobs with High Failure Rates (>30%)\n\n`;
-            regressionSummary += `| Job Name | Failure Rate | Failures/Total |\n`;
-            regressionSummary += `|----------|--------------|----------------|\n`;
-
-            let hasRegressions = false;
-            for (const [jobName, jobStat] of Object.entries(stats.job_stats)) {
-              if (jobStat.total > 0) {
-                const failureRate = (jobStat.failure / jobStat.total) * 100;
-                if (failureRate > 30) {
-                  hasRegressions = true;
-                  regressionSummary += `| ${jobName} | ${failureRate.toFixed(1)}% | ${jobStat.failure}/${jobStat.total} |\n`;
-                }
-              }
-            }
-
-            if (!hasRegressions) {
-              console.log('No high failure rates found');
-              return;
-            }
-
-            regressionSummary += `\n[View detailed monitor results](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})`;
-
-            // Post as a comment on a tracking issue or create a new one
-            // For now, just log it (you can modify to create/update issues)
-            console.log(regressionSummary);
-            core.setOutput('regression_summary', regressionSummary);
diff --git a/.github/workflows/test-nightly-monitor.yml b/.github/workflows/test-nightly-monitor.yml
deleted file mode 100644
index 10f3c3c1bdd3..000000000000
--- a/.github/workflows/test-nightly-monitor.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-name: Test Nightly Monitor
-
-on:
-  workflow_dispatch:
-    inputs:
-      days:
-        description: 'Number of days to analyze'
-        required: false
-        default: '7'
-        type: string
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'scripts/ci_monitor/nightly_monitor.py'
-      - '.github/workflows/test-nightly-monitor.yml'
-
-permissions:
-  contents: write
-  actions: read
-
-jobs:
-  test-monitor:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests
-
-      - name: Run Nightly Test Monitor
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          PYTHONUNBUFFERED: 1
-          PYTHONIOENCODING: utf-8
-        run: |
-          cd scripts/ci_monitor
-          python ci_analyzer.py \
-            --token $GITHUB_TOKEN \
-            --mode nightly \
-            --days ${{ inputs.days || '7' }} \
-            --output test_monitor_$(date +%Y%m%d_%H%M%S).json
-
-      - name: Upload Test Results
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-monitor-results-${{ github.run_number }}
-          path: |
-            scripts/ci_monitor/test_monitor_*.json
-          retention-days: 7

From 82314a1066b9783ee211ab8373ab3b5a299cb301 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 15:33:22 -0800
Subject: [PATCH 20/31] Add nightly monitoring job to ci-monitor workflow

---
 .github/workflows/ci-monitor.yml | 37 ++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml
index cf75771b56f0..2f5d5da1f024 100644
--- a/.github/workflows/ci-monitor.yml
+++ b/.github/workflows/ci-monitor.yml
@@ -64,6 +64,43 @@ jobs:
             scripts/ci_monitor/performance_tables_*
           retention-days: 30
 
+  nightly-monitor:
+    needs: ci-monitor
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests
+
+      - name: Run Nightly Test Monitor
+        id: monitor
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          PYTHONUNBUFFERED: 1
+          PYTHONIOENCODING: utf-8
+        run: |
+          cd scripts/ci_monitor
+          python ci_analyzer.py --token $GITHUB_TOKEN --mode nightly --days 7 --output nightly_monitor_$(date +%Y%m%d_%H%M%S).json
+        continue-on-error: true
+
+      - name: Upload Nightly Monitor Results
+        uses: actions/upload-artifact@v4
+        with:
+          name: nightly-monitor-results-${{ github.run_number }}
+          path: |
+            scripts/ci_monitor/nightly_monitor_*.json
+          retention-days: 90
+
   ci-monitor-balance:
     needs: ci-monitor
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'

From d852d25120f0947ba2ce2c359629cdb4f2f9c401 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 16:12:16 -0800
Subject: [PATCH 21/31] Add performance metrics tracking to existing CI monitor

- Track throughput, latency, accuracy metrics from nightly test jobs
- Parse metrics from successful nightly job logs
- Detect performance trends (>10% changes)
- Display metrics table in GitHub Actions summary with trend indicators
- Save all metrics data in JSON artifacts for analysis
---
 .github/workflows/ci-monitor.yml  | 37 -------------
 scripts/ci_monitor/ci_analyzer.py | 91 +++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml
index 2f5d5da1f024..cf75771b56f0 100644
--- a/.github/workflows/ci-monitor.yml
+++ b/.github/workflows/ci-monitor.yml
@@ -64,43 +64,6 @@ jobs:
             scripts/ci_monitor/performance_tables_*
           retention-days: 30
 
-  nightly-monitor:
-    needs: ci-monitor
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.9'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests
-
-      - name: Run Nightly Test Monitor
-        id: monitor
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          PYTHONUNBUFFERED: 1
-          PYTHONIOENCODING: utf-8
-        run: |
-          cd scripts/ci_monitor
-          python ci_analyzer.py --token $GITHUB_TOKEN --mode nightly --days 7 --output nightly_monitor_$(date +%Y%m%d_%H%M%S).json
-        continue-on-error: true
-
-      - name: Upload Nightly Monitor Results
-        uses: actions/upload-artifact@v4
-        with:
-          name: nightly-monitor-results-${{ github.run_number }}
-          path: |
-            scripts/ci_monitor/nightly_monitor_*.json
-          retention-days: 90
-
   ci-monitor-balance:
     needs: ci-monitor
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 80505da590ab..8672a39f2ba5 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -179,6 +179,9 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                 list
             ),  # Store recent failure links for each job
             "job_last_success": {},  # Store last successful run for each job
+            "performance_metrics": defaultdict(
+                lambda: defaultdict(list)
+            ),  # Track performance metrics for nightly jobs
         }
 
         total_runs = len(runs)
@@ -269,6 +272,30 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                             "pr_info": pr_info,
                         }
 
+                        # Parse performance metrics from successful nightly jobs
+                        if job_name in job_categories["nightly"] and (
+                            "perf" in job_name.lower()
+                            or "accuracy" in job_name.lower()
+                            or "eval" in job_name.lower()
+                        ):
+                            job_id = job.get("id")
+                            logs = self.get_job_logs(job_id)
+                            if logs:
+                                metrics = self.parse_metrics_from_logs(logs, job_name)
+                                for metric_name, values in metrics.items():
+                                    if values:
+                                        for value in values:
+                                            stats["performance_metrics"][job_name][
+                                                metric_name
+                                            ].append(
+                                                {
+                                                    "value": value,
+                                                    "timestamp": created_at,
+                                                    "run_id": run_id,
+                                                    "run_url": run_url,
+                                                }
+                                            )
+
                     elif job_conclusion == "failure":
                         stats["job_failures"][job_name] += 1
 
@@ -562,6 +589,70 @@ def generate_github_summary(self, stats: Dict):
                     summary_lines.append(f"| {pattern} | {count} |")
                 summary_lines.append("")
 
+            # Performance metrics section for nightly jobs
+            if stats.get("performance_metrics"):
+                summary_lines.append("## Nightly Test Performance Metrics")
+                summary_lines.append("")
+                summary_lines.append(
+                    "| Job | Metric | Latest Value | Count | Trend |"
+                )
+                summary_lines.append("|-----|--------|--------------|-------|-------|")
+
+                for job_name in sorted(stats["performance_metrics"].keys()):
+                    job_metrics = stats["performance_metrics"][job_name]
+                    for metric_name in sorted(job_metrics.keys()):
+                        metric_data = job_metrics[metric_name]
+                        if metric_data:
+                            # Calculate average of recent values
+                            values = [m["value"] for m in metric_data]
+                            avg_value = sum(values) / len(values)
+                            count = len(values)
+
+                            # Simple trend: compare first half vs second half
+                            trend_indicator = "➡️"
+                            if len(values) >= 4:
+                                first_half = values[: len(values) // 2]
+                                second_half = values[len(values) // 2 :]
+                                first_avg = sum(first_half) / len(first_half)
+                                second_avg = sum(second_half) / len(second_half)
+
+                                if first_avg > 0:
+                                    change_pct = (
+                                        (second_avg - first_avg) / first_avg
+                                    ) * 100
+
+                                    # For throughput metrics, up is good
+                                    # For latency/ttft metrics, down is good
+                                    if "throughput" in metric_name.lower():
+                                        if change_pct > 10:
+                                            trend_indicator = f"📈 +{change_pct:.1f}%"
+                                        elif change_pct < -10:
+                                            trend_indicator = (
+                                                f"⚠️ 📉 {change_pct:.1f}%"
+                                            )
+                                        else:
+                                            trend_indicator = f"➡️ {change_pct:+.1f}%"
+                                    elif (
+                                        "latency" in metric_name.lower()
+                                        or "ttft" in metric_name.lower()
+                                    ):
+                                        if change_pct < -10:
+                                            trend_indicator = f"📈 {change_pct:.1f}%"
+                                        elif change_pct > 10:
+                                            trend_indicator = (
+                                                f"⚠️ 📉 +{change_pct:.1f}%"
+                                            )
+                                        else:
+                                            trend_indicator = f"➡️ {change_pct:+.1f}%"
+                                    else:
+                                        trend_indicator = f"➡️ {change_pct:+.1f}%"
+
+                            summary_lines.append(
+                                f"| {job_name} | {metric_name} | {avg_value:.2f} | {count} | {trend_indicator} |"
+                            )
+
+                summary_lines.append("")
+
             with open(github_step_summary, "w", encoding="utf-8") as f:
                 f.write("\n".join(summary_lines))
                 f.write("\n\n---\n\n")

From e1851139dcef0cd0cf7e7b5701a7a6fec55a1106 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 16:23:36 -0800
Subject: [PATCH 22/31] Apply black formatting

---
 scripts/ci_monitor/ci_analyzer.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 8672a39f2ba5..385b99b3b0be 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -593,9 +593,7 @@ def generate_github_summary(self, stats: Dict):
             if stats.get("performance_metrics"):
                 summary_lines.append("## Nightly Test Performance Metrics")
                 summary_lines.append("")
-                summary_lines.append(
-                    "| Job | Metric | Latest Value | Count | Trend |"
-                )
+                summary_lines.append("| Job | Metric | Latest Value | Count | Trend |")
                 summary_lines.append("|-----|--------|--------------|-------|-------|")
 
                 for job_name in sorted(stats["performance_metrics"].keys()):
@@ -627,9 +625,7 @@ def generate_github_summary(self, stats: Dict):
                                         if change_pct > 10:
                                             trend_indicator = f"📈 +{change_pct:.1f}%"
                                         elif change_pct < -10:
-                                            trend_indicator = (
-                                                f"⚠️ 📉 {change_pct:.1f}%"
-                                            )
+                                            trend_indicator = f"⚠️ 📉 {change_pct:.1f}%"
                                         else:
                                             trend_indicator = f"➡️ {change_pct:+.1f}%"
                                     elif (
@@ -639,9 +635,7 @@ def generate_github_summary(self, stats: Dict):
                                         if change_pct < -10:
                                             trend_indicator = f"📈 {change_pct:.1f}%"
                                         elif change_pct > 10:
-                                            trend_indicator = (
-                                                f"⚠️ 📉 +{change_pct:.1f}%"
-                                            )
+                                            trend_indicator = f"⚠️ 📉 +{change_pct:.1f}%"
                                         else:
                                             trend_indicator = f"➡️ {change_pct:+.1f}%"
                                     else:

From 9f8a9679b8014adc013a4460b61708d1f5951a21 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 16:56:15 -0800
Subject: [PATCH 23/31] Fix 403 error by not filtering by branch by default

The GitHub API requires higher permissions when filtering by branch.
Change default from 'main' to None to avoid 403 Forbidden errors.
---
 scripts/ci_monitor/ci_analyzer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 385b99b3b0be..513786637adc 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -1060,8 +1060,8 @@ def main():
     )
     parser.add_argument(
         "--branch",
-        default="main",
-        help="Filter runs by branch (default: 'main'). Set to empty string '' to analyze all branches.",
+        default=None,
+        help="Filter runs by branch (default: None - all branches). Specify branch name to filter.",
     )
 
     args = parser.parse_args()

From ace451583754fc540e17cf6f5220a4ed731cc3c6 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 17:03:27 -0800
Subject: [PATCH 24/31] Use default GITHUB_TOKEN instead of custom PAT

The custom PAT may not be available on non-main branches.
The default GITHUB_TOKEN has Actions: read permission.
---
 .github/workflows/ci-monitor.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml
index cf75771b56f0..8c67ba3c995d 100644
--- a/.github/workflows/ci-monitor.yml
+++ b/.github/workflows/ci-monitor.yml
@@ -39,7 +39,7 @@ jobs:
 
       - name: Run CI Analysis
         env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |
@@ -48,7 +48,7 @@ jobs:
 
       - name: Run Performance Analysis
         env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |
@@ -84,7 +84,7 @@ jobs:
 
       - name: Run Test Balance Analysis
         env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |

From 3047f7e6125b3fdb643a1a5f28db56b1927c1a69 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Tue, 18 Nov 2025 19:19:03 -0800
Subject: [PATCH 25/31] Revert to using GH_PAT_FOR_NIGHTLY_CI_DATA for all
 steps

Match the main branch configuration. The PAT works on the main branch;
the issue is likely related to secret access on non-main branches.
---
 .github/workflows/ci-monitor.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml
index 8c67ba3c995d..cf75771b56f0 100644
--- a/.github/workflows/ci-monitor.yml
+++ b/.github/workflows/ci-monitor.yml
@@ -39,7 +39,7 @@ jobs:
 
       - name: Run CI Analysis
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |
@@ -48,7 +48,7 @@ jobs:
 
       - name: Run Performance Analysis
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |
@@ -84,7 +84,7 @@ jobs:
 
       - name: Run Test Balance Analysis
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
           PYTHONUNBUFFERED: 1
           PYTHONIOENCODING: utf-8
         run: |

From 47d473f2f5de5aac2a888ad5cb5a7525092214de Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 19 Nov 2025 11:53:48 -0800
Subject: [PATCH 26/31] Both regular CI analysis and nightly analysis now run
 together

---
 .github/workflows/ci-monitor.yml  | 10 ++++++++++
 scripts/ci_monitor/ci_analyzer.py |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml
index cf75771b56f0..28a198a32a58 100644
--- a/.github/workflows/ci-monitor.yml
+++ b/.github/workflows/ci-monitor.yml
@@ -46,6 +46,15 @@ jobs:
           cd scripts/ci_monitor
           python ci_analyzer.py --token $GITHUB_TOKEN --limit ${{ inputs.limit || '1000' }} --output ci_analysis_$(date +%Y%m%d_%H%M%S).json
 
+      - name: Run Nightly Test Analysis
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          PYTHONUNBUFFERED: 1
+          PYTHONIOENCODING: utf-8
+        run: |
+          cd scripts/ci_monitor
+          python ci_analyzer.py --token $GITHUB_TOKEN --mode nightly --days 2 --output nightly_analysis_$(date +%Y%m%d_%H%M%S).json
+
       - name: Run Performance Analysis
         env:
           GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
@@ -61,6 +70,7 @@ jobs:
           name: ci-analysis-results-${{ github.run_number }}
           path: |
             scripts/ci_monitor/ci_analysis_*.json
+            scripts/ci_monitor/nightly_analysis_*.json
             scripts/ci_monitor/performance_tables_*
           retention-days: 30
 
diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 513786637adc..c59e31bdbc76 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -1051,8 +1051,8 @@ def main():
     parser.add_argument(
         "--days",
         type=int,
-        default=7,
-        help="Number of days to analyze (for nightly mode, default: 7)",
+        default=2,
+        help="Number of days to analyze (for nightly mode, default: 2)",
     )
     parser.add_argument(
         "--output",

From dd7692424aaaff72cd4a74572181003a2e3b5a8e Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 19 Nov 2025 12:08:09 -0800
Subject: [PATCH 27/31] Remove deprecated nightly-test.yml workflow and old job
 names from CI analyzer

Stop tracking old nightly jobs that have been replaced by hardware-specific workflows:
- Removed nightly-test.yml from monitored workflows
- Removed old job names (nightly-test-eval-text-models, nightly-test-perf-text-models, etc.)
- Now only tracking new NVIDIA/AMD/Intel specific job names
---
 scripts/ci_monitor/ci_analyzer.py | 38 +++----------------------------
 1 file changed, 3 insertions(+), 35 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index c59e31bdbc76..ce6612d421e9 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -30,7 +30,6 @@ def __init__(self, token: str):
 
         # Nightly workflow files to monitor
         self.nightly_workflows = [
-            "nightly-test.yml",
             "nightly-test-nvidia.yml",
             "nightly-test-amd.yml",
             "nightly-test-intel.yml",
@@ -130,16 +129,7 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                 "per-commit-8-gpu-h20",
             ],
             "nightly": [
-                # Old job names (nightly-test.yml)
-                "nightly-test-perf-text-models",
-                "nightly-test-eval-text-models",
-                "nightly-test-1-gpu",
-                "nightly-test-4-gpu",
-                "nightly-test-8-gpu-h200",
-                "nightly-test-8-gpu-h20",
-                "nightly-test-4-gpu-b200",
-                "nightly-test-8-gpu-b200",
-                # New NVIDIA job names (nightly-test-nvidia.yml)
+                # NVIDIA job names (nightly-test-nvidia.yml)
                 "nightly-test-general-1-gpu-runner",
                 "nightly-test-general-4-gpu-h100",
                 "nightly-test-general-8-gpu-h200",
@@ -237,18 +227,7 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
                     "unit-test-backend-4-gpu-b200",
                     "unit-test-backend-4-gpu-gb200",
                     "quantization-test",
-                    # Old nightly job names (nightly-test.yml)
-                    "nightly-test-eval-text-models",
-                    "nightly-test-perf-text-models",
-                    "nightly-test-eval-vlms",
-                    "nightly-test-perf-vlms",
-                    "nightly-test-1-gpu",
-                    "nightly-test-4-gpu",
-                    "nightly-test-8-gpu-h200",
-                    "nightly-test-8-gpu-h20",
-                    "nightly-test-4-gpu-b200",
-                    "nightly-test-8-gpu-b200",
-                    # New NVIDIA job names (nightly-test-nvidia.yml)
+                    # NVIDIA job names (nightly-test-nvidia.yml)
                     "nightly-test-general-1-gpu-runner",
                     "nightly-test-general-4-gpu-h100",
                     "nightly-test-general-8-gpu-h200",
@@ -743,7 +722,7 @@ def analyze_nightly_with_metrics(self, runs: List[Dict]) -> Dict:
 
         # Get nightly job names from the existing job categories
         nightly_jobs = [
-            # New NVIDIA job names (nightly-test-nvidia.yml)
+            # NVIDIA job names (nightly-test-nvidia.yml)
             "nightly-test-general-1-gpu-runner",
             "nightly-test-general-4-gpu-h100",
             "nightly-test-general-8-gpu-h200",
@@ -758,17 +737,6 @@ def analyze_nightly_with_metrics(self, runs: List[Dict]) -> Dict:
             "nightly-test",
             # Intel job names (nightly-test-intel.yml)
             "placeholder",
-            # Old job names (kept for backwards compatibility)
-            "nightly-test-eval-text-models",
-            "nightly-test-perf-text-models",
-            "nightly-test-eval-vlms",
-            "nightly-test-perf-vlms",
-            "nightly-test-1-gpu",
-            "nightly-test-4-gpu",
-            "nightly-test-8-gpu-h200",
-            "nightly-test-8-gpu-h20",
-            "nightly-test-4-gpu-b200",
-            "nightly-test-8-gpu-b200",
         ]
 
         stats = {

From d2088327ecaa9b2f3f71e53fd78bf121c5c9f802 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 19 Nov 2025 12:17:45 -0800
Subject: [PATCH 28/31] Limit nightly runs fetching to avoid excessive API
 calls

Nightly tests run once per day, so fetching up to 1000 runs per workflow was excessive.
Fetching is now capped at roughly (days * 5) runs per workflow, and the page size is reduced from 100 to 10.
For the new default of 2 days, that is ~10 runs per workflow instead of 1000.
---
 scripts/ci_monitor/ci_analyzer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index ce6612d421e9..35c8f4c7177b 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -635,7 +635,7 @@ def generate_github_summary(self, stats: Dict):
         except Exception as e:
             print(f"Failed to generate GitHub Actions summary: {e}")
 
-    def get_nightly_runs(self, days: int = 7) -> List[Dict]:
+    def get_nightly_runs(self, days: int = 2) -> List[Dict]:
         """Get nightly test workflow runs from the last N days"""
         print(f"Fetching nightly test runs from the last {days} days...")
 
@@ -645,10 +645,11 @@ def get_nightly_runs(self, days: int = 7) -> List[Dict]:
         for workflow_file in self.nightly_workflows:
             print(f"  Fetching from {workflow_file}...")
             page = 1
-            per_page = 100
+            per_page = 10  # Nightly runs once per day, so 10 runs cover ~10 days max
             workflow_runs = []
+            max_runs_per_workflow = days * 5  # Allow up to 5 runs per day per workflow
 
-            while True:
+            while len(workflow_runs) < max_runs_per_workflow:
                 url = f"{self.base_url}/repos/{self.repo}/actions/runs"
                 params = {
                     "workflow_id": workflow_file,

From fa441d91b7c91b035548af5606013da3254f664a Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 19 Nov 2025 12:30:34 -0800
Subject: [PATCH 29/31] Add GitHub Actions summary for nightly test analysis

The nightly analysis now also appears in the GitHub Actions step summary, showing:
- Overall statistics with a daily trends table
- Per-job statistics with performance metrics
- Recent failures for each job

The daily trends table lists, for each date, the total runs, successes, failures, and success rate for easy tracking.
---
 scripts/ci_monitor/ci_analyzer.py | 102 ++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 35c8f4c7177b..703848114256 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -948,6 +948,107 @@ def generate_nightly_report(self, stats: Dict, output_file: str = None):
                 json.dump(stats, f, indent=2, default=str)
             print(f"\nDetailed stats saved to: {output_file}")
 
+    def generate_nightly_github_summary(self, stats: Dict):
+        """Generate GitHub Actions summary for nightly test analysis"""
+        try:
+            github_step_summary = os.environ.get("GITHUB_STEP_SUMMARY")
+            if not github_step_summary:
+                print("Not running in GitHub Actions, skipping nightly summary generation")
+                return
+
+            print("Generating GitHub Actions summary for Nightly Analysis...")
+
+            summary_lines = []
+            summary_lines.append("# Nightly Test Monitor Report")
+            summary_lines.append("")
+            summary_lines.append(f"**Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+            summary_lines.append("")
+
+            # Overall statistics
+            total = stats["total_runs"]
+            success = stats["successful_runs"]
+            failed = stats["failed_runs"]
+            cancelled = stats["cancelled_runs"]
+
+            summary_lines.append("## Overall Statistics")
+            summary_lines.append("")
+            summary_lines.append("| Metric | Count | Percentage |")
+            summary_lines.append("|--------|-------|------------|")
+            summary_lines.append(f"| Total Runs | {total} | 100% |")
+            summary_lines.append(f"| Successful | {success} | {success/max(1,total)*100:.1f}% |")
+            summary_lines.append(f"| Failed | {failed} | {failed/max(1,total)*100:.1f}% |")
+            summary_lines.append(f"| Cancelled | {cancelled} | {cancelled/max(1,total)*100:.1f}% |")
+            summary_lines.append("")
+
+            # Daily trends
+            summary_lines.append("## Daily Trends")
+            summary_lines.append("")
+            summary_lines.append("| Date | Total Runs | Success | Failed | Success Rate |")
+            summary_lines.append("|------|------------|---------|--------|--------------|")
+
+            daily_stats = sorted(stats["daily_stats"].items(), reverse=True)[:7]
+            for date, day_stats in daily_stats:
+                success_rate = (day_stats["success"] / max(1, day_stats["total"])) * 100
+                summary_lines.append(
+                    f"| {date} | {day_stats['total']} | {day_stats['success']} | "
+                    f"{day_stats['failure']} | {success_rate:.1f}% |"
+                )
+            summary_lines.append("")
+
+            # Job statistics with performance metrics
+            if stats["job_stats"]:
+                summary_lines.append("## Job Statistics")
+                summary_lines.append("")
+
+                job_stats_sorted = sorted(
+                    stats["job_stats"].items(), key=lambda x: x[1]["failure"], reverse=True
+                )
+
+                for job_name, job_stat in job_stats_sorted:
+                    total_job = job_stat["total"]
+                    success_job = job_stat["success"]
+                    failure_job = job_stat["failure"]
+                    success_rate_job = (success_job / max(1, total_job)) * 100
+                    avg_duration = job_stat["avg_duration_minutes"]
+
+                    summary_lines.append(f"### {job_name}")
+                    summary_lines.append("")
+                    summary_lines.append(
+                        f"**Stats:** {total_job} runs | {success_job} success ({success_rate_job:.1f}%) | "
+                        f"{failure_job} failed | Avg duration: {avg_duration:.1f}m"
+                    )
+                    summary_lines.append("")
+
+                    # Performance metrics
+                    if job_stat.get("performance_metrics"):
+                        summary_lines.append("**Performance Metrics:**")
+                        summary_lines.append("")
+                        summary_lines.append("| Metric | Avg Value | Samples |")
+                        summary_lines.append("|--------|-----------|---------|")
+
+                        for metric_name, metric_data in job_stat["performance_metrics"].items():
+                            if metric_data:
+                                values = [m["value"] for m in metric_data]
+                                avg_value = sum(values) / len(values)
+                                summary_lines.append(f"| {metric_name} | {avg_value:.2f} | {len(values)} |")
+                        summary_lines.append("")
+
+                    # Recent failures
+                    if job_stat["recent_failures"]:
+                        summary_lines.append("**Recent Failures:**")
+                        for failure in job_stat["recent_failures"][:3]:
+                            summary_lines.append(f"- [Run #{failure['run_number']}]({failure['run_url']})")
+                        summary_lines.append("")
+
+            with open(github_step_summary, "a", encoding="utf-8") as f:
+                f.write("\n".join(summary_lines))
+                f.write("\n\n---\n\n")
+
+            print("GitHub Actions nightly summary generated successfully")
+
+        except Exception as e:
+            print(f"Failed to generate nightly GitHub Actions summary: {e}")
+
     def detect_nightly_regressions(self, stats: Dict) -> List[Dict]:
         """Detect regressions in nightly tests"""
         regressions = []
@@ -1048,6 +1149,7 @@ def main():
 
             stats = analyzer.analyze_nightly_with_metrics(runs)
             analyzer.generate_nightly_report(stats, args.output)
+            analyzer.generate_nightly_github_summary(stats)
             regressions = analyzer.detect_nightly_regressions(stats)
 
             # Exit with error code if regressions detected

From 0da20889e0aa15e812ecdec87bda5911df93f273 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 19 Nov 2025 12:36:25 -0800
Subject: [PATCH 30/31] Fix lint errors - break long lines in nightly summary
 generation

---
 scripts/ci_monitor/ci_analyzer.py | 36 +++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index 703848114256..de02f7531a35 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -975,16 +975,26 @@ def generate_nightly_github_summary(self, stats: Dict):
             summary_lines.append("| Metric | Count | Percentage |")
             summary_lines.append("|--------|-------|------------|")
             summary_lines.append(f"| Total Runs | {total} | 100% |")
-            summary_lines.append(f"| Successful | {success} | {success/max(1,total)*100:.1f}% |")
-            summary_lines.append(f"| Failed | {failed} | {failed/max(1,total)*100:.1f}% |")
-            summary_lines.append(f"| Cancelled | {cancelled} | {cancelled/max(1,total)*100:.1f}% |")
+            summary_lines.append(
+                f"| Successful | {success} | {success/max(1,total)*100:.1f}% |"
+            )
+            summary_lines.append(
+                f"| Failed | {failed} | {failed/max(1,total)*100:.1f}% |"
+            )
+            summary_lines.append(
+                f"| Cancelled | {cancelled} | {cancelled/max(1,total)*100:.1f}% |"
+            )
             summary_lines.append("")
 
             # Daily trends
             summary_lines.append("## Daily Trends")
             summary_lines.append("")
-            summary_lines.append("| Date | Total Runs | Success | Failed | Success Rate |")
-            summary_lines.append("|------|------------|---------|--------|--------------|")
+            summary_lines.append(
+                "| Date | Total Runs | Success | Failed | Success Rate |"
+            )
+            summary_lines.append(
+                "|------|------------|---------|--------|--------------|"
+            )
 
             daily_stats = sorted(stats["daily_stats"].items(), reverse=True)[:7]
             for date, day_stats in daily_stats:
@@ -1001,7 +1011,9 @@ def generate_nightly_github_summary(self, stats: Dict):
                 summary_lines.append("")
 
                 job_stats_sorted = sorted(
-                    stats["job_stats"].items(), key=lambda x: x[1]["failure"], reverse=True
+                    stats["job_stats"].items(),
+                    key=lambda x: x[1]["failure"],
+                    reverse=True,
                 )
 
                 for job_name, job_stat in job_stats_sorted:
@@ -1026,18 +1038,24 @@ def generate_nightly_github_summary(self, stats: Dict):
                         summary_lines.append("| Metric | Avg Value | Samples |")
                         summary_lines.append("|--------|-----------|---------|")
 
-                        for metric_name, metric_data in job_stat["performance_metrics"].items():
+                        for metric_name, metric_data in job_stat[
+                            "performance_metrics"
+                        ].items():
                             if metric_data:
                                 values = [m["value"] for m in metric_data]
                                 avg_value = sum(values) / len(values)
-                                summary_lines.append(f"| {metric_name} | {avg_value:.2f} | {len(values)} |")
+                                summary_lines.append(
+                                    f"| {metric_name} | {avg_value:.2f} | {len(values)} |"
+                                )
                         summary_lines.append("")
 
                     # Recent failures
                     if job_stat["recent_failures"]:
                         summary_lines.append("**Recent Failures:**")
                         for failure in job_stat["recent_failures"][:3]:
-                            summary_lines.append(f"- [Run #{failure['run_number']}]({failure['run_url']})")
+                            summary_lines.append(
+                                f"- [Run #{failure['run_number']}]({failure['run_url']})"
+                            )
                         summary_lines.append("")
 
             with open(github_step_summary, "a", encoding="utf-8") as f:

From 9882656dd81c117c0f07b9d7970a24b9343b7c68 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 19 Nov 2025 12:39:20 -0800
Subject: [PATCH 31/31] Apply black formatting

---
 scripts/ci_monitor/ci_analyzer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py
index de02f7531a35..63474be90675 100755
--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -953,7 +953,9 @@ def generate_nightly_github_summary(self, stats: Dict):
         try:
             github_step_summary = os.environ.get("GITHUB_STEP_SUMMARY")
             if not github_step_summary:
-                print("Not running in GitHub Actions, skipping nightly summary generation")
+                print(
+                    "Not running in GitHub Actions, skipping nightly summary generation"
+                )
                 return
 
             print("Generating GitHub Actions summary for Nightly Analysis...")
@@ -961,7 +963,9 @@ def generate_nightly_github_summary(self, stats: Dict):
             summary_lines = []
             summary_lines.append("# Nightly Test Monitor Report")
             summary_lines.append("")
-            summary_lines.append(f"**Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+            summary_lines.append(
+                f"**Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+            )
             summary_lines.append("")
 
             # Overall statistics