Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions ci/eval/compare/cmp-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import json
import os
from scipy.stats import ttest_rel
import pandas as pd
import numpy as np
from pathlib import Path

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
    """
    Flatten one level of nesting out of a stats JSON document.

    The eval stats JSON nests related counters under a common key, e.g.:

        "gc": {"cycles": 13, "heapSize": 5404549120, "totalBytes": 9545876464}

    which becomes:

        "gc.cycles": 13
        "gc.heapSize": 5404549120
        ...

    Top-level numeric values are kept as-is; top-level values that are
    neither numbers nor dicts are dropped.

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flattened = {}
    for key, value in json_data.items():
        if isinstance(value, dict):
            # One level deep only: join parent and child key with a dot.
            flattened.update(
                {f"{key}.{inner_key}": inner_value for inner_key, inner_value in value.items()}
            )
        elif isinstance(value, (int, float)):
            flattened[key] = value
    return flattened




def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    The directory is expected to contain one sub-directory per system
    (e.g. "x86_64-linux"), each holding per-chunk JSON stats files.

    Args:
        directory (Path): Directory containing per-system stats directories.
    Returns:
        dict: Dictionary keyed by "<system>/<chunk-file>" with the
            flattened metrics of that chunk as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        # Every entry under the stats directory should be a per-system dir.
        assert system_dir.is_dir()

        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
            # Bug fix: the key previously used "${chunk_output.name}", which
            # left a literal "$" in every key (shell-style interpolation does
            # not apply inside f-strings) and broke before/after key matching.
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)

    return metrics

def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """
    Render a DataFrame as a GitHub-flavoured Markdown table.

    NaN floats and exact-zero floats are shown as "-" (no data / no change);
    all other floats are formatted with four decimal places.

    Args:
        df (pd.DataFrame): Table to render.
    Returns:
        str: Markdown table, starting with a leading newline.
    """
    # Header row (leading newline so the table is separated from prior text),
    # followed by the column separator line.
    lines = ['\n| ' + ' | '.join(df.columns) + ' |']
    lines.append("| - " * len(df.columns) + "|")

    for _, row in df.iterrows():
        # TODO: define threshold for highlighting
        highlight = False

        def decorate(text: str) -> str:
            return f"**{text}**" if highlight else f"{text}"

        cells = []
        for value in row:
            is_float = isinstance(value, float)
            if is_float and np.isnan(value):
                # NaN p-value / t-stat: render a placeholder instead.
                cells.append("-")
            elif is_float and value == 0:
                # Exactly zero (e.g. mean_diff == 0) means "no change".
                cells.append("-")
            else:
                cells.append(decorate(f"{value:.4f}" if is_float else str(value)))

        lines.append('| ' + ' | '.join(cells) + ' |')

    return '\n'.join(lines)


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    """
    Run a paired t-test per metric over the files common to both runs.

    For every metric key seen in ``before_metrics``, the values of all files
    present in both runs (and carrying that key in both) are paired up and
    compared with ``scipy.stats.ttest_rel``. Metrics with fewer than two
    pairs are skipped, since a paired t-test is undefined for them.

    Args:
        before_metrics (dict): ``{filename: {metric: value}}`` for the base run.
        after_metrics (dict): ``{filename: {metric: value}}`` for the new run.
    Returns:
        pd.DataFrame: One row per metric with mean before/after, mean diff,
            mean percentage change, p-value and t-statistic, sorted by
            p-value ascending. Empty (but with the full column schema) when
            no metric has enough paired samples.
    """
    columns = ["metric", "mean_before", "mean_after", "mean_diff",
               "mean_%_change", "p_value", "t_stat"]

    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({metric_key for file_metrics in before_metrics.values() for metric_key in file_metrics.keys()})

    results = []

    for key in all_keys:
        before_vals, after_vals = [], []

        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        # ttest_rel needs at least two pairs to compute a variance.
        if len(before_vals) < 2:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr
        # A before-value of 0 yields inf/nan here; suppress the numpy
        # warning (the markdown renderer shows NaN as "-").
        with np.errstate(divide="ignore", invalid="ignore"):
            pct_change = 100 * diff / before_arr
        t_stat, p_val = ttest_rel(after_arr, before_arr)

        results.append({
            "metric": key,
            "mean_before": np.mean(before_arr),
            "mean_after": np.mean(after_arr),
            "mean_diff": np.mean(diff),
            "mean_%_change": np.mean(pct_change),
            "p_value": p_val,
            "t_stat": t_stat
        })

    if not results:
        # Bug fix: sort_values("p_value") on an empty frame raises KeyError;
        # return an empty frame that still carries the expected schema.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(results).sort_values("p_value")


if __name__ == "__main__":
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")

    if not before_dir or not after_dir:
        # Bug fix: the error was print()ed to stdout, which the caller
        # redirects into the markdown step summary — so the error message
        # would end up in the report instead of the build log. SystemExit
        # with a string prints to stderr and exits with status 1. The bare
        # exit() builtin is also a site-module helper not guaranteed to
        # exist (e.g. under `python -S`).
        raise SystemExit("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")

    before_metrics = load_all_metrics(Path(before_dir) / "stats")
    after_metrics = load_all_metrics(Path(after_dir) / "stats")

    comparison = perform_pairwise_tests(before_metrics, after_metrics)
    print(dataframe_to_markdown(comparison))
50 changes: 46 additions & 4 deletions ci/eval/compare/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
jq,
runCommand,
writeText,
python3,
...
}:
{
Expand Down Expand Up @@ -125,18 +126,59 @@ let
in
runCommand "compare"
{
nativeBuildInputs = [ jq ];
nativeBuildInputs = [
jq
(python3.withPackages (
ps: with ps; [
numpy
pandas
scipy
]
))

];
maintainers = builtins.toJSON maintainers;
passAsFile = [ "maintainers" ];
env = {
BEFORE_DIR = "${beforeResultDir}";
AFTER_DIR = "${afterResultDir}";
};
}
''
mkdir $out

cp ${changed-paths} $out/changed-paths.json

jq -r -f ${./generate-step-summary.jq} < ${changed-paths} > $out/step-summary.md

cp "$maintainersPath" "$out/maintainers.json"
if jq -e '(.attrdiff.added | length == 0) and (.attrdiff.removed | length == 0)' "${changed-paths}" > /dev/null; then
# Package chunks are identical between the two revisions
# so we can generate a performance comparison
{
echo
echo "# Performance comparison"
echo
echo "This compares the performance of this branch against its pull request base branch (e.g., 'master')"
echo
echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
echo
} >> $out/step-summary.md

python3 ${./cmp-stats.py} >> $out/step-summary.md

# TODO: Compare eval stats
else
# Package chunks have changed between the two revisions
# so we cannot generate a meaningful performance comparison
{
echo
echo "# Performance Comparison"
echo
echo "Performance stats were skipped because the package sets differ between the two revisions."
echo
echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
} >> $out/step-summary.md
fi

jq -r -f ${./generate-step-summary.jq} < ${changed-paths} >> $out/step-summary.md

cp "$maintainersPath" "$out/maintainers.json"
''
2 changes: 2 additions & 0 deletions ci/eval/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
nixVersions,
jq,
sta,
python3,
}:

let
Expand Down Expand Up @@ -270,6 +271,7 @@ let
runCommand
writeText
supportedSystems
python3
;
};

Expand Down