diff --git a/ci/eval/compare/cmp-stats.py b/ci/eval/compare/cmp-stats.py
deleted file mode 100644
index ffc9026ca72e7..0000000000000
--- a/ci/eval/compare/cmp-stats.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import json
-import os
-from scipy.stats import ttest_rel
-import pandas as pd
-import numpy as np
-from pathlib import Path
-
-# Define metrics of interest (can be expanded as needed)
-METRIC_PREFIXES = ("nr", "gc")
-
-def flatten_data(json_data: dict) -> dict:
-    """
-    Extracts and flattens metrics from JSON data.
-    This is needed because the JSON data can be nested.
-    For example, the JSON data entry might look like this:
-
-    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}
-
-    Flattened:
-
-    "gc.cycles": 13
-    "gc.heapSize": 5404549120
-    ...
-
-    Args:
-        json_data (dict): JSON data containing metrics.
-    Returns:
-        dict: Flattened metrics with keys as metric names.
-    """
-    flat_metrics = {}
-    for k, v in json_data.items():
-        if isinstance(v, (int, float)):
-            flat_metrics[k] = v
-        elif isinstance(v, dict):
-            for sub_k, sub_v in v.items():
-                flat_metrics[f"{k}.{sub_k}"] = sub_v
-    return flat_metrics
-
-
-
-
-def load_all_metrics(directory: Path) -> dict:
-    """
-    Loads all stats JSON files in the specified directory and extracts metrics.
-
-    Args:
-        directory (Path): Directory containing JSON files.
-    Returns:
-        dict: Dictionary with filenames as keys and extracted metrics as values.
-    """
-    metrics = {}
-    for system_dir in directory.iterdir():
-        assert system_dir.is_dir()
-
-        for chunk_output in system_dir.iterdir():
-            with chunk_output.open() as f:
-                data = json.load(f)
-                metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data)
-
-    return metrics
-
-def dataframe_to_markdown(df: pd.DataFrame) -> str:
-    markdown_lines = []
-
-    # Header (get column names and format them)
-    header = '\n| ' + ' | '.join(df.columns) + ' |'
-    markdown_lines.append(header)
-    markdown_lines.append("| - " * (len(df.columns)) + "|")  # Separator line
-
-    # Iterate over rows to build Markdown rows
-    for _, row in df.iterrows():
-        # TODO: define threshold for highlighting
-        highlight = False
-
-        fmt = lambda x: f"**{x}**" if highlight else f"{x}"
-
-        # Check for no change and NaN in p_value/t_stat
-        row_values = []
-        for val in row:
-            if isinstance(val, float) and np.isnan(val):  # For NaN values in p-value or t-stat
-                row_values.append("-")  # Custom symbol for NaN
-            elif isinstance(val, float) and val == 0:  # For no change (mean_diff == 0)
-                row_values.append("-")  # Custom symbol for no change
-            else:
-                row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))
-
-        markdown_lines.append('| ' + ' | '.join(row_values) + ' |')
-
-    return '\n'.join(markdown_lines)
-
-
-def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
-    common_files = sorted(set(before_metrics) & set(after_metrics))
-    all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })
-
-    results = []
-
-    for key in all_keys:
-        before_vals, after_vals = [], []
-
-        for fname in common_files:
-            if key in before_metrics[fname] and key in after_metrics[fname]:
-                before_vals.append(before_metrics[fname][key])
-                after_vals.append(after_metrics[fname][key])
-
-        if len(before_vals) >= 2:
-            before_arr = np.array(before_vals)
-            after_arr = np.array(after_vals)
-
-            diff = after_arr - before_arr
-            pct_change = 100 * diff / before_arr
-            t_stat, p_val = ttest_rel(after_arr, before_arr)
-
-            results.append({
-                "metric": key,
-                "mean_before": np.mean(before_arr),
-                "mean_after": np.mean(after_arr),
-                "mean_diff": np.mean(diff),
-                "mean_%_change": np.mean(pct_change),
-                "p_value": p_val,
-                "t_stat": t_stat
-            })
-
-    df = pd.DataFrame(results).sort_values("p_value")
-    return df
-
-
-if __name__ == "__main__":
-    before_dir = os.environ.get("BEFORE_DIR")
-    after_dir = os.environ.get("AFTER_DIR")
-
-    if not before_dir or not after_dir:
-        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
-        exit(1)
-
-    before_metrics = load_all_metrics(Path(before_dir) / "stats")
-    after_metrics = load_all_metrics(Path(after_dir) / "stats")
-
-    df1 = perform_pairwise_tests(before_metrics, after_metrics)
-    markdown_table = dataframe_to_markdown(df1)
-    print(markdown_table)
diff --git a/ci/eval/compare/default.nix b/ci/eval/compare/default.nix
index 7b677c6d01f3c..04676476f4b2a 100644
--- a/ci/eval/compare/default.nix
+++ b/ci/eval/compare/default.nix
@@ -3,7 +3,6 @@
   jq,
   runCommand,
   writeText,
-  python3,
   ...
 }:
 {
@@ -126,59 +125,18 @@ let
 in
 runCommand "compare"
   {
-    nativeBuildInputs = [
-      jq
-      (python3.withPackages (
-        ps: with ps; [
-          numpy
-          pandas
-          scipy
-        ]
-      ))
-
-    ];
+    nativeBuildInputs = [ jq ];
     maintainers = builtins.toJSON maintainers;
     passAsFile = [ "maintainers" ];
-    env = {
-      BEFORE_DIR = "${beforeResultDir}";
-      AFTER_DIR = "${afterResultDir}";
-    };
   }
   ''
     mkdir $out
 
     cp ${changed-paths} $out/changed-paths.json
-
-    if jq -e '(.attrdiff.added | length == 0) and (.attrdiff.removed | length == 0)' "${changed-paths}" > /dev/null; then
-      # Chunks have changed between revisions
-      # We cannot generate a performance comparison
-      {
-        echo
-        echo "# Performance comparison"
-        echo
-        echo "This compares the performance of this branch against its pull request base branch (e.g., 'master')"
-        echo
-        echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
-        echo
-      } >> $out/step-summary.md
-
-      python3 ${./cmp-stats.py} >> $out/step-summary.md
-
-    else
-      # Package chunks are the same in both revisions
-      # We can use the to generate a performance comparison
-      {
-        echo
-        echo "# Performance Comparison"
-        echo
-        echo "Performance stats were skipped because the package sets differ between the two revisions."
-        echo
-        echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
-      } >> $out/step-summary.md
-    fi
-
-    jq -r -f ${./generate-step-summary.jq} < ${changed-paths} >> $out/step-summary.md
+    jq -r -f ${./generate-step-summary.jq} < ${changed-paths} > $out/step-summary.md
 
     cp "$maintainersPath" "$out/maintainers.json"
+
+    # TODO: Compare eval stats
   ''
diff --git a/ci/eval/default.nix b/ci/eval/default.nix
index 639e75ec4211f..8537084b1bd5e 100644
--- a/ci/eval/default.nix
+++ b/ci/eval/default.nix
@@ -9,7 +9,6 @@
   nixVersions,
   jq,
   sta,
-  python3,
 }:
 
 let
@@ -271,7 +270,6 @@ let
       runCommand
       writeText
       supportedSystems
-      python3
       ;
   };
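
For reference, the statistical core of the removed cmp-stats.py is a paired t-test
per metric (scipy.stats.ttest_rel) across before/after eval runs. A minimal runnable
sketch of that step follows; the metric name and sample values are hypothetical
stand-ins, not real eval stats:

    import numpy as np
    from scipy.stats import ttest_rel

    # Hypothetical "gc.totalBytes" readings for the same three eval chunks,
    # measured once on the base branch ("before") and once on the PR ("after").
    before = np.array([9_545_876_464.0, 9_401_223_100.0, 9_602_554_380.0])
    after = np.array([9_530_112_000.0, 9_395_000_512.0, 9_588_431_744.0])

    # Paired test: each chunk is compared against itself across the two runs,
    # which controls for per-chunk differences in workload size.
    t_stat, p_val = ttest_rel(after, before)
    pct_change = 100 * (after - before) / before

    print(f"mean % change: {pct_change.mean():.4f}")
    print(f"t = {t_stat:.4f}, p = {p_val:.4f}")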