Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions ci/eval/compare/cmp-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import json
import os
from scipy.stats import ttest_rel
import pandas as pd
import numpy as np
from pathlib import Path

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
    """
    Flatten one level of nesting out of a stats JSON document.

    The eval stats JSON nests related counters under a common key, e.g.:

        "gc": {"cycles": 13, "heapSize": 5404549120, "totalBytes": 9545876464}

    which becomes:

        "gc.cycles": 13
        "gc.heapSize": 5404549120
        ...

    Top-level numeric values are kept as-is; top-level values that are
    neither numbers nor dicts are dropped.

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flattened = {}
    for key, value in json_data.items():
        if isinstance(value, dict):
            # One level deep only: join parent and child key with a dot.
            flattened.update(
                {f"{key}.{inner_key}": inner_value for inner_key, inner_value in value.items()}
            )
        elif isinstance(value, (int, float)):
            flattened[key] = value
    return flattened




def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    The directory is expected to contain one sub-directory per system
    (e.g. "x86_64-linux"), each holding per-chunk JSON stats files.

    Args:
        directory (Path): Directory containing per-system stats directories.
    Returns:
        dict: Dictionary keyed by "<system>/<chunk-file>" with the
            flattened metrics of that chunk as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        # Every entry under the stats directory should be a per-system dir.
        assert system_dir.is_dir()

        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
            # Bug fix: the key previously used "${chunk_output.name}", which
            # left a literal "$" in every key (shell-style interpolation does
            # not apply inside f-strings) and broke before/after key matching.
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)

    return metrics

def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """
    Render a DataFrame as a GitHub-flavoured Markdown table.

    NaN floats and exact-zero floats are shown as "-" (no data / no change);
    all other floats are formatted with four decimal places.

    Args:
        df (pd.DataFrame): Table to render.
    Returns:
        str: Markdown table, starting with a leading newline.
    """
    # Header row (leading newline so the table is separated from prior text),
    # followed by the column separator line.
    lines = ['\n| ' + ' | '.join(df.columns) + ' |']
    lines.append("| - " * len(df.columns) + "|")

    for _, row in df.iterrows():
        # TODO: define threshold for highlighting
        highlight = False

        def decorate(text: str) -> str:
            return f"**{text}**" if highlight else f"{text}"

        cells = []
        for value in row:
            is_float = isinstance(value, float)
            if is_float and np.isnan(value):
                # NaN p-value / t-stat: render a placeholder instead.
                cells.append("-")
            elif is_float and value == 0:
                # Exactly zero (e.g. mean_diff == 0) means "no change".
                cells.append("-")
            else:
                cells.append(decorate(f"{value:.4f}" if is_float else str(value)))

        lines.append('| ' + ' | '.join(cells) + ' |')

    return '\n'.join(lines)


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    """
    Run a paired t-test per metric over the files common to both runs.

    For every metric key seen in ``before_metrics``, the values of all files
    present in both runs (and carrying that key in both) are paired up and
    compared with ``scipy.stats.ttest_rel``. Metrics with fewer than two
    pairs are skipped, since a paired t-test is undefined for them.

    Args:
        before_metrics (dict): ``{filename: {metric: value}}`` for the base run.
        after_metrics (dict): ``{filename: {metric: value}}`` for the new run.
    Returns:
        pd.DataFrame: One row per metric with mean before/after, mean diff,
            mean percentage change, p-value and t-statistic, sorted by
            p-value ascending. Empty (but with the full column schema) when
            no metric has enough paired samples.
    """
    columns = ["metric", "mean_before", "mean_after", "mean_diff",
               "mean_%_change", "p_value", "t_stat"]

    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({metric_key for file_metrics in before_metrics.values() for metric_key in file_metrics.keys()})

    results = []

    for key in all_keys:
        before_vals, after_vals = [], []

        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        # ttest_rel needs at least two pairs to compute a variance.
        if len(before_vals) < 2:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr
        # A before-value of 0 yields inf/nan here; suppress the numpy
        # warning (the markdown renderer shows NaN as "-").
        with np.errstate(divide="ignore", invalid="ignore"):
            pct_change = 100 * diff / before_arr
        t_stat, p_val = ttest_rel(after_arr, before_arr)

        results.append({
            "metric": key,
            "mean_before": np.mean(before_arr),
            "mean_after": np.mean(after_arr),
            "mean_diff": np.mean(diff),
            "mean_%_change": np.mean(pct_change),
            "p_value": p_val,
            "t_stat": t_stat
        })

    if not results:
        # Bug fix: sort_values("p_value") on an empty frame raises KeyError;
        # return an empty frame that still carries the expected schema.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(results).sort_values("p_value")


if __name__ == "__main__":
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")

    if not before_dir or not after_dir:
        # Bug fix: the error was print()ed to stdout, which the caller
        # redirects into the markdown step summary — so the error message
        # would end up in the report instead of the build log. SystemExit
        # with a string prints to stderr and exits with status 1. The bare
        # exit() builtin is also a site-module helper not guaranteed to
        # exist (e.g. under `python -S`).
        raise SystemExit("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")

    before_metrics = load_all_metrics(Path(before_dir) / "stats")
    after_metrics = load_all_metrics(Path(after_dir) / "stats")

    comparison = perform_pairwise_tests(before_metrics, after_metrics)
    print(dataframe_to_markdown(comparison))
50 changes: 46 additions & 4 deletions ci/eval/compare/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
jq,
runCommand,
writeText,
python3,
...
}:
{
Expand Down Expand Up @@ -125,18 +126,59 @@ let
in
runCommand "compare"
{
nativeBuildInputs = [ jq ];
nativeBuildInputs = [
jq
(python3.withPackages (
ps: with ps; [
numpy
pandas
scipy
]
))

];
maintainers = builtins.toJSON maintainers;
passAsFile = [ "maintainers" ];
env = {
BEFORE_DIR = "${beforeResultDir}";
AFTER_DIR = "${afterResultDir}";
};
}
''
mkdir $out

cp ${changed-paths} $out/changed-paths.json

jq -r -f ${./generate-step-summary.jq} < ${changed-paths} > $out/step-summary.md

cp "$maintainersPath" "$out/maintainers.json"
if jq -e '(.attrdiff.added | length == 0) and (.attrdiff.removed | length == 0)' "${changed-paths}" > /dev/null; then
# Package chunks are identical between the two revisions
# so we can generate a performance comparison
{
echo
echo "# Performance comparison"
echo
echo "This compares the performance of this branch against its pull request base branch (e.g., 'master')"
echo
echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
echo
} >> $out/step-summary.md

python3 ${./cmp-stats.py} >> $out/step-summary.md

# TODO: Compare eval stats
else
# Package chunks have changed between the two revisions
# so we cannot generate a meaningful performance comparison
{
echo
echo "# Performance Comparison"
echo
echo "Performance stats were skipped because the package sets differ between the two revisions."
echo
echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
} >> $out/step-summary.md
fi

jq -r -f ${./generate-step-summary.jq} < ${changed-paths} >> $out/step-summary.md

cp "$maintainersPath" "$out/maintainers.json"
''
2 changes: 2 additions & 0 deletions ci/eval/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
nixVersions,
jq,
sta,
python3,
}:

let
Expand Down Expand Up @@ -270,6 +271,7 @@ let
runCommand
writeText
supportedSystems
python3
;
};

Expand Down