Merged
3 changes: 1 addition & 2 deletions src/lighteval/logging/evaluation_tracker.py
@@ -462,8 +462,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
         last_results_file_path = hf_hub_url(repo_id=repo_id, filename=last_results_file, repo_type="dataset")
         f = load_dataset("json", data_files=last_results_file_path, split="train")
         results_dict = f["results"][0]
-        value = results_dict.pop("all")
-        new_dictionary = {"all": value}
+        new_dictionary = {"all": results_dict}
         new_dictionary.update(results_dict)
         results_string = json.dumps(new_dictionary, indent=4)

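The `pop` is no longer needed here because, with the `info_loggers.py` change below, the results always carry an `"all"` entry; seeding the new dict with an `"all"` key only fixes where that key lands in the dumped JSON. A minimal standalone sketch of that dict behavior (the sample payload is invented, not real lighteval output):

```python
import json

# Invented sample payload; assumes an "all" entry is always present,
# which the info_loggers.py change below guarantees.
results_dict = {"all": {"acc": 0.5}, "leaderboard|mmlu|5": {"acc": 0.5}}

new_dictionary = {"all": results_dict}  # placeholder value; only the key's position matters
new_dictionary.update(results_dict)  # overwrites the placeholder with the real "all" entry

# Python 3.7+ dicts preserve insertion order, so "all" leads the dumped JSON.
print(json.dumps(new_dictionary, indent=4))
```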
17 changes: 16 additions & 1 deletion src/lighteval/logging/info_loggers.py
@@ -490,12 +490,21 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                     hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.")

         # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them
+        # and compute an average of all metrics
         grouped_tasks = collections.defaultdict(list)
-        for k in self.metric_aggregated.keys():
+        suite_average = {}
+        suite_nb = {}
+
+        # Build aggregation
+        for k, metrics in self.metric_aggregated.items():
             if "|" in k:
                 suite, task, fewshot = k.split("|")
                 grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(k)
+            for metric, value in metrics.items():
+                suite_average[metric] = suite_average.get(metric, 0) + value
+                suite_nb[metric] = suite_nb.get(metric, 0) + 1

+        # Compute average for sub groups
         for average_task, list_of_subtasks in grouped_tasks.items():
             if len(list_of_subtasks) > 1:
                 metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
@@ -504,6 +513,12 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                     for metric in metrics
                 }

+        # Compute average for all
+        for metric, value in suite_average.items():
+            suite_average[metric] = value / suite_nb[metric]
+
+        self.metric_aggregated["all"] = suite_average
+

 class VersionsLogger:
     """Logger of the tasks versions.
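With both hunks applied, `aggregate()` also records a suite-wide mean of every metric under the `"all"` key. A standalone sketch of that averaging, with invented task keys and scores (real keys follow the `suite|task|fewshot` pattern matched in the diff):

```python
# Invented stand-ins for self.metric_aggregated; scores are made up.
metric_aggregated = {
    "leaderboard|mmlu:abstract_algebra|5": {"acc": 0.3},
    "leaderboard|mmlu:anatomy|5": {"acc": 0.5},
    "leaderboard|arc:challenge|25": {"acc": 0.7},
}

# Sum each metric across tasks and count how many tasks report it.
suite_average, suite_nb = {}, {}
for metrics in metric_aggregated.values():
    for metric, value in metrics.items():
        suite_average[metric] = suite_average.get(metric, 0) + value
        suite_nb[metric] = suite_nb.get(metric, 0) + 1

# Divide sums by counts to get the per-metric mean.
for metric, value in suite_average.items():
    suite_average[metric] = value / suite_nb[metric]

print(suite_average)  # {'acc': 0.5}
```

Because `suite_nb` is counted per metric, each metric is averaged only over the tasks that actually report it, so suites with heterogeneous metrics still get sensible means.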