diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 391226a50..4fd2ea5fd 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -462,8 +462,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
         last_results_file_path = hf_hub_url(repo_id=repo_id, filename=last_results_file, repo_type="dataset")
         f = load_dataset("json", data_files=last_results_file_path, split="train")
         results_dict = f["results"][0]
-        value = results_dict.pop("all")
-        new_dictionary = {"all": value}
+        new_dictionary = {"all": results_dict}
         new_dictionary.update(results_dict)
         results_string = json.dumps(new_dictionary, indent=4)

diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 172330a14..0caabaa85 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -490,12 +490,21 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                     hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.")

         # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them
+        # and compute an average of all metrics
         grouped_tasks = collections.defaultdict(list)
-        for k in self.metric_aggregated.keys():
+        suite_average = {}
+        suite_nb = {}
+
+        # Build aggregation
+        for k, metrics in self.metric_aggregated.items():
             if "|" in k:
                 suite, task, fewshot = k.split("|")
                 grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(k)
+            for metric, value in metrics.items():
+                suite_average[metric] = suite_average.get(metric, 0) + value
+                suite_nb[metric] = suite_nb.get(metric, 0) + 1

+        # Compute average for sub groups
         for average_task, list_of_subtasks in grouped_tasks.items():
             if len(list_of_subtasks) > 1:
                 metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
@@ -504,6 +513,12 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                     for metric in metrics
                 }

+        # Compute average for all
+        for metric, value in suite_average.items():
+            suite_average[metric] = value / suite_nb[metric]
+
+        self.metric_aggregated["all"] = suite_average
+

 class VersionsLogger:
     """Logger of the tasks versions.
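
Note: below is an illustrative, standalone sketch (not part of the patch) of the averaging scheme the info_loggers.py change introduces, assuming metric_aggregated maps "suite|task|fewshot" keys to {metric_name: value} dicts; the task keys and values here are made up. Each metric is summed and counted across every task, then the sums are divided by the counts to produce the unweighted "all" entry.

    # Hypothetical task results in the shape the logger aggregates
    metric_aggregated = {
        "lighteval|mmlu:abstract_algebra|5": {"acc": 0.25},
        "lighteval|mmlu:anatomy|5": {"acc": 0.50},
        "lighteval|arc:challenge|25": {"acc": 0.75},
    }

    suite_average = {}
    suite_nb = {}

    # Running sum and count per metric across all tasks
    for task_key, metrics in metric_aggregated.items():
        for metric, value in metrics.items():
            suite_average[metric] = suite_average.get(metric, 0) + value
            suite_nb[metric] = suite_nb.get(metric, 0) + 1

    # Divide sums by counts to get the unweighted mean over tasks
    for metric, total in suite_average.items():
        suite_average[metric] = total / suite_nb[metric]

    metric_aggregated["all"] = suite_average
    print(metric_aggregated["all"])  # {'acc': 0.5}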