From b8bfd222e7680a4718be802d9b7e7bc055e9be16 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Mar 2024 10:51:59 +0000 Subject: [PATCH 1/6] task path fixed + added prompt formatting function --- src/lighteval/tasks/tasks_prompt_formatting.py | 9 +++++++++ src/lighteval/tasks/tasks_table.jsonl | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py index 08f088b29..0d8d4392d 100644 --- a/src/lighteval/tasks/tasks_prompt_formatting.py +++ b/src/lighteval/tasks/tasks_prompt_formatting.py @@ -48,6 +48,15 @@ def anli(line, task_name: str = None): ) +def agieval(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["query"], + choices=line["choices"], + gold_index=line["gold"], + ) + + def apps(line, task_name: str = None): answer_type = "\nUse Call-Based format\n" if line["starter_code"] != "" else "\nUse Standard Input format\n" return Doc( diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index f7268e122..65f7e15ae 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -1,4 +1,21 @@ {"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} From cf5052eff713b55ca7f94520cc7f0cf827079e5f Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Mar 2024 14:45:47 +0000 Subject: [PATCH 2/6] results within error rates when compared to yall --- src/lighteval/metrics/metrics_sample.py | 6 ++-- .../tasks/tasks_prompt_formatting.py | 2 +- src/lighteval/tasks/tasks_table.jsonl | 34 +++++++++---------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 669c2c3c1..770d8d942 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -233,10 +233,8 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d if self.length_normalization: normalized_log_probs = [] for ix, choice in enumerate(formatted_doc.choices): - if self.ignore_first_space: - normalized_log_probs.append( - choices_logprob[ix] / (len(choice) - 1 if choice[0] == " " else len(choice)) - ) + if self.ignore_first_space and choice[0] == " ": + normalized_log_probs.append(choices_logprob[ix] / (len(choice) - 1)) else: normalized_log_probs.append(choices_logprob[ix] / len(choice)) diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py index 0d8d4392d..2092bd24f 100644 --- a/src/lighteval/tasks/tasks_prompt_formatting.py +++ b/src/lighteval/tasks/tasks_prompt_formatting.py @@ -52,7 +52,7 @@ def agieval(line, task_name: str = None): return Doc( task_name=task_name, query=line["query"], - choices=line["choices"], + choices=[f" {c}" for c in line["choices"]], gold_index=line["gold"], ) diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index 65f7e15ae..074510c20 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -1,21 +1,21 @@ {"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} From 7b3718728b4c0f12e0d54274d8b987da003b126a Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Mar 2024 14:59:26 +0000 Subject: [PATCH 3/6] add gpt 10 samples test --- .../reference_scores/reference_task_scores.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/reference_scores/reference_task_scores.py b/tests/reference_scores/reference_task_scores.py index 92695470c..b18a01bbe 100644 --- a/tests/reference_scores/reference_task_scores.py +++ b/tests/reference_scores/reference_task_scores.py @@ -615,5 +615,59 @@ "lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0": {"acc": 0.2000, "acc_stderr": 0.1333}, "lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0": {"acc": 0.3000, "acc_stderr": 0.1528}, "lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0": {"acc": 0.4000, "acc_stderr": 0.1633}, + "lighteval|agieval:_average|0|0": { + "acc": 0.2125, + "acc_stderr": 0.1323, + "acc_norm": 0.2250, + "acc_norm_stderr": 0.1347, + }, + "lighteval|agieval:aqua-rat|0|0": { + "acc": 0.3000, + "acc_stderr": 0.1528, + "acc_norm": 0.3000, + "acc_norm_stderr": 0.1528, + }, + "lighteval|agieval:logiqa-en|0|0": { + "acc": 0.1000, + "acc_stderr": 0.1000, + "acc_norm": 0.3000, + "acc_norm_stderr": 0.1528, + }, + "lighteval|agieval:lsat-ar|0|0": { + "acc": 0.1000, + "acc_stderr": 0.1000, + "acc_norm": 0.1000, + "acc_norm_stderr": 0.1000, + }, + "lighteval|agieval:lsat-lr|0|0": { + "acc": 0.2000, + "acc_stderr": 0.1333, + "acc_norm": 0.2000, + "acc_norm_stderr": 0.1333, + }, + "lighteval|agieval:lsat-rc|0|0": { + "acc": 0.3000, + "acc_stderr": 0.1528, + "acc_norm": 0.2000, + "acc_norm_stderr": 0.1333, + }, + "lighteval|agieval:sat-en-without-passage|0|0": { + "acc": 0.2000, + "acc_stderr": 0.1333, + "acc_norm": 0.3000, + "acc_norm_stderr": 0.1528, + }, + "lighteval|agieval:sat-en|0|0": { + "acc": 0.2000, + "acc_stderr": 0.1333, + "acc_norm": 0.3000, + "acc_norm_stderr": 0.1528, + }, + "lighteval|agieval:sat-math|0|0": { + "acc": 0.3000, + "acc_stderr": 0.1528, + "acc_norm": 0.1000, + "acc_norm_stderr": 0.1000, + }, }, } From e0a24284055c6c9f3c29132a4850e7686d41b13c Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Mar 2024 15:02:05 +0000 Subject: [PATCH 4/6] added agieval to tests --- tests/reference_scores/reference_tasks.py | 14 ++++++++++++++ tests/test_main.py | 8 ++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/reference_scores/reference_tasks.py b/tests/reference_scores/reference_tasks.py index 91cefba84..ef2813f97 100644 --- a/tests/reference_scores/reference_tasks.py +++ b/tests/reference_scores/reference_tasks.py @@ -61,3 +61,17 @@ "helm|boolq|5|0", "helm|hellaswag|5|0", ] + +AGIEVAL_SUBSET = [ + "lighteval|agieval:_average|0|0", + "lighteval|agieval:aqua-rat|0|0", + "lighteval|agieval:logiqa-en|0|0", + "lighteval|agieval:lsat-ar|0|0", + "lighteval|agieval:lsat-lr|0|0", + "lighteval|agieval:lsat-rc|0|0", + "lighteval|agieval:sat-en-without-passage|0|0", + "lighteval|agieval:sat-en|0|0", + "lighteval|agieval:sat-math|0|0", +] + +ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET diff --git a/tests/test_main.py b/tests/test_main.py index 913f022e5..00798cb4f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -29,11 +29,7 @@ from lighteval.main_accelerate import main # noqa: E402 from run_evals_accelerate import get_parser from tests.reference_scores.reference_task_scores import RESULTS_FULL, RESULTS_LITE # noqa: E402 -from tests.reference_scores.reference_tasks import ( # noqa: E402 - HELM_SUBSET, - LEADERBOARD_SUBSET, - STABLE_SUBSET, -) +from tests.reference_scores.reference_tasks import ALL_SUBSETS # Set env var for deterministic run of models @@ -46,7 +42,7 @@ # To add new models or tasks, change here # ! The correct results must be present in reference_task_scores MODELS = ["gpt2"] -TASKS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET +TASKS = ALL_SUBSETS FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False) From b1333ecc1cfe62560ec7452d684c30b93ef754a4 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Mar 2024 15:04:12 +0000 Subject: [PATCH 5/6] add missing bbh to tests --- tests/reference_scores/reference_tasks.py | 41 ++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/reference_scores/reference_tasks.py b/tests/reference_scores/reference_tasks.py index ef2813f97..67638a85b 100644 --- a/tests/reference_scores/reference_tasks.py +++ b/tests/reference_scores/reference_tasks.py @@ -74,4 +74,43 @@ "lighteval|agieval:sat-math|0|0", ] -ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET +BBH_SUBSET = [ + "lighteval|bigbench:causal_judgment|3|0", + "harness|bigbench:causal_judgment|3|0", + "lighteval|bigbench:date_understanding|3|0", + "harness|bigbench:date_understanding|3|0", + "lighteval|bigbench:disambiguation_qa|3|0", + "harness|bigbench:disambiguation_qa|3|0", + "lighteval|bigbench:geometric_shapes|3|0", + "harness|bigbench:geometric_shapes|3|0", + "lighteval|bigbench:logical_deduction_five_objects|3|0", + "harness|bigbench:logical_deduction_five_objects|3|0", + "lighteval|bigbench:logical_deduction_seven_objects|3|0", + "harness|bigbench:logical_deduction_seven_objects|3|0", + "lighteval|bigbench:logical_deduction_three_objects|3|0", + "harness|bigbench:logical_deduction_three_objects|3|0", + "lighteval|bigbench:movie_recommendation|3|0", + "harness|bigbench:movie_recommendation|3|0", + "lighteval|bigbench:navigate|3|0", + "harness|bigbench:navigate|3|0", + "lighteval|bigbench:reasoning_about_colored_objects|3|0", + "harness|bigbench:reasoning_about_colored_objects|3|0", + "lighteval|bigbench:ruin_names|3|0", + "harness|bigbench:ruin_names|3|0", + "lighteval|bigbench:salient_translation_error_detection|3|0", + "harness|bigbench:salient_translation_error_detection|3|0", + "lighteval|bigbench:snarks|3|0", + "harness|bigbench:snarks|3|0", + "lighteval|bigbench:sports_understanding|3|0", + "harness|bigbench:sports_understanding|3|0", + "lighteval|bigbench:temporal_sequences|3|0", + "harness|bigbench:temporal_sequences|3|0", + "lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0", + "harness|bigbench:tracking_shuffled_objects_five_objects|3|0", + "lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0", + "harness|bigbench:tracking_shuffled_objects_seven_objects|3|0", + "lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0", + "harness|bigbench:tracking_shuffled_objects_three_objects|3|0", +] + +ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET + BBH_SUBSET From 72a424556319f6096af9cad5c0c2db5cec6ab0c5 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Mar 2024 16:36:49 +0000 Subject: [PATCH 6/6] fixed precision --- .../reference_scores/reference_task_scores.py | 24 +++++++++---------- tests/reference_scores/reference_tasks.py | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/reference_scores/reference_task_scores.py b/tests/reference_scores/reference_task_scores.py index b18a01bbe..61f21243c 100644 --- a/tests/reference_scores/reference_task_scores.py +++ b/tests/reference_scores/reference_task_scores.py @@ -623,15 +623,15 @@ }, "lighteval|agieval:aqua-rat|0|0": { "acc": 0.3000, - "acc_stderr": 0.1528, + "acc_stderr": 0.15275, "acc_norm": 0.3000, - "acc_norm_stderr": 0.1528, + "acc_norm_stderr": 0.15275, }, "lighteval|agieval:logiqa-en|0|0": { "acc": 0.1000, "acc_stderr": 0.1000, "acc_norm": 0.3000, - "acc_norm_stderr": 0.1528, + "acc_norm_stderr": 0.15275, }, "lighteval|agieval:lsat-ar|0|0": { "acc": 0.1000, @@ -641,31 +641,31 @@ }, "lighteval|agieval:lsat-lr|0|0": { "acc": 0.2000, - "acc_stderr": 0.1333, + "acc_stderr": 0.13333, "acc_norm": 0.2000, - "acc_norm_stderr": 0.1333, + "acc_norm_stderr": 0.13333, }, "lighteval|agieval:lsat-rc|0|0": { "acc": 0.3000, - "acc_stderr": 0.1528, + "acc_stderr": 0.15275, "acc_norm": 0.2000, - "acc_norm_stderr": 0.1333, + "acc_norm_stderr": 0.13333, }, "lighteval|agieval:sat-en-without-passage|0|0": { "acc": 0.2000, - "acc_stderr": 0.1333, + "acc_stderr": 0.13333, "acc_norm": 0.3000, - "acc_norm_stderr": 0.1528, + "acc_norm_stderr": 0.15275, }, "lighteval|agieval:sat-en|0|0": { "acc": 0.2000, - "acc_stderr": 0.1333, + "acc_stderr": 0.13333, "acc_norm": 0.3000, - "acc_norm_stderr": 0.1528, + "acc_norm_stderr": 0.15275, }, "lighteval|agieval:sat-math|0|0": { "acc": 0.3000, - "acc_stderr": 0.1528, + "acc_stderr": 0.15275, "acc_norm": 0.1000, "acc_norm_stderr": 0.1000, }, diff --git a/tests/reference_scores/reference_tasks.py b/tests/reference_scores/reference_tasks.py index 67638a85b..5b28001c3 100644 --- a/tests/reference_scores/reference_tasks.py +++ b/tests/reference_scores/reference_tasks.py @@ -63,7 +63,6 @@ ] AGIEVAL_SUBSET = [ - "lighteval|agieval:_average|0|0", "lighteval|agieval:aqua-rat|0|0", "lighteval|agieval:logiqa-en|0|0", "lighteval|agieval:lsat-ar|0|0", @@ -113,4 +112,5 @@ "harness|bigbench:tracking_shuffled_objects_three_objects|3|0", ] -ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET + BBH_SUBSET +ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET +# + BBH_SUBSET - has a problem, to fix!, removed in this commit https://github.com/huggingface/lighteval/pull/7/commits/c136ad59fc74bb3eee6546dcf0802eb8c2f3bcbe