Commit 871192f

Add AGIEval (#121)
1 parent af4d6d6 commit 871192f

6 files changed (+137, -10 lines)

src/lighteval/metrics/metrics_sample.py

Lines changed: 2 additions & 4 deletions
@@ -233,10 +233,8 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d
         if self.length_normalization:
             normalized_log_probs = []
             for ix, choice in enumerate(formatted_doc.choices):
-                if self.ignore_first_space:
-                    normalized_log_probs.append(
-                        choices_logprob[ix] / (len(choice) - 1 if choice[0] == " " else len(choice))
-                    )
+                if self.ignore_first_space and choice[0] == " ":
+                    normalized_log_probs.append(choices_logprob[ix] / (len(choice) - 1))
                 else:
                     normalized_log_probs.append(choices_logprob[ix] / len(choice))
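For reference, a minimal standalone sketch of the length normalization this method applies (illustrative values only, not the library's classes): each choice's summed log-probability is divided by its character count, and when ignore_first_space is set, the leading space that prompt functions such as agieval prepend is not counted.

# Sketch of character-length normalization with the leading space discounted.
# Values are hypothetical; the real logic is the compute method shown above.
choices = [" 4", " 15", " 42"]        # prompt functions prepend a space to each choice
choices_logprob = [-3.2, -6.1, -5.8]  # assumed summed log-probabilities, one per choice

normalized_log_probs = []
for ix, choice in enumerate(choices):
    # Discount the leading space so " 15" is normalized by 2 characters, not 3.
    length = len(choice) - 1 if choice[0] == " " else len(choice)
    normalized_log_probs.append(choices_logprob[ix] / length)

print(normalized_log_probs)  # -> [-3.2, -3.05, -2.9]
print(choices[max(range(len(choices)), key=lambda i: normalized_log_probs[i])])  # " 42" scores best after normalization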

src/lighteval/tasks/tasks_prompt_formatting.py

Lines changed: 9 additions & 0 deletions
@@ -48,6 +48,15 @@ def anli(line, task_name: str = None):
     )


+def agieval(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["query"],
+        choices=[f" {c}" for c in line["choices"]],
+        gold_index=line["gold"],
+    )
+
+
 def apps(line, task_name: str = None):
     answer_type = "\nUse Call-Based format\n" if line["starter_code"] != "" else "\nUse Standard Input format\n"
     return Doc(
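A quick usage sketch of the new formatter. The sample row below is made up; the assumption, consistent with the function body, is that the AGIEval datasets expose ready-made query, choices and gold columns.

# Hypothetical row in the shape agieval() expects -- not an actual dataset sample.
line = {
    "query": "Question: If 2x + 3 = 11, what is the value of x?\nAnswer:",
    "choices": ["3", "4", "5", "6"],
    "gold": 1,  # index of the correct choice; the exact type (int vs list) comes from the dataset
}

doc = agieval(line, task_name="lighteval|agieval:sat-math")
print(doc.choices)     # [' 3', ' 4', ' 5', ' 6'] -- a leading space is added to every choice
print(doc.gold_index)  # 1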

src/lighteval/tasks/tasks_table.jsonl

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,21 @@
 {"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
 {"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
 {"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
 {"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}

tests/reference_scores/reference_task_scores.py

Lines changed: 54 additions & 0 deletions
@@ -615,5 +615,59 @@
         "lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0": {"acc": 0.2000, "acc_stderr": 0.1333},
         "lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0": {"acc": 0.3000, "acc_stderr": 0.1528},
         "lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0": {"acc": 0.4000, "acc_stderr": 0.1633},
+        "lighteval|agieval:_average|0|0": {
+            "acc": 0.2125,
+            "acc_stderr": 0.1323,
+            "acc_norm": 0.2250,
+            "acc_norm_stderr": 0.1347,
+        },
+        "lighteval|agieval:aqua-rat|0|0": {
+            "acc": 0.3000,
+            "acc_stderr": 0.15275,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:logiqa-en|0|0": {
+            "acc": 0.1000,
+            "acc_stderr": 0.1000,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:lsat-ar|0|0": {
+            "acc": 0.1000,
+            "acc_stderr": 0.1000,
+            "acc_norm": 0.1000,
+            "acc_norm_stderr": 0.1000,
+        },
+        "lighteval|agieval:lsat-lr|0|0": {
+            "acc": 0.2000,
+            "acc_stderr": 0.13333,
+            "acc_norm": 0.2000,
+            "acc_norm_stderr": 0.13333,
+        },
+        "lighteval|agieval:lsat-rc|0|0": {
+            "acc": 0.3000,
+            "acc_stderr": 0.15275,
+            "acc_norm": 0.2000,
+            "acc_norm_stderr": 0.13333,
+        },
+        "lighteval|agieval:sat-en-without-passage|0|0": {
+            "acc": 0.2000,
+            "acc_stderr": 0.13333,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:sat-en|0|0": {
+            "acc": 0.2000,
+            "acc_stderr": 0.13333,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:sat-math|0|0": {
+            "acc": 0.3000,
+            "acc_stderr": 0.15275,
+            "acc_norm": 0.1000,
+            "acc_norm_stderr": 0.1000,
+        },
     },
 }
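The _average entry is consistent with the unweighted mean of the eight per-task scores added above, which is easy to check by hand:

# Per-task values copied from the block above, in the order
# aqua-rat, logiqa-en, lsat-ar, lsat-lr, lsat-rc, sat-en-without-passage, sat-en, sat-math.
acc = [0.3000, 0.1000, 0.1000, 0.2000, 0.3000, 0.2000, 0.2000, 0.3000]
acc_norm = [0.3000, 0.3000, 0.1000, 0.2000, 0.2000, 0.3000, 0.3000, 0.1000]

assert round(sum(acc) / len(acc), 4) == 0.2125            # matches "_average" acc
assert round(sum(acc_norm) / len(acc_norm), 4) == 0.2250  # matches "_average" acc_norm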

tests/reference_scores/reference_tasks.py

Lines changed: 53 additions & 0 deletions
@@ -61,3 +61,56 @@
     "helm|boolq|5|0",
     "helm|hellaswag|5|0",
 ]
+
+AGIEVAL_SUBSET = [
+    "lighteval|agieval:aqua-rat|0|0",
+    "lighteval|agieval:logiqa-en|0|0",
+    "lighteval|agieval:lsat-ar|0|0",
+    "lighteval|agieval:lsat-lr|0|0",
+    "lighteval|agieval:lsat-rc|0|0",
+    "lighteval|agieval:sat-en-without-passage|0|0",
+    "lighteval|agieval:sat-en|0|0",
+    "lighteval|agieval:sat-math|0|0",
+]
+
+BBH_SUBSET = [
+    "lighteval|bigbench:causal_judgment|3|0",
+    "harness|bigbench:causal_judgment|3|0",
+    "lighteval|bigbench:date_understanding|3|0",
+    "harness|bigbench:date_understanding|3|0",
+    "lighteval|bigbench:disambiguation_qa|3|0",
+    "harness|bigbench:disambiguation_qa|3|0",
+    "lighteval|bigbench:geometric_shapes|3|0",
+    "harness|bigbench:geometric_shapes|3|0",
+    "lighteval|bigbench:logical_deduction_five_objects|3|0",
+    "harness|bigbench:logical_deduction_five_objects|3|0",
+    "lighteval|bigbench:logical_deduction_seven_objects|3|0",
+    "harness|bigbench:logical_deduction_seven_objects|3|0",
+    "lighteval|bigbench:logical_deduction_three_objects|3|0",
+    "harness|bigbench:logical_deduction_three_objects|3|0",
+    "lighteval|bigbench:movie_recommendation|3|0",
+    "harness|bigbench:movie_recommendation|3|0",
+    "lighteval|bigbench:navigate|3|0",
+    "harness|bigbench:navigate|3|0",
+    "lighteval|bigbench:reasoning_about_colored_objects|3|0",
+    "harness|bigbench:reasoning_about_colored_objects|3|0",
+    "lighteval|bigbench:ruin_names|3|0",
+    "harness|bigbench:ruin_names|3|0",
+    "lighteval|bigbench:salient_translation_error_detection|3|0",
+    "harness|bigbench:salient_translation_error_detection|3|0",
+    "lighteval|bigbench:snarks|3|0",
+    "harness|bigbench:snarks|3|0",
+    "lighteval|bigbench:sports_understanding|3|0",
+    "harness|bigbench:sports_understanding|3|0",
+    "lighteval|bigbench:temporal_sequences|3|0",
+    "harness|bigbench:temporal_sequences|3|0",
+    "lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0",
+    "harness|bigbench:tracking_shuffled_objects_five_objects|3|0",
+    "lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0",
+    "harness|bigbench:tracking_shuffled_objects_seven_objects|3|0",
+    "lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0",
+    "harness|bigbench:tracking_shuffled_objects_three_objects|3|0",
+]
+
+ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET
+# + BBH_SUBSET - has a problem, to fix!, removed in this commit https://github.com/huggingface/lighteval/pull/7/commits/c136ad59fc74bb3eee6546dcf0802eb8c2f3bcbe
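The new constants are plain lists of task strings, so the wiring is easy to verify. A short sketch (interpreting the two trailing numeric fields as few-shot settings is an assumption, not something this diff states):

# By construction, every AGIEval task id is now part of ALL_SUBSETS,
# while BBH_SUBSET stays defined but excluded (see the comment above).
assert set(AGIEVAL_SUBSET) <= set(ALL_SUBSETS)

# Task ids are pipe-separated; the field names below are illustrative assumptions.
suite, task_name, num_fewshot, flag = AGIEVAL_SUBSET[0].split("|")
print(suite, task_name, num_fewshot, flag)  # lighteval agieval:aqua-rat 0 0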

tests/test_main.py

Lines changed: 2 additions & 6 deletions
@@ -29,11 +29,7 @@
 from lighteval.main_accelerate import main  # noqa: E402
 from run_evals_accelerate import get_parser
 from tests.reference_scores.reference_task_scores import RESULTS_FULL, RESULTS_LITE  # noqa: E402
-from tests.reference_scores.reference_tasks import (  # noqa: E402
-    HELM_SUBSET,
-    LEADERBOARD_SUBSET,
-    STABLE_SUBSET,
-)
+from tests.reference_scores.reference_tasks import ALL_SUBSETS


 # Set env var for deterministic run of models
@@ -46,7 +42,7 @@
 # To add new models or tasks, change here
 # ! The correct results must be present in reference_task_scores
 MODELS = ["gpt2"]
-TASKS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET
+TASKS = ALL_SUBSETS
 FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False)