Commit 871192f

Add AGIEval (#121)
1 parent af4d6d6 commit 871192f

6 files changed (+137, -10 lines)

src/lighteval/metrics/metrics_sample.py

Lines changed: 2 additions & 4 deletions
@@ -233,10 +233,8 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d
         if self.length_normalization:
             normalized_log_probs = []
             for ix, choice in enumerate(formatted_doc.choices):
-                if self.ignore_first_space:
-                    normalized_log_probs.append(
-                        choices_logprob[ix] / (len(choice) - 1 if choice[0] == " " else len(choice))
-                    )
+                if self.ignore_first_space and choice[0] == " ":
+                    normalized_log_probs.append(choices_logprob[ix] / (len(choice) - 1))
                 else:
                     normalized_log_probs.append(choices_logprob[ix] / len(choice))
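For reference, a minimal standalone sketch of the length normalization this method applies (illustrative values only, not the library's classes): each choice's summed log-probability is divided by its character count, and when ignore_first_space is set, the leading space that prompt functions such as agieval prepend is not counted.

# Sketch of character-length normalization with the leading space discounted.
# Values are hypothetical; the real logic is the compute method shown above.
choices = [" 4", " 15", " 42"]        # prompt functions prepend a space to each choice
choices_logprob = [-3.2, -6.1, -5.8]  # assumed summed log-probabilities, one per choice

normalized_log_probs = []
for ix, choice in enumerate(choices):
    # Discount the leading space so " 15" is normalized by 2 characters, not 3.
    length = len(choice) - 1 if choice[0] == " " else len(choice)
    normalized_log_probs.append(choices_logprob[ix] / length)

print(normalized_log_probs)  # -> [-3.2, -3.05, -2.9]
print(choices[max(range(len(choices)), key=lambda i: normalized_log_probs[i])])  # " 42" scores best after normalization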

src/lighteval/tasks/tasks_prompt_formatting.py

Lines changed: 9 additions & 0 deletions
@@ -48,6 +48,15 @@ def anli(line, task_name: str = None):
     )


+def agieval(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["query"],
+        choices=[f" {c}" for c in line["choices"]],
+        gold_index=line["gold"],
+    )
+
+
 def apps(line, task_name: str = None):
     answer_type = "\nUse Call-Based format\n" if line["starter_code"] != "" else "\nUse Standard Input format\n"
     return Doc(
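A quick usage sketch of the new formatter. The sample row below is made up; the assumption, consistent with the function body, is that the AGIEval datasets expose ready-made query, choices and gold columns.

# Hypothetical row in the shape agieval() expects -- not an actual dataset sample.
line = {
    "query": "Question: If 2x + 3 = 11, what is the value of x?\nAnswer:",
    "choices": ["3", "4", "5", "6"],
    "gold": 1,  # index of the correct choice; the exact type (int vs list) comes from the dataset
}

doc = agieval(line, task_name="lighteval|agieval:sat-math")
print(doc.choices)     # [' 3', ' 4', ' 5', ' 6'] -- a leading space is added to every choice
print(doc.gold_index)  # 1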

src/lighteval/tasks/tasks_table.jsonl

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,21 @@
 {"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
+{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
 {"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
 {"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
 {"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}

tests/reference_scores/reference_task_scores.py

Lines changed: 54 additions & 0 deletions
@@ -615,5 +615,59 @@
         "lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0": {"acc": 0.2000, "acc_stderr": 0.1333},
         "lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0": {"acc": 0.3000, "acc_stderr": 0.1528},
         "lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0": {"acc": 0.4000, "acc_stderr": 0.1633},
+        "lighteval|agieval:_average|0|0": {
+            "acc": 0.2125,
+            "acc_stderr": 0.1323,
+            "acc_norm": 0.2250,
+            "acc_norm_stderr": 0.1347,
+        },
+        "lighteval|agieval:aqua-rat|0|0": {
+            "acc": 0.3000,
+            "acc_stderr": 0.15275,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:logiqa-en|0|0": {
+            "acc": 0.1000,
+            "acc_stderr": 0.1000,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:lsat-ar|0|0": {
+            "acc": 0.1000,
+            "acc_stderr": 0.1000,
+            "acc_norm": 0.1000,
+            "acc_norm_stderr": 0.1000,
+        },
+        "lighteval|agieval:lsat-lr|0|0": {
+            "acc": 0.2000,
+            "acc_stderr": 0.13333,
+            "acc_norm": 0.2000,
+            "acc_norm_stderr": 0.13333,
+        },
+        "lighteval|agieval:lsat-rc|0|0": {
+            "acc": 0.3000,
+            "acc_stderr": 0.15275,
+            "acc_norm": 0.2000,
+            "acc_norm_stderr": 0.13333,
+        },
+        "lighteval|agieval:sat-en-without-passage|0|0": {
+            "acc": 0.2000,
+            "acc_stderr": 0.13333,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:sat-en|0|0": {
+            "acc": 0.2000,
+            "acc_stderr": 0.13333,
+            "acc_norm": 0.3000,
+            "acc_norm_stderr": 0.15275,
+        },
+        "lighteval|agieval:sat-math|0|0": {
+            "acc": 0.3000,
+            "acc_stderr": 0.15275,
+            "acc_norm": 0.1000,
+            "acc_norm_stderr": 0.1000,
+        },
     },
 }
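The _average entry is consistent with the unweighted mean of the eight per-task scores added above, which is easy to check by hand:

# Per-task values copied from the block above, in the order
# aqua-rat, logiqa-en, lsat-ar, lsat-lr, lsat-rc, sat-en-without-passage, sat-en, sat-math.
acc = [0.3000, 0.1000, 0.1000, 0.2000, 0.3000, 0.2000, 0.2000, 0.3000]
acc_norm = [0.3000, 0.3000, 0.1000, 0.2000, 0.2000, 0.3000, 0.3000, 0.1000]

assert round(sum(acc) / len(acc), 4) == 0.2125            # matches "_average" acc
assert round(sum(acc_norm) / len(acc_norm), 4) == 0.2250  # matches "_average" acc_norm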

tests/reference_scores/reference_tasks.py

Lines changed: 53 additions & 0 deletions
@@ -61,3 +61,56 @@
     "helm|boolq|5|0",
     "helm|hellaswag|5|0",
 ]
+
+AGIEVAL_SUBSET = [
+    "lighteval|agieval:aqua-rat|0|0",
+    "lighteval|agieval:logiqa-en|0|0",
+    "lighteval|agieval:lsat-ar|0|0",
+    "lighteval|agieval:lsat-lr|0|0",
+    "lighteval|agieval:lsat-rc|0|0",
+    "lighteval|agieval:sat-en-without-passage|0|0",
+    "lighteval|agieval:sat-en|0|0",
+    "lighteval|agieval:sat-math|0|0",
+]
+
+BBH_SUBSET = [
+    "lighteval|bigbench:causal_judgment|3|0",
+    "harness|bigbench:causal_judgment|3|0",
+    "lighteval|bigbench:date_understanding|3|0",
+    "harness|bigbench:date_understanding|3|0",
+    "lighteval|bigbench:disambiguation_qa|3|0",
+    "harness|bigbench:disambiguation_qa|3|0",
+    "lighteval|bigbench:geometric_shapes|3|0",
+    "harness|bigbench:geometric_shapes|3|0",
+    "lighteval|bigbench:logical_deduction_five_objects|3|0",
+    "harness|bigbench:logical_deduction_five_objects|3|0",
+    "lighteval|bigbench:logical_deduction_seven_objects|3|0",
+    "harness|bigbench:logical_deduction_seven_objects|3|0",
+    "lighteval|bigbench:logical_deduction_three_objects|3|0",
+    "harness|bigbench:logical_deduction_three_objects|3|0",
+    "lighteval|bigbench:movie_recommendation|3|0",
+    "harness|bigbench:movie_recommendation|3|0",
+    "lighteval|bigbench:navigate|3|0",
+    "harness|bigbench:navigate|3|0",
+    "lighteval|bigbench:reasoning_about_colored_objects|3|0",
+    "harness|bigbench:reasoning_about_colored_objects|3|0",
+    "lighteval|bigbench:ruin_names|3|0",
+    "harness|bigbench:ruin_names|3|0",
+    "lighteval|bigbench:salient_translation_error_detection|3|0",
+    "harness|bigbench:salient_translation_error_detection|3|0",
+    "lighteval|bigbench:snarks|3|0",
+    "harness|bigbench:snarks|3|0",
+    "lighteval|bigbench:sports_understanding|3|0",
+    "harness|bigbench:sports_understanding|3|0",
+    "lighteval|bigbench:temporal_sequences|3|0",
+    "harness|bigbench:temporal_sequences|3|0",
+    "lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0",
+    "harness|bigbench:tracking_shuffled_objects_five_objects|3|0",
+    "lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0",
+    "harness|bigbench:tracking_shuffled_objects_seven_objects|3|0",
+    "lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0",
+    "harness|bigbench:tracking_shuffled_objects_three_objects|3|0",
+]
+
+ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET
+# + BBH_SUBSET - has a problem, to fix!, removed in this commit https://github.com/huggingface/lighteval/pull/7/commits/c136ad59fc74bb3eee6546dcf0802eb8c2f3bcbe
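The new constants are plain lists of task strings, so the wiring is easy to verify. A short sketch (interpreting the two trailing numeric fields as few-shot settings is an assumption, not something this diff states):

# By construction, every AGIEval task id is now part of ALL_SUBSETS,
# while BBH_SUBSET stays defined but excluded (see the comment above).
assert set(AGIEVAL_SUBSET) <= set(ALL_SUBSETS)

# Task ids are pipe-separated; the field names below are illustrative assumptions.
suite, task_name, num_fewshot, flag = AGIEVAL_SUBSET[0].split("|")
print(suite, task_name, num_fewshot, flag)  # lighteval agieval:aqua-rat 0 0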

tests/test_main.py

Lines changed: 2 additions & 6 deletions
@@ -29,11 +29,7 @@
 from lighteval.main_accelerate import main  # noqa: E402
 from run_evals_accelerate import get_parser
 from tests.reference_scores.reference_task_scores import RESULTS_FULL, RESULTS_LITE  # noqa: E402
-from tests.reference_scores.reference_tasks import (  # noqa: E402
-    HELM_SUBSET,
-    LEADERBOARD_SUBSET,
-    STABLE_SUBSET,
-)
+from tests.reference_scores.reference_tasks import ALL_SUBSETS


 # Set env var for deterministic run of models
@@ -46,7 +42,7 @@
 # To add new models or tasks, change here
 # ! The correct results must be present in reference_task_scores
 MODELS = ["gpt2"]
-TASKS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET
+TASKS = ALL_SUBSETS
 FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False)