6 changes: 2 additions & 4 deletions src/lighteval/metrics/metrics_sample.py
@@ -233,10 +233,8 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d
         if self.length_normalization:
             normalized_log_probs = []
             for ix, choice in enumerate(formatted_doc.choices):
-                if self.ignore_first_space:
-                    normalized_log_probs.append(
-                        choices_logprob[ix] / (len(choice) - 1 if choice[0] == " " else len(choice))
-                    )
+                if self.ignore_first_space and choice[0] == " ":
+                    normalized_log_probs.append(choices_logprob[ix] / (len(choice) - 1))
                 else:
                     normalized_log_probs.append(choices_logprob[ix] / len(choice))

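For reference, a minimal standalone sketch of what the refactored branch computes: the leading space is excluded from the character count only when it is actually present (the choice strings and logprobs below are made up for illustration, not taken from an evaluation run):

# Illustrative values only.
choices = [" Paris", " London"]   # leading space, as produced by the prompt functions
choices_logprob = [-3.2, -7.9]
ignore_first_space = True

normalized_log_probs = []
for ix, choice in enumerate(choices):
    if ignore_first_space and choice[0] == " ":
        # Drop the leading space from the length so it does not dilute the per-character logprob.
        normalized_log_probs.append(choices_logprob[ix] / (len(choice) - 1))
    else:
        normalized_log_probs.append(choices_logprob[ix] / len(choice))

print(normalized_log_probs)  # [-0.64, -1.3166...]
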
9 changes: 9 additions & 0 deletions src/lighteval/tasks/tasks_prompt_formatting.py
@@ -48,6 +48,15 @@ def anli(line, task_name: str = None):
     )


+def agieval(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["query"],
+        choices=[f" {c}" for c in line["choices"]],
+        gold_index=line["gold"],
+    )
+
+
 def apps(line, task_name: str = None):
     answer_type = "\nUse Call-Based format\n" if line["starter_code"] != "" else "\nUse Standard Input format\n"
     return Doc(
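The new prompt function maps each dataset row to a Doc and prepends a space to every choice, so the scored continuation sits on a natural token boundary after the query. A minimal sketch of that convention (the field names match the function above, but the sample row is invented for illustration, not taken from the dmayhem93 datasets):

# Hypothetical AGIEval-style row; real rows come from the dmayhem93/agieval-* datasets.
line = {
    "query": "Question: If 2x + 3 = 11, what is the value of x?\nA. 2\nB. 4\nC. 5\nD. 6\nE. 7\nAnswer:",
    "choices": ["A", "B", "C", "D", "E"],
    "gold": [1],  # position of the correct choice ("B") in this made-up example
}

# Mirrors the list comprehension in agieval(): each choice gets a leading space
# so that "Answer:" + " B" tokenizes the same way it would in free-running text.
formatted_choices = [f" {c}" for c in line["choices"]]
print(formatted_choices)  # [' A', ' B', ' C', ' D', ' E']
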
17 changes: 17 additions & 0 deletions src/lighteval/tasks/tasks_table.jsonl
@@ -1,4 +1,21 @@
{"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
{"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
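All seventeen new entries point at the agieval prompt function and report both loglikelihood_acc and the new loglikelihood_acc_norm_nospace metric on the test split. A quick consistency check one could run over the file (illustrative snippet, not part of the PR; the path is relative to the repo root):

import json

# Verify that every agieval entry in the task table is wired up consistently.
with open("src/lighteval/tasks/tasks_table.jsonl") as f:
    for raw in f:
        task = json.loads(raw)
        if task["name"].startswith("agieval:"):
            assert task["prompt_function"] == "agieval"
            assert task["metric"] == ["loglikelihood_acc", "loglikelihood_acc_norm_nospace"]
            assert task["evaluation_splits"] == ["test"]
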
54 changes: 54 additions & 0 deletions tests/reference_scores/reference_task_scores.py
@@ -615,5 +615,59 @@
"lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0": {"acc": 0.2000, "acc_stderr": 0.1333},
"lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0": {"acc": 0.3000, "acc_stderr": 0.1528},
"lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0": {"acc": 0.4000, "acc_stderr": 0.1633},
"lighteval|agieval:_average|0|0": {
[Inline review comment on this line]
Member: those come from YALL or you computed them with lighteval?
Member Author: These are our own results on this model, which is not in YALL iirc - but I tested on a range of models from YALL and we are always within stderr range.

"acc": 0.2125,
"acc_stderr": 0.1323,
"acc_norm": 0.2250,
"acc_norm_stderr": 0.1347,
},
"lighteval|agieval:aqua-rat|0|0": {
"acc": 0.3000,
"acc_stderr": 0.15275,
"acc_norm": 0.3000,
"acc_norm_stderr": 0.15275,
},
"lighteval|agieval:logiqa-en|0|0": {
"acc": 0.1000,
"acc_stderr": 0.1000,
"acc_norm": 0.3000,
"acc_norm_stderr": 0.15275,
},
"lighteval|agieval:lsat-ar|0|0": {
"acc": 0.1000,
"acc_stderr": 0.1000,
"acc_norm": 0.1000,
"acc_norm_stderr": 0.1000,
},
"lighteval|agieval:lsat-lr|0|0": {
"acc": 0.2000,
"acc_stderr": 0.13333,
"acc_norm": 0.2000,
"acc_norm_stderr": 0.13333,
},
"lighteval|agieval:lsat-rc|0|0": {
"acc": 0.3000,
"acc_stderr": 0.15275,
"acc_norm": 0.2000,
"acc_norm_stderr": 0.13333,
},
"lighteval|agieval:sat-en-without-passage|0|0": {
"acc": 0.2000,
"acc_stderr": 0.13333,
"acc_norm": 0.3000,
"acc_norm_stderr": 0.15275,
},
"lighteval|agieval:sat-en|0|0": {
"acc": 0.2000,
"acc_stderr": 0.13333,
"acc_norm": 0.3000,
"acc_norm_stderr": 0.15275,
},
"lighteval|agieval:sat-math|0|0": {
"acc": 0.3000,
"acc_stderr": 0.15275,
"acc_norm": 0.1000,
"acc_norm_stderr": 0.1000,
},
},
}
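The new "agieval:_average" entry matches an unweighted mean of the eight AGIEval sub-task scores above, which suggests the average is computed as a plain per-task mean. A quick check using the values from this file:

# Per-task scores copied from the reference entries above (aqua-rat ... sat-math).
accs      = [0.3, 0.1, 0.1, 0.2, 0.3, 0.2, 0.2, 0.3]
acc_norms = [0.3, 0.3, 0.1, 0.2, 0.2, 0.3, 0.3, 0.1]

print(round(sum(accs) / len(accs), 4))            # 0.2125, matches the "acc" of agieval:_average
print(round(sum(acc_norms) / len(acc_norms), 4))  # 0.225 (= 0.2250), matches its "acc_norm"
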
53 changes: 53 additions & 0 deletions tests/reference_scores/reference_tasks.py
@@ -61,3 +61,56 @@
"helm|boolq|5|0",
"helm|hellaswag|5|0",
]

AGIEVAL_SUBSET = [
"lighteval|agieval:aqua-rat|0|0",
"lighteval|agieval:logiqa-en|0|0",
"lighteval|agieval:lsat-ar|0|0",
"lighteval|agieval:lsat-lr|0|0",
"lighteval|agieval:lsat-rc|0|0",
"lighteval|agieval:sat-en-without-passage|0|0",
"lighteval|agieval:sat-en|0|0",
"lighteval|agieval:sat-math|0|0",
]

BBH_SUBSET = [
"lighteval|bigbench:causal_judgment|3|0",
"harness|bigbench:causal_judgment|3|0",
"lighteval|bigbench:date_understanding|3|0",
"harness|bigbench:date_understanding|3|0",
"lighteval|bigbench:disambiguation_qa|3|0",
"harness|bigbench:disambiguation_qa|3|0",
"lighteval|bigbench:geometric_shapes|3|0",
"harness|bigbench:geometric_shapes|3|0",
"lighteval|bigbench:logical_deduction_five_objects|3|0",
"harness|bigbench:logical_deduction_five_objects|3|0",
"lighteval|bigbench:logical_deduction_seven_objects|3|0",
"harness|bigbench:logical_deduction_seven_objects|3|0",
"lighteval|bigbench:logical_deduction_three_objects|3|0",
"harness|bigbench:logical_deduction_three_objects|3|0",
"lighteval|bigbench:movie_recommendation|3|0",
"harness|bigbench:movie_recommendation|3|0",
"lighteval|bigbench:navigate|3|0",
"harness|bigbench:navigate|3|0",
"lighteval|bigbench:reasoning_about_colored_objects|3|0",
"harness|bigbench:reasoning_about_colored_objects|3|0",
"lighteval|bigbench:ruin_names|3|0",
"harness|bigbench:ruin_names|3|0",
"lighteval|bigbench:salient_translation_error_detection|3|0",
"harness|bigbench:salient_translation_error_detection|3|0",
"lighteval|bigbench:snarks|3|0",
"harness|bigbench:snarks|3|0",
"lighteval|bigbench:sports_understanding|3|0",
"harness|bigbench:sports_understanding|3|0",
"lighteval|bigbench:temporal_sequences|3|0",
"harness|bigbench:temporal_sequences|3|0",
"lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0",
"harness|bigbench:tracking_shuffled_objects_five_objects|3|0",
"lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0",
"harness|bigbench:tracking_shuffled_objects_seven_objects|3|0",
"lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0",
"harness|bigbench:tracking_shuffled_objects_three_objects|3|0",
]

ALL_SUBSETS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET + AGIEVAL_SUBSET
# + BBH_SUBSET - has a problem, to fix!, removed in this commit https://github.com/huggingface/lighteval/pull/7/commits/c136ad59fc74bb3eee6546dcf0802eb8c2f3bcbe
8 changes: 2 additions & 6 deletions tests/test_main.py
@@ -29,11 +29,7 @@
 from lighteval.main_accelerate import main  # noqa: E402
 from run_evals_accelerate import get_parser
 from tests.reference_scores.reference_task_scores import RESULTS_FULL, RESULTS_LITE  # noqa: E402
-from tests.reference_scores.reference_tasks import (  # noqa: E402
-    HELM_SUBSET,
-    LEADERBOARD_SUBSET,
-    STABLE_SUBSET,
-)
+from tests.reference_scores.reference_tasks import ALL_SUBSETS


 # Set env var for deterministic run of models
@@ -46,7 +42,7 @@
 # To add new models or tasks, change here
 # ! The correct results must be present in reference_task_scores
 MODELS = ["gpt2"]
-TASKS = LEADERBOARD_SUBSET + STABLE_SUBSET + HELM_SUBSET
+TASKS = ALL_SUBSETS
 FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False)

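With TASKS switched to ALL_SUBSETS, the AGIEval subset is covered by the same parametrized comparison against the reference scores. One way to exercise it locally (an illustrative invocation, assuming pytest is installed and the command is run from the repo root; the exact workflow is not specified in this PR):

import os
import subprocess

# LIGHTEVAL_FULL_TEST toggles the full reference-score comparison in tests/test_main.py.
env = dict(os.environ, LIGHTEVAL_FULL_TEST="1")
subprocess.run(["python", "-m", "pytest", "tests/test_main.py"], env=env, check=True)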