diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 1d036fa83..074ac4c56 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -111,7 +111,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -173,7 +172,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -238,7 +236,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -294,7 +291,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -351,7 +347,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -393,7 +388,6 @@ def arabic_exams_pfn(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -451,7 +445,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -471,7 +464,6 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -488,7 +480,6 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -505,7 +496,6 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -522,7 +512,6 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -539,7 +528,6 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -556,7 +544,6 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -594,7 +581,6 @@ def boolq_arabic_pfn(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -629,7 +615,6 @@ def copa_arabic_pfn(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -673,7 +658,6 @@ def hellaswag_arabic_pfn(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -710,7 +694,6 @@ def toxigen_arabic_pfn(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -761,7 +744,6 @@ def sciq_arabic_pfn(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, version=0, ) @@ -826,7 +808,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=True, version=0, ) @@ -1038,7 +1019,6 @@ def process_judge_response(response) -> float: hf_avail_splits=["train"], evaluation_splits=["train"], metrics=[wrapped_judge], - 
trust_dataset=True, generation_size=200, stop_sequence=[], version=0, diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 75185113d..200216b00 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -121,7 +121,6 @@ def prompt_bac_fr(line, task_name: str = None): generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) @@ -139,7 +138,6 @@ def prompt_bac_fr(line, task_name: str = None): generation_size=1, metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) diff --git a/community_tasks/serbian_eval.py b/community_tasks/serbian_eval.py index 9cfe85e31..2da4041bf 100644 --- a/community_tasks/serbian_eval.py +++ b/community_tasks/serbian_eval.py @@ -283,10 +283,7 @@ def create_task_config( few_shots_select="sequential", metric=metric, generation_size=generation_size, - # Since we use trust_dataset, we have to be careful about what is inside the dataset - # script. We thus lock the revision to ensure that the script doesn't change hf_revision=HFSubsets.HF_REVISION.value, - trust_dataset=True, version=0, ) diff --git a/community_tasks/turkic_evals.py b/community_tasks/turkic_evals.py index 1a9caa380..f29b7ebd4 100644 --- a/community_tasks/turkic_evals.py +++ b/community_tasks/turkic_evals.py @@ -136,7 +136,6 @@ def __init__( suite=["community"], generation_size=-1, stop_sequence=None, - trust_dataset=False, version=0, ) diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx index 2a54aeaf4..aa09701a8 100644 --- a/docs/source/saving-and-reading-results.mdx +++ b/docs/source/saving-and-reading-results.mdx @@ -182,7 +182,6 @@ The detail file contains the following columns: ], "original_num_docs": 1319, "effective_num_docs": 1, - "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 } diff --git a/examples/custom_tasks_templates/custom_yourbench_task.py b/examples/custom_tasks_templates/custom_yourbench_task.py index c223ea378..dbae4d368 100644 --- a/examples/custom_tasks_templates/custom_yourbench_task.py +++ b/examples/custom_tasks_templates/custom_yourbench_task.py @@ -258,7 +258,6 @@ def yourbench_prompt(line, task_name: str = ""): generation_size=8192, metrics=[Metrics.yourbench_metrics], stop_sequence=[], - trust_dataset=True, version=0, ) diff --git a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py index 961fa264b..3e281a66f 100644 --- a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py +++ b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py @@ -94,8 +94,7 @@ def yourbench_prompt(line, task_name: str = ""): few_shots_split=None, few_shots_select=None, generation_size=8192, - metric=[Metrics.yourbench_metrics], - trust_dataset=True, + metrics=[Metrics.yourbench_metrics], version=0, ) diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py index 46b2f18ab..ceb0b08cd 100644 --- a/examples/custom_tasks_tests.py +++ b/examples/custom_tasks_tests.py @@ -38,7 +38,6 @@ generation_size=512, metrics=[Metrics.expr_gold_metric], stop_sequence=None, - trust_dataset=True, version=0, ) @@ -55,7 +54,6 @@ generation_size=2048, metrics=[Metrics.gpqa_instruct_pass_at_1_1n], stop_sequence=[], # no stop sequence, will use eos token - trust_dataset=True, version=0, ) diff --git a/examples/nanotron/custom_evaluation_tasks.py 
b/examples/nanotron/custom_evaluation_tasks.py index 49010098c..fef7a61d9 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -90,7 +90,6 @@ def preprocess(text): hf_repo="hellaswag", hf_subset="default", metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -99,7 +98,6 @@ def preprocess(text): hf_repo="winogrande", hf_subset="winogrande_xl", metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -108,7 +106,6 @@ def preprocess(text): hf_repo="piqa", hf_subset="plain_text", metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -118,7 +115,6 @@ def preprocess(text): hf_subset="default", hf_avail_splits=["train", "validation"], metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -127,7 +123,6 @@ def preprocess(text): hf_repo="openbookqa", hf_subset="main", metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -138,7 +133,6 @@ def preprocess(text): evaluation_splits=["test"], generation_size=1, metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -149,7 +143,6 @@ def preprocess(text): evaluation_splits=["test"], generation_size=1, metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -158,7 +151,6 @@ def preprocess(text): hf_repo="commonsense_qa", hf_subset="default", metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - trust_dataset=True, stop_sequence=["\n"], ), ] @@ -189,7 +181,6 @@ def natural_questions_prompt(line, task_name: str = None): hf_subset="rc.nocontext", metric=[Metrics.quasi_exact_match], generation_size=20, - trust_dataset=True, stop_sequence=["\n", ".", ","], ), LightevalTaskConfig( @@ -199,7 +190,6 @@ def natural_questions_prompt(line, task_name: str = None): hf_subset="default", metric=[Metrics.quasi_exact_match], generation_size=20, - trust_dataset=True, stop_sequence=["\n", ".", ","], ), ] @@ -228,7 +218,6 @@ def boolq_prompt(line, task_name: str = None): hf_repo="super_glue", hf_subset="boolq", metric=[Metrics.target_perplexity], - trust_dataset=True, stop_sequence=["\n"], ), LightevalTaskConfig( @@ -238,7 +227,6 @@ def boolq_prompt(line, task_name: str = None): hf_subset="deault", metric=[Metrics.quasi_exact_match], generation_size=20, - trust_dataset=True, stop_sequence=["\n", ".", ","], ), ] @@ -266,7 +254,6 @@ def __init__( few_shots_select=None, suite=["custom"], generation_size=40, - trust_dataset=True, stop_sequence=None, ): super().__init__( @@ -281,7 +268,6 @@ def __init__( few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, - trust_dataset=trust_dataset, stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), ) @@ -365,7 +351,6 @@ def __init__( few_shots_select=None, suite=None, generation_size=-1, - trust_dataset=True, stop_sequence=None, ): super().__init__( @@ -380,7 +365,6 @@ def __init__( few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, - 
trust_dataset=trust_dataset, stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), ) @@ -478,7 +462,6 @@ def __init__( few_shots_select=None, suite=None, generation_size=4, - trust_dataset=True, stop_sequence=None, ): super().__init__( @@ -493,7 +476,6 @@ def __init__( few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, - trust_dataset=trust_dataset, stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), ) @@ -610,7 +592,6 @@ def __init__( few_shots_select=None, suite=None, generation_size=-1, - trust_dataset=True, stop_sequence=None, ): super().__init__( @@ -625,7 +606,6 @@ def __init__( few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, - trust_dataset=trust_dataset, stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), ) diff --git a/pyproject.toml b/pyproject.toml index 6cdbbf20d..3d0ceb956 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ dependencies = [ "huggingface_hub[hf_xet]>=0.30.2", "torch>=2.0,<3.0", "GitPython>=3.1.41", # for logging - "datasets>=3.5.0,<4.0.0", + "datasets>=4.0.0", "pydantic", "numpy>=2", # pinned to avoid incompatibilities "hf-xet==1.1.5", # pinned to avoid failing test suite diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index d054060b1..b647dae0a 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -39,7 +39,6 @@ generation_size=30, # expected an answer in a format 'Answer: B' metrics=[Metrics.gpqa_instruct_metric], stop_sequence=None, - trust_dataset=True, version=0, ) mmmu_pro_standard_10_options = LightevalTaskConfig( @@ -55,7 +54,6 @@ generation_size=30, # expected an answer in a format 'Answer: B' metrics=[Metrics.gpqa_instruct_metric], stop_sequence=None, - trust_dataset=True, version=0, ) mmmu_pro_vision = LightevalTaskConfig( @@ -71,14 +69,13 @@ generation_size=30, # expected an answer in a format 'Answer: B' metrics=[Metrics.gpqa_instruct_metric], stop_sequence=None, - trust_dataset=True, version=0, ) abstract_narrative_understanding_bigbench = LightevalTaskConfig( name="abstract_narrative_understanding", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="abstract_narrative_understanding", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -87,7 +84,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) agieval_aqua_rat_lighteval = LightevalTaskConfig( @@ -103,7 +99,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_biology_lighteval = LightevalTaskConfig( @@ -119,7 +114,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_chemistry_lighteval = LightevalTaskConfig( @@ -135,7 +129,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_chinese_lighteval = LightevalTaskConfig( @@ -151,7 +144,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_english_lighteval = LightevalTaskConfig( @@ -167,7 
+159,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_geography_lighteval = LightevalTaskConfig( @@ -183,7 +174,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_history_lighteval = LightevalTaskConfig( @@ -199,7 +189,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_mathqa_lighteval = LightevalTaskConfig( @@ -215,7 +204,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_gaokao_physics_lighteval = LightevalTaskConfig( @@ -231,7 +219,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_logiqa_en_lighteval = LightevalTaskConfig( @@ -247,7 +234,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_logiqa_zh_lighteval = LightevalTaskConfig( @@ -263,7 +249,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_lsat_ar_lighteval = LightevalTaskConfig( @@ -279,7 +264,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_lsat_lr_lighteval = LightevalTaskConfig( @@ -295,7 +279,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_lsat_rc_lighteval = LightevalTaskConfig( @@ -311,7 +294,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_sat_en_lighteval = LightevalTaskConfig( @@ -327,7 +309,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_sat_en_without_passage_lighteval = LightevalTaskConfig( @@ -343,7 +324,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) agieval_sat_math_lighteval = LightevalTaskConfig( @@ -359,7 +339,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=None, - trust_dataset=True, version=0, ) aime24 = LightevalTaskConfig( @@ -452,7 +431,7 @@ name="anachronisms", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="anachronisms", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -461,14 +440,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) analogical_similarity_bigbench = LightevalTaskConfig( name="analogical_similarity", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="analogical_similarity", 
hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -477,14 +455,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) analytic_entailment_bigbench = LightevalTaskConfig( name="analytic_entailment", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="analytic_entailment", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -493,7 +470,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) anli_r1_lighteval = LightevalTaskConfig( @@ -509,7 +485,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) anli_r2_lighteval = LightevalTaskConfig( @@ -525,7 +500,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) anli_r3_lighteval = LightevalTaskConfig( @@ -541,7 +515,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arc_agi_2 = LightevalTaskConfig( @@ -557,7 +530,6 @@ generation_size=2048, metrics=[Metrics.exact_match], stop_sequence=None, - trust_dataset=False, version=0, ) arc_c_letters_original = LightevalTaskConfig( @@ -573,7 +545,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arc_c_options_original = LightevalTaskConfig( @@ -589,7 +560,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arc_c_simple_original = LightevalTaskConfig( @@ -605,7 +575,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arc_challenge_leaderboard = LightevalTaskConfig( @@ -621,7 +590,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arc_easy_lighteval = LightevalTaskConfig( @@ -637,7 +605,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_1dc_lighteval = LightevalTaskConfig( @@ -653,7 +620,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_2da_lighteval = LightevalTaskConfig( @@ -669,7 +635,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_2dm_lighteval = LightevalTaskConfig( @@ -685,7 +650,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_2ds_lighteval = LightevalTaskConfig( @@ -701,7 +665,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_3da_lighteval = LightevalTaskConfig( @@ -717,7 +680,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_3ds_lighteval = LightevalTaskConfig( @@ -733,7 +695,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_4da_lighteval = 
LightevalTaskConfig( @@ -749,7 +710,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_4ds_lighteval = LightevalTaskConfig( @@ -765,7 +725,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_5da_lighteval = LightevalTaskConfig( @@ -781,7 +740,6 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_5ds_lighteval = LightevalTaskConfig( @@ -797,14 +755,13 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) arithmetic_bb_bigbench = LightevalTaskConfig( name="arithmetic_bb", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="arithmetic", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -813,14 +770,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ascii_word_recognition_bigbench = LightevalTaskConfig( name="ascii_word_recognition", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="ascii_word_recognition", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -829,7 +785,6 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) asdiv_lighteval = LightevalTaskConfig( @@ -845,14 +800,13 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) authorship_verification_bigbench = LightevalTaskConfig( name="authorship_verification", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="authorship_verification", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -861,14 +815,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) auto_categorization_bigbench = LightevalTaskConfig( name="auto_categorization", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="auto_categorization", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -877,14 +830,13 @@ generation_size=1, metrics=[Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) auto_debugging_bigbench_lite = LightevalTaskConfig( name="auto_debugging", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_and_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="auto_debugging", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -893,7 +845,6 @@ generation_size=100, metrics=[Metrics.perfect_exact_match], stop_sequence=None, - trust_dataset=True, version=0, ) babi_qa_helm = LightevalTaskConfig( @@ -914,7 +865,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_causal_judgment_lighteval = LightevalTaskConfig( @@ -930,7 +880,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - 
trust_dataset=True, version=0, ) bigbench_date_understanding_lighteval = LightevalTaskConfig( @@ -946,7 +895,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_disambiguation_qa_lighteval = LightevalTaskConfig( @@ -962,7 +910,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_geometric_shapes_lighteval = LightevalTaskConfig( @@ -978,7 +925,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_logical_deduction_five_objects_lighteval = LightevalTaskConfig( @@ -994,7 +940,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_logical_deduction_seven_objects_lighteval = LightevalTaskConfig( @@ -1010,7 +955,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_logical_deduction_three_objects_lighteval = LightevalTaskConfig( @@ -1026,7 +970,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_movie_recommendation_lighteval = LightevalTaskConfig( @@ -1042,7 +985,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_navigate_lighteval = LightevalTaskConfig( @@ -1058,7 +1000,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_reasoning_about_colored_objects_lighteval = LightevalTaskConfig( @@ -1074,7 +1015,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_ruin_names_lighteval = LightevalTaskConfig( @@ -1090,7 +1030,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_salient_translation_error_detection_lighteval = LightevalTaskConfig( @@ -1106,7 +1045,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_snarks_lighteval = LightevalTaskConfig( @@ -1122,7 +1060,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_sports_understanding_lighteval = LightevalTaskConfig( @@ -1138,7 +1075,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_temporal_sequences_lighteval = LightevalTaskConfig( @@ -1154,7 +1090,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_tracking_shuffled_objects_five_objects_lighteval = LightevalTaskConfig( @@ -1170,7 +1105,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_tracking_shuffled_objects_seven_objects_lighteval = LightevalTaskConfig( @@ -1186,7 +1120,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_tracking_shuffled_objects_three_objects_lighteval = LightevalTaskConfig( @@ -1202,7 +1135,6 @@ 
generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bigbench_causal_judgment_harness = LightevalTaskConfig( @@ -1219,7 +1151,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_date_understanding_harness = LightevalTaskConfig( @@ -1236,7 +1167,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_disambiguation_qa_harness = LightevalTaskConfig( @@ -1253,7 +1183,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_geometric_shapes_harness = LightevalTaskConfig( @@ -1270,7 +1199,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_logical_deduction_five_objects_harness = LightevalTaskConfig( @@ -1287,7 +1215,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_logical_deduction_seven_objects_harness = LightevalTaskConfig( @@ -1304,7 +1231,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_logical_deduction_three_objects_harness = LightevalTaskConfig( @@ -1321,7 +1247,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_movie_recommendation_harness = LightevalTaskConfig( @@ -1338,7 +1263,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_navigate_harness = LightevalTaskConfig( @@ -1355,7 +1279,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_reasoning_about_colored_objects_harness = LightevalTaskConfig( @@ -1372,7 +1295,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_ruin_names_harness = LightevalTaskConfig( @@ -1389,7 +1311,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_salient_translation_error_detection_harness = LightevalTaskConfig( @@ -1406,7 +1327,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_snarks_harness = LightevalTaskConfig( @@ -1423,7 +1343,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_sports_understanding_harness = LightevalTaskConfig( @@ -1440,7 +1359,6 @@ metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_temporal_sequences_harness = LightevalTaskConfig( @@ -1457,7 +1375,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( @@ -1474,7 +1391,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( @@ -1491,7 +1407,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bigbench_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( @@ -1508,7 +1423,6 @@ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, - trust_dataset=True, version=0, ) bbh_boolean_expressions_harness = LightevalTaskConfig( @@ -1530,7 +1444,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_causal_judgment_harness = LightevalTaskConfig( @@ -1552,7 +1465,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_date_understanding_harness = LightevalTaskConfig( @@ -1574,7 +1486,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_disambiguation_qa_harness = LightevalTaskConfig( @@ -1596,7 +1507,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_dyck_languages_harness = LightevalTaskConfig( @@ -1618,7 +1528,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_formal_fallacies_harness = LightevalTaskConfig( @@ -1640,7 +1549,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_geometric_shapes_harness = LightevalTaskConfig( @@ -1662,7 +1570,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_hyperbaton_harness = LightevalTaskConfig( @@ -1684,7 +1591,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_logical_deduction_five_objects_harness = LightevalTaskConfig( @@ -1706,7 +1612,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_logical_deduction_seven_objects_harness = LightevalTaskConfig( @@ -1728,7 +1633,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_logical_deduction_three_objects_harness = LightevalTaskConfig( @@ -1750,7 +1654,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_movie_recommendation_harness = LightevalTaskConfig( @@ -1772,7 +1675,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_multistep_arithmetic_two_harness = LightevalTaskConfig( @@ -1794,7 +1696,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_navigate_harness 
= LightevalTaskConfig( @@ -1816,7 +1717,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_object_counting_harness = LightevalTaskConfig( @@ -1838,7 +1738,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_penguins_in_a_table_harness = LightevalTaskConfig( @@ -1860,7 +1759,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_reasoning_about_colored_objects_harness = LightevalTaskConfig( @@ -1882,7 +1780,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_ruin_names_harness = LightevalTaskConfig( @@ -1904,7 +1801,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_salient_translation_error_detection_harness = LightevalTaskConfig( @@ -1926,7 +1822,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_snarks_harness = LightevalTaskConfig( @@ -1948,7 +1843,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_sports_understanding_harness = LightevalTaskConfig( @@ -1970,7 +1864,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_temporal_sequences_harness = LightevalTaskConfig( @@ -1992,7 +1885,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( @@ -2014,7 +1906,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( @@ -2036,7 +1927,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( @@ -2058,7 +1948,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_web_of_lies_harness = LightevalTaskConfig( @@ -2080,7 +1969,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbh_word_sorting_harness = LightevalTaskConfig( @@ -2102,7 +1990,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["", "Q=", "\n\n"], - trust_dataset=True, version=0, ) bbq_helm = LightevalTaskConfig( @@ -2124,7 +2011,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Age_helm = LightevalTaskConfig( @@ -2146,7 +2032,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Disability_status_helm = LightevalTaskConfig( @@ -2168,7 +2053,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Gender_identity_helm = LightevalTaskConfig( @@ -2190,7 +2074,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Nationality_helm = LightevalTaskConfig( @@ -2212,7 +2095,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Physical_appearance_helm = LightevalTaskConfig( @@ -2234,7 +2116,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Race_ethnicity_helm = LightevalTaskConfig( @@ -2256,7 +2137,6 @@ Metrics.perfect_exact_match, ], 
stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Race_x_SES_helm = LightevalTaskConfig( @@ -2278,7 +2158,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Race_x_gender_helm = LightevalTaskConfig( @@ -2300,7 +2179,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Religion_helm = LightevalTaskConfig( @@ -2322,7 +2200,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_SES_helm = LightevalTaskConfig( @@ -2344,7 +2221,6 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_Sexual_orientation_helm = LightevalTaskConfig( @@ -2366,14 +2242,13 @@ Metrics.perfect_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bbq_lite_json_bigbench_lite = LightevalTaskConfig( name="bbq_lite_json", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="bbq_lite_json", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -2382,7 +2257,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_auto_debugging_helm = LightevalTaskConfig( @@ -2398,7 +2272,6 @@ generation_size=100, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_age_ambig_helm = LightevalTaskConfig( @@ -2419,7 +2292,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_age_disambig_helm = LightevalTaskConfig( @@ -2440,7 +2312,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_disability_status_ambig_helm = LightevalTaskConfig( @@ -2461,7 +2332,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_disability_status_disambig_helm = LightevalTaskConfig( @@ -2482,7 +2352,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_gender_identity_ambig_helm = LightevalTaskConfig( @@ -2503,7 +2372,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_gender_identity_disambig_helm = LightevalTaskConfig( @@ -2524,7 +2392,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_nationality_ambig_helm = LightevalTaskConfig( @@ -2545,7 +2412,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_nationality_disambig_helm = LightevalTaskConfig( @@ -2566,7 +2432,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_physical_appearance_ambig_helm = LightevalTaskConfig( @@ -2587,7 +2452,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_physical_appearance_disambig_helm = LightevalTaskConfig( @@ -2608,7 +2472,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_race_ethnicity_ambig_helm = LightevalTaskConfig( @@ -2629,7 +2492,6 @@ Metrics.prefix_quasi_exact_match, 
], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_race_ethnicity_disambig_helm = LightevalTaskConfig( @@ -2650,7 +2512,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_religion_ambig_helm = LightevalTaskConfig( @@ -2671,7 +2532,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_religion_disambig_helm = LightevalTaskConfig( @@ -2692,7 +2552,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_ses_ambig_helm = LightevalTaskConfig( @@ -2713,7 +2572,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_ses_disambig_helm = LightevalTaskConfig( @@ -2734,7 +2592,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_sexual_orientation_ambig_helm = LightevalTaskConfig( @@ -2755,7 +2612,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_bbq_lite_json_sexual_orientation_disambig_helm = LightevalTaskConfig( @@ -2776,7 +2632,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_code_line_description_helm = LightevalTaskConfig( @@ -2797,7 +2652,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conceptual_combinations_contradictions_helm = LightevalTaskConfig( @@ -2818,7 +2672,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conceptual_combinations_emergent_properties_helm = LightevalTaskConfig( @@ -2839,7 +2692,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conceptual_combinations_fanciful_fictional_combinations_helm = LightevalTaskConfig( @@ -2860,7 +2712,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conceptual_combinations_homonyms_helm = LightevalTaskConfig( @@ -2881,7 +2732,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conceptual_combinations_invented_words_helm = LightevalTaskConfig( @@ -2902,7 +2752,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_adna_from_helm = LightevalTaskConfig( @@ -2918,7 +2767,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_adna_to_helm = LightevalTaskConfig( @@ -2934,7 +2782,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_atikampe_from_helm = LightevalTaskConfig( @@ -2950,7 +2797,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_atikampe_to_helm = LightevalTaskConfig( @@ -2966,7 +2812,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_gornam_from_helm = LightevalTaskConfig( @@ -2982,7 +2827,6 @@ generation_size=100, metrics=[Metrics.rouge1, 
Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_gornam_to_helm = LightevalTaskConfig( @@ -2998,7 +2842,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_holuan_from_helm = LightevalTaskConfig( @@ -3014,7 +2857,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_holuan_to_helm = LightevalTaskConfig( @@ -3030,7 +2872,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_mkafala_from_helm = LightevalTaskConfig( @@ -3046,7 +2887,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_mkafala_to_helm = LightevalTaskConfig( @@ -3062,7 +2902,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_postpositive_english_from_helm = LightevalTaskConfig( @@ -3078,7 +2917,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_postpositive_english_to_helm = LightevalTaskConfig( @@ -3094,7 +2932,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_unapuri_from_helm = LightevalTaskConfig( @@ -3110,7 +2947,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_unapuri_to_helm = LightevalTaskConfig( @@ -3126,7 +2962,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_vaomi_from_helm = LightevalTaskConfig( @@ -3142,7 +2977,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_conlang_translation_vaomi_to_helm = LightevalTaskConfig( @@ -3158,7 +2992,6 @@ generation_size=100, metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_emoji_movie_helm = LightevalTaskConfig( @@ -3179,7 +3012,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_formal_fallacies_syllogisms_negation_helm = LightevalTaskConfig( @@ -3200,7 +3032,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_hindu_knowledge_helm = LightevalTaskConfig( @@ -3221,7 +3052,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_known_unknowns_helm = LightevalTaskConfig( @@ -3242,7 +3072,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_language_identification_helm = LightevalTaskConfig( @@ -3263,7 +3092,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_linguistics_puzzles_helm = LightevalTaskConfig( @@ -3279,7 +3107,6 @@ 
generation_size=100, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_logic_grid_puzzle_helm = LightevalTaskConfig( @@ -3300,7 +3127,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_logical_deduction_five_objects_helm = LightevalTaskConfig( @@ -3321,7 +3147,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_logical_deduction_seven_objects_helm = LightevalTaskConfig( @@ -3342,7 +3167,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_logical_deduction_three_objects_helm = LightevalTaskConfig( @@ -3363,7 +3187,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_misconceptions_russian_helm = LightevalTaskConfig( @@ -3384,7 +3207,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_novel_concepts_helm = LightevalTaskConfig( @@ -3405,7 +3227,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_operators_helm = LightevalTaskConfig( @@ -3421,7 +3242,6 @@ generation_size=100, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_parsinlu_reading_comprehension_helm = LightevalTaskConfig( @@ -3437,7 +3257,6 @@ generation_size=100, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_play_dialog_same_or_different_helm = LightevalTaskConfig( @@ -3458,7 +3277,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_repeat_copy_logic_helm = LightevalTaskConfig( @@ -3474,7 +3292,6 @@ generation_size=100, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_strange_stories_boolean_helm = LightevalTaskConfig( @@ -3495,7 +3312,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_strange_stories_multiple_choice_helm = LightevalTaskConfig( @@ -3516,7 +3332,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_strategyqa_helm = LightevalTaskConfig( @@ -3537,7 +3352,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_symbol_interpretation_adversarial_helm = LightevalTaskConfig( @@ -3558,7 +3372,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_symbol_interpretation_emoji_agnostic_helm = LightevalTaskConfig( @@ -3579,7 +3392,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_symbol_interpretation_name_agnostic_helm = LightevalTaskConfig( @@ -3600,7 +3412,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_symbol_interpretation_plain_helm = LightevalTaskConfig( @@ -3621,7 +3432,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_symbol_interpretation_tricky_helm = LightevalTaskConfig( @@ -3642,7 +3452,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_vitaminc_fact_verification_helm = LightevalTaskConfig( @@ 
-3663,7 +3472,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bigbench_winowhy_helm = LightevalTaskConfig( @@ -3684,7 +3492,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_adjunct_island_lighteval = LightevalTaskConfig( @@ -3700,7 +3507,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_adjunct_island_helm = LightevalTaskConfig( @@ -3716,7 +3522,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_anaphor_gender_agreement_lighteval = LightevalTaskConfig( @@ -3732,7 +3537,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_anaphor_gender_agreement_helm = LightevalTaskConfig( @@ -3748,7 +3552,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_anaphor_number_agreement_lighteval = LightevalTaskConfig( @@ -3764,7 +3567,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_anaphor_number_agreement_helm = LightevalTaskConfig( @@ -3780,7 +3582,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_animate_subject_passive_lighteval = LightevalTaskConfig( @@ -3796,7 +3597,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_animate_subject_passive_helm = LightevalTaskConfig( @@ -3812,7 +3612,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_animate_subject_trans_lighteval = LightevalTaskConfig( @@ -3828,7 +3627,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_animate_subject_trans_helm = LightevalTaskConfig( @@ -3844,7 +3642,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_causative_lighteval = LightevalTaskConfig( @@ -3860,7 +3657,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_causative_helm = LightevalTaskConfig( @@ -3876,7 +3672,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_complex_NP_island_lighteval = LightevalTaskConfig( @@ -3892,7 +3687,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_complex_NP_island_helm = LightevalTaskConfig( @@ -3908,7 +3702,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_coordinate_structure_constraint_complex_left_branch_lighteval = LightevalTaskConfig( @@ -3924,7 +3717,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_coordinate_structure_constraint_complex_left_branch_helm = LightevalTaskConfig( @@ -3940,7 +3732,6 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_coordinate_structure_constraint_object_extraction_lighteval = LightevalTaskConfig( @@ -3956,7 +3747,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_coordinate_structure_constraint_object_extraction_helm = LightevalTaskConfig( @@ -3972,7 +3762,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_1_lighteval = LightevalTaskConfig( @@ -3988,7 +3777,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_1_helm = LightevalTaskConfig( @@ -4004,7 +3792,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_2_lighteval = LightevalTaskConfig( @@ -4020,7 +3807,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_2_helm = LightevalTaskConfig( @@ -4036,7 +3822,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_irregular_1_lighteval = LightevalTaskConfig( @@ -4052,7 +3837,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_irregular_1_helm = LightevalTaskConfig( @@ -4068,7 +3852,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_irregular_2_lighteval = LightevalTaskConfig( @@ -4084,7 +3867,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_irregular_2_helm = LightevalTaskConfig( @@ -4100,7 +3882,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adj_2_lighteval = LightevalTaskConfig( @@ -4116,7 +3897,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adj_2_helm = LightevalTaskConfig( @@ -4132,7 +3912,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adj_irregular_1_lighteval = LightevalTaskConfig( @@ -4148,7 +3927,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adj_irregular_1_helm = LightevalTaskConfig( @@ -4164,7 +3942,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adj_irregular_2_lighteval = LightevalTaskConfig( @@ -4180,7 +3957,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
blimp_determiner_noun_agreement_with_adj_irregular_2_helm = LightevalTaskConfig( @@ -4196,7 +3972,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adjective_1_lighteval = LightevalTaskConfig( @@ -4212,7 +3987,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_determiner_noun_agreement_with_adjective_1_helm = LightevalTaskConfig( @@ -4228,7 +4002,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_distractor_agreement_relational_noun_lighteval = LightevalTaskConfig( @@ -4244,7 +4017,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_distractor_agreement_relational_noun_helm = LightevalTaskConfig( @@ -4260,7 +4032,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_distractor_agreement_relative_clause_lighteval = LightevalTaskConfig( @@ -4276,7 +4047,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_distractor_agreement_relative_clause_helm = LightevalTaskConfig( @@ -4292,7 +4062,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_drop_argument_lighteval = LightevalTaskConfig( @@ -4308,7 +4077,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_drop_argument_helm = LightevalTaskConfig( @@ -4324,7 +4092,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_ellipsis_n_bar_1_lighteval = LightevalTaskConfig( @@ -4340,7 +4107,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_ellipsis_n_bar_1_helm = LightevalTaskConfig( @@ -4356,7 +4122,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_ellipsis_n_bar_2_lighteval = LightevalTaskConfig( @@ -4372,7 +4137,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_ellipsis_n_bar_2_helm = LightevalTaskConfig( @@ -4388,7 +4152,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_object_raising_lighteval = LightevalTaskConfig( @@ -4404,7 +4167,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_object_raising_helm = LightevalTaskConfig( @@ -4420,7 +4182,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_quantifiers_1_lighteval = LightevalTaskConfig( @@ -4436,7 +4197,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_quantifiers_1_helm = LightevalTaskConfig( @@ -4452,7 
+4212,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_quantifiers_2_lighteval = LightevalTaskConfig( @@ -4468,7 +4227,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_quantifiers_2_helm = LightevalTaskConfig( @@ -4484,7 +4242,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_subject_raising_lighteval = LightevalTaskConfig( @@ -4500,7 +4257,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_existential_there_subject_raising_helm = LightevalTaskConfig( @@ -4516,7 +4272,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_expletive_it_object_raising_lighteval = LightevalTaskConfig( @@ -4532,7 +4287,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_expletive_it_object_raising_helm = LightevalTaskConfig( @@ -4548,7 +4302,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_inchoative_lighteval = LightevalTaskConfig( @@ -4564,7 +4317,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_inchoative_helm = LightevalTaskConfig( @@ -4580,7 +4332,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_intransitive_lighteval = LightevalTaskConfig( @@ -4596,7 +4347,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_intransitive_helm = LightevalTaskConfig( @@ -4612,7 +4362,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_past_participle_adjectives_lighteval = LightevalTaskConfig( @@ -4628,7 +4377,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_past_participle_adjectives_helm = LightevalTaskConfig( @@ -4644,7 +4392,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_past_participle_verbs_lighteval = LightevalTaskConfig( @@ -4660,7 +4407,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_past_participle_verbs_helm = LightevalTaskConfig( @@ -4676,7 +4422,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( @@ -4692,7 +4437,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( @@ -4708,7 +4452,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( @@ -4724,7 +4467,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_irregular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( @@ -4740,7 +4482,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_left_branch_island_echo_question_lighteval = LightevalTaskConfig( @@ -4756,7 +4497,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_left_branch_island_echo_question_helm = LightevalTaskConfig( @@ -4772,7 +4512,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_left_branch_island_simple_question_lighteval = LightevalTaskConfig( @@ -4788,7 +4527,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_left_branch_island_simple_question_helm = LightevalTaskConfig( @@ -4804,7 +4542,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_matrix_question_npi_licensor_present_lighteval = LightevalTaskConfig( @@ -4820,7 +4557,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_matrix_question_npi_licensor_present_helm = LightevalTaskConfig( @@ -4836,7 +4572,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_npi_present_1_lighteval = LightevalTaskConfig( @@ -4852,7 +4587,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_npi_present_1_helm = LightevalTaskConfig( @@ -4868,7 +4602,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_npi_present_2_lighteval = LightevalTaskConfig( @@ -4884,7 +4617,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_npi_present_2_helm = LightevalTaskConfig( @@ -4900,7 +4632,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_only_npi_licensor_present_lighteval = LightevalTaskConfig( @@ -4916,7 +4647,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_only_npi_licensor_present_helm = LightevalTaskConfig( @@ -4932,7 +4662,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_only_npi_scope_lighteval = LightevalTaskConfig( @@ -4948,7 +4677,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_only_npi_scope_helm = LightevalTaskConfig( @@ -4964,7 +4692,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_passive_1_lighteval = 
LightevalTaskConfig( @@ -4980,7 +4707,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_passive_1_helm = LightevalTaskConfig( @@ -4996,7 +4722,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_passive_2_lighteval = LightevalTaskConfig( @@ -5012,7 +4737,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_passive_2_helm = LightevalTaskConfig( @@ -5028,7 +4752,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_c_command_lighteval = LightevalTaskConfig( @@ -5044,7 +4767,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_c_command_helm = LightevalTaskConfig( @@ -5060,7 +4782,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_case_1_lighteval = LightevalTaskConfig( @@ -5076,7 +4797,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_case_1_helm = LightevalTaskConfig( @@ -5092,7 +4812,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_case_2_lighteval = LightevalTaskConfig( @@ -5108,7 +4827,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_case_2_helm = LightevalTaskConfig( @@ -5124,7 +4842,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_domain_1_lighteval = LightevalTaskConfig( @@ -5140,7 +4857,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_domain_1_helm = LightevalTaskConfig( @@ -5156,7 +4872,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_domain_2_lighteval = LightevalTaskConfig( @@ -5172,7 +4887,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_domain_2_helm = LightevalTaskConfig( @@ -5188,7 +4902,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_domain_3_lighteval = LightevalTaskConfig( @@ -5204,7 +4917,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_domain_3_helm = LightevalTaskConfig( @@ -5220,7 +4932,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_reconstruction_lighteval = LightevalTaskConfig( @@ -5236,7 +4947,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_principle_A_reconstruction_helm = LightevalTaskConfig( @@ -5252,7 +4962,6 @@ 
generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_regular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( @@ -5268,7 +4977,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_regular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( @@ -5284,7 +4992,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_regular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( @@ -5300,7 +5007,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_regular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( @@ -5316,7 +5022,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_sentential_negation_npi_licensor_present_lighteval = LightevalTaskConfig( @@ -5332,7 +5037,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_sentential_negation_npi_licensor_present_helm = LightevalTaskConfig( @@ -5348,7 +5052,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_sentential_negation_npi_scope_lighteval = LightevalTaskConfig( @@ -5364,7 +5067,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_sentential_negation_npi_scope_helm = LightevalTaskConfig( @@ -5380,7 +5082,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_sentential_subject_island_lighteval = LightevalTaskConfig( @@ -5396,7 +5097,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_sentential_subject_island_helm = LightevalTaskConfig( @@ -5412,7 +5112,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_superlative_quantifiers_1_lighteval = LightevalTaskConfig( @@ -5428,7 +5127,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_superlative_quantifiers_1_helm = LightevalTaskConfig( @@ -5444,7 +5142,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_superlative_quantifiers_2_lighteval = LightevalTaskConfig( @@ -5460,7 +5157,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_superlative_quantifiers_2_helm = LightevalTaskConfig( @@ -5476,7 +5172,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_tough_vs_raising_1_lighteval = LightevalTaskConfig( @@ -5492,7 +5187,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_tough_vs_raising_1_helm = LightevalTaskConfig( @@ -5508,7 +5202,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_tough_vs_raising_2_lighteval = LightevalTaskConfig( @@ -5524,7 +5217,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_tough_vs_raising_2_helm = LightevalTaskConfig( @@ -5540,7 +5232,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_transitive_lighteval = LightevalTaskConfig( @@ -5556,7 +5247,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_transitive_helm = LightevalTaskConfig( @@ -5572,7 +5262,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_island_lighteval = LightevalTaskConfig( @@ -5588,7 +5277,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_island_helm = LightevalTaskConfig( @@ -5604,7 +5292,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_questions_object_gap_lighteval = LightevalTaskConfig( @@ -5620,7 +5307,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_questions_object_gap_helm = LightevalTaskConfig( @@ -5636,7 +5322,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_questions_subject_gap_lighteval = LightevalTaskConfig( @@ -5652,7 +5337,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_questions_subject_gap_helm = LightevalTaskConfig( @@ -5668,7 +5352,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_questions_subject_gap_long_distance_lighteval = LightevalTaskConfig( @@ -5684,7 +5367,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_questions_subject_gap_long_distance_helm = LightevalTaskConfig( @@ -5700,7 +5382,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_no_gap_lighteval = LightevalTaskConfig( @@ -5716,7 +5397,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_no_gap_helm = LightevalTaskConfig( @@ -5732,7 +5412,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_no_gap_long_distance_lighteval = LightevalTaskConfig( @@ -5748,7 +5427,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_no_gap_long_distance_helm = LightevalTaskConfig( @@ -5764,7 +5442,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_with_gap_lighteval = LightevalTaskConfig( @@ -5780,7 +5457,6 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_with_gap_helm = LightevalTaskConfig( @@ -5796,7 +5472,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_with_gap_long_distance_lighteval = LightevalTaskConfig( @@ -5812,7 +5487,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) blimp_wh_vs_that_with_gap_long_distance_helm = LightevalTaskConfig( @@ -5828,7 +5502,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bold_helm = LightevalTaskConfig( @@ -5844,7 +5517,6 @@ generation_size=100, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bold_gender_helm = LightevalTaskConfig( @@ -5860,7 +5532,6 @@ generation_size=100, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bold_political_ideology_helm = LightevalTaskConfig( @@ -5876,7 +5547,6 @@ generation_size=100, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bold_profession_helm = LightevalTaskConfig( @@ -5892,7 +5562,6 @@ generation_size=100, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bold_race_helm = LightevalTaskConfig( @@ -5908,7 +5577,6 @@ generation_size=100, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bold_religious_ideology_helm = LightevalTaskConfig( @@ -5924,7 +5592,6 @@ generation_size=100, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) boolq_helm = LightevalTaskConfig( @@ -5945,7 +5612,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) boolq_contrastset_helm = LightevalTaskConfig( @@ -5966,14 +5632,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) bridging_anaphora_resolution_barqa_bigbench = LightevalTaskConfig( name="bridging_anaphora_resolution_barqa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="bridging_anaphora_resolution_barqa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -5982,14 +5647,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) causal_judgment_bigbench = LightevalTaskConfig( name="causal_judgment", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="causal_judgment", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -5998,14 +5662,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) cause_and_effect_bigbench = LightevalTaskConfig( name="cause_and_effect", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="cause_and_effect", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6014,14 +5677,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - 
trust_dataset=True, version=0, ) checkmate_in_one_bigbench = LightevalTaskConfig( name="checkmate_in_one", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="checkmate_in_one", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6030,14 +5692,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) chess_state_tracking_bigbench = LightevalTaskConfig( name="chess_state_tracking", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="chess_state_tracking", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6046,14 +5707,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) chinese_remainder_theorem_bigbench = LightevalTaskConfig( name="chinese_remainder_theorem", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="chinese_remainder_theorem", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6062,14 +5722,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) cifar10_classification_bigbench = LightevalTaskConfig( name="cifar10_classification", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="cifar10_classification", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6078,7 +5737,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_helm = LightevalTaskConfig( @@ -6101,7 +5759,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_LGBTQ_helm = LightevalTaskConfig( @@ -6124,7 +5781,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_black_helm = LightevalTaskConfig( @@ -6147,7 +5803,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_christian_helm = LightevalTaskConfig( @@ -6170,7 +5825,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_female_helm = LightevalTaskConfig( @@ -6193,7 +5847,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_male_helm = LightevalTaskConfig( @@ -6216,7 +5869,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_muslim_helm = LightevalTaskConfig( @@ -6239,7 +5891,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_other_religions_helm = LightevalTaskConfig( @@ -6262,7 +5913,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) civil_comments_white_helm = LightevalTaskConfig( @@ -6285,14 +5935,13 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) code_line_description_bigbench_lite = LightevalTaskConfig( name="code_line_description", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_and_after_query, - hf_repo="bigbench", + 
hf_repo="tasksource/bigbench", hf_subset="code_line_description", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6301,14 +5950,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) codenames_bigbench = LightevalTaskConfig( name="codenames", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="codenames", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6317,14 +5965,13 @@ generation_size=1, metrics=[Metrics.rouge_t5, Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) color_bigbench = LightevalTaskConfig( name="color", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="color", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6333,14 +5980,13 @@ generation_size=1, metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) common_morpheme_bigbench = LightevalTaskConfig( name="common_morpheme", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="common_morpheme", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6349,7 +5995,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) commonsenseqa_helm = LightevalTaskConfig( @@ -6370,14 +6015,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) conceptual_combinations_bigbench_lite = LightevalTaskConfig( name="conceptual_combinations", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="conceptual_combinations", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6386,14 +6030,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) conlang_translation_bigbench_lite = LightevalTaskConfig( name="conlang_translation", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="conlang_translation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6402,14 +6045,13 @@ generation_size=100, metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.perfect_exact_match], stop_sequence=[".", ";", "!", "?"], - trust_dataset=True, version=0, ) contextual_parametric_knowledge_conflicts_bigbench = LightevalTaskConfig( name="contextual_parametric_knowledge_conflicts", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="contextual_parametric_knowledge_conflicts", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6418,7 +6060,6 @@ generation_size=1, metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_n_books_1000_extractions_per_book_1_prefix_length_125_helm = LightevalTaskConfig( @@ 
-6434,7 +6075,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_n_books_1000_extractions_per_book_1_prefix_length_25_helm = LightevalTaskConfig( @@ -6450,7 +6090,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_n_books_1000_extractions_per_book_1_prefix_length_5_helm = LightevalTaskConfig( @@ -6466,7 +6105,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_n_books_1000_extractions_per_book_3_prefix_length_125_helm = LightevalTaskConfig( @@ -6482,7 +6120,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_n_books_1000_extractions_per_book_3_prefix_length_25_helm = LightevalTaskConfig( @@ -6498,7 +6135,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_n_books_1000_extractions_per_book_3_prefix_length_5_helm = LightevalTaskConfig( @@ -6514,7 +6150,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_oh_the_places_helm = LightevalTaskConfig( @@ -6530,7 +6165,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_pilot_helm = LightevalTaskConfig( @@ -6546,7 +6180,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_popular_books_prefix_length_10_helm = LightevalTaskConfig( @@ -6562,7 +6195,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_popular_books_prefix_length_125_helm = LightevalTaskConfig( @@ -6578,7 +6210,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_popular_books_prefix_length_25_helm = LightevalTaskConfig( @@ -6594,7 +6225,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_popular_books_prefix_length_250_helm = LightevalTaskConfig( @@ -6610,7 +6240,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_popular_books_prefix_length_5_helm = LightevalTaskConfig( @@ -6626,7 +6255,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_popular_books_prefix_length_50_helm = LightevalTaskConfig( @@ -6642,7 +6270,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_prompt_num_line_1_min_lines_20_helm = LightevalTaskConfig( @@ -6658,7 +6285,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_prompt_num_line_10_min_lines_20_helm = LightevalTaskConfig( @@ -6674,7 +6300,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) copyright_prompt_num_line_5_min_lines_20_helm = LightevalTaskConfig( @@ -6690,7 +6315,6 @@ generation_size=100, metrics=[Metrics.copyright], stop_sequence=["\n"], - trust_dataset=True, version=0, ) coqa_first_question = LightevalTaskConfig( @@ -6729,7 +6353,6 @@ generation_size=10, metrics=[Metrics.perfect_exact_match, Metrics.f1_score], stop_sequence=["\n"], - trust_dataset=True, 
version=0, ) covid_dialogue_helm = LightevalTaskConfig( @@ -6752,14 +6375,13 @@ Metrics.bleu_4, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) crash_blossom_bigbench = LightevalTaskConfig( name="crash_blossom", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="crash_blossom", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6768,14 +6390,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) crass_ai_bigbench = LightevalTaskConfig( name="crass_ai", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="crass_ai", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6784,14 +6405,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) cryobiology_spanish_bigbench = LightevalTaskConfig( name="cryobiology_spanish", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="cryobiology_spanish", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6800,14 +6420,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) cryptonite_bigbench = LightevalTaskConfig( name="cryptonite", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="cryptonite", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6816,14 +6435,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) cs_algorithms_bigbench = LightevalTaskConfig( name="cs_algorithms", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="cs_algorithms", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6832,14 +6450,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) dark_humor_detection_bigbench = LightevalTaskConfig( name="dark_humor_detection", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="dark_humor_detection", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6848,14 +6465,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) date_understanding_bigbench = LightevalTaskConfig( name="date_understanding", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="date_understanding", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6864,14 +6480,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) disambiguation_qa_bigbench = LightevalTaskConfig( name="disambiguation_qa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="disambiguation_qa", hf_avail_splits=["default", "train", "validation"], 
evaluation_splits=["default"], @@ -6880,14 +6495,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) discourse_marker_prediction_bigbench = LightevalTaskConfig( name="discourse_marker_prediction", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="discourse_marker_prediction", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6896,14 +6510,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) disfl_qa_bigbench = LightevalTaskConfig( name="disfl_qa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="disfl_qa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -6912,7 +6525,6 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) drop_qa = LightevalTaskConfig( @@ -6966,7 +6578,6 @@ generation_size=5, metrics=[Metrics.exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) dyck_language_3_helm = LightevalTaskConfig( @@ -6982,7 +6593,6 @@ generation_size=5, metrics=[Metrics.exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) dyck_language_4_helm = LightevalTaskConfig( @@ -6998,14 +6608,13 @@ generation_size=5, metrics=[Metrics.exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) dyck_languages_bigbench = LightevalTaskConfig( name="dyck_languages", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="dyck_languages", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7014,14 +6623,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) elementary_math_qa_bigbench = LightevalTaskConfig( name="elementary_math_qa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="elementary_math_qa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7030,14 +6638,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) emoji_movie_bigbench_lite = LightevalTaskConfig( name="emoji_movie", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="emoji_movie", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7046,14 +6653,13 @@ generation_size=1, metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) emojis_emotion_prediction_bigbench = LightevalTaskConfig( name="emojis_emotion_prediction", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="emojis_emotion_prediction", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7062,14 +6668,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) empirical_judgments_bigbench = 
LightevalTaskConfig( name="empirical_judgments", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="empirical_judgments", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7078,14 +6683,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) english_proverbs_bigbench = LightevalTaskConfig( name="english_proverbs", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="english_proverbs", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7094,14 +6698,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) english_russian_proverbs_bigbench = LightevalTaskConfig( name="english_russian_proverbs", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="english_russian_proverbs", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7110,14 +6713,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entailed_polarity_bigbench = LightevalTaskConfig( name="entailed_polarity", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="entailed_polarity", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7126,14 +6728,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entailed_polarity_hindi_bigbench = LightevalTaskConfig( name="entailed_polarity_hindi", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="entailed_polarity_hindi", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7142,7 +6743,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_data_imputation_Buy_helm = LightevalTaskConfig( @@ -7163,7 +6763,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_data_imputation_Restaurant_helm = LightevalTaskConfig( @@ -7184,7 +6783,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Abt_Buy_helm = LightevalTaskConfig( @@ -7205,7 +6803,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Amazon_Google_helm = LightevalTaskConfig( @@ -7226,7 +6823,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Beer_helm = LightevalTaskConfig( @@ -7247,7 +6843,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Company_helm = LightevalTaskConfig( @@ -7268,7 +6863,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_DBLP_ACM_helm = LightevalTaskConfig( @@ -7289,7 +6883,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_DBLP_GoogleScholar_helm = LightevalTaskConfig( @@ -7310,7 +6903,6 @@ 
Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Dirty_DBLP_ACM_helm = LightevalTaskConfig( @@ -7331,7 +6923,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Dirty_DBLP_GoogleScholar_helm = LightevalTaskConfig( @@ -7352,7 +6943,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Dirty_Walmart_Amazon_helm = LightevalTaskConfig( @@ -7373,7 +6963,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Dirty_iTunes_Amazon_helm = LightevalTaskConfig( @@ -7394,7 +6983,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Fodors_Zagats_helm = LightevalTaskConfig( @@ -7415,7 +7003,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_Walmart_Amazon_helm = LightevalTaskConfig( @@ -7436,7 +7023,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) entity_matching_iTunes_Amazon_helm = LightevalTaskConfig( @@ -7457,14 +7043,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) epistemic_reasoning_bigbench = LightevalTaskConfig( name="epistemic_reasoning", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="epistemic_reasoning", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7473,7 +7058,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ethics_commonsense_lighteval = LightevalTaskConfig( @@ -7489,7 +7073,6 @@ generation_size=5, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ethics_deontology_lighteval = LightevalTaskConfig( @@ -7505,7 +7088,6 @@ generation_size=5, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ethics_justice_lighteval = LightevalTaskConfig( @@ -7521,7 +7103,6 @@ generation_size=5, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ethics_utilitarianism_lighteval = LightevalTaskConfig( @@ -7537,7 +7118,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ethics_virtue_lighteval = LightevalTaskConfig( @@ -7553,14 +7133,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) evaluating_information_essentiality_bigbench = LightevalTaskConfig( name="evaluating_information_essentiality", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="evaluating_information_essentiality", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7569,14 +7148,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) fact_checker_bigbench = LightevalTaskConfig( name="fact_checker", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="fact_checker", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7585,14 +7163,13 
@@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) fantasy_reasoning_bigbench = LightevalTaskConfig( name="fantasy_reasoning", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="fantasy_reasoning", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7601,14 +7178,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) few_shot_nlg_bigbench = LightevalTaskConfig( name="few_shot_nlg", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="few_shot_nlg", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7617,14 +7193,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.bleurt], stop_sequence=["\n"], - trust_dataset=True, version=0, ) figure_of_speech_detection_bigbench = LightevalTaskConfig( name="figure_of_speech_detection", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="figure_of_speech_detection", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7633,14 +7208,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) formal_fallacies_syllogisms_negation_bigbench_lite = LightevalTaskConfig( name="formal_fallacies_syllogisms_negation", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="formal_fallacies_syllogisms_negation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7649,14 +7223,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) gem_bigbench = LightevalTaskConfig( name="gem", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="gem", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7665,14 +7238,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5], stop_sequence=["\n"], - trust_dataset=True, version=0, ) gender_inclusive_sentences_german_bigbench = LightevalTaskConfig( name="gender_inclusive_sentences_german", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="gender_inclusive_sentences_german", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7681,14 +7253,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) general_knowledge_bigbench = LightevalTaskConfig( name="general_knowledge", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="general_knowledge", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7697,14 +7268,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) geometric_shapes_bigbench = LightevalTaskConfig( name="geometric_shapes", suite=["bigbench", "bigbench_json"], 
prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="geometric_shapes", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7713,7 +7283,6 @@ generation_size=1, metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_cola_lighteval = LightevalTaskConfig( @@ -7729,7 +7298,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.mcc_single_token], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_mnli_lighteval = LightevalTaskConfig( @@ -7745,7 +7313,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_mnli_mismatched_lighteval = LightevalTaskConfig( @@ -7761,7 +7328,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_mrpc_lighteval = LightevalTaskConfig( @@ -7777,7 +7343,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_qnli_lighteval = LightevalTaskConfig( @@ -7793,7 +7358,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_qqp_lighteval = LightevalTaskConfig( @@ -7809,7 +7373,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_rte_lighteval = LightevalTaskConfig( @@ -7825,7 +7388,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_sst2_lighteval = LightevalTaskConfig( @@ -7841,7 +7403,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_stsb_lighteval = LightevalTaskConfig( @@ -7857,7 +7418,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) glue_wnli_lighteval = LightevalTaskConfig( @@ -7873,14 +7433,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) goal_step_wikihow_bigbench = LightevalTaskConfig( name="goal_step_wikihow", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="goal_step_wikihow", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7889,7 +7448,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) gpqa_lighteval = LightevalTaskConfig( @@ -7905,7 +7463,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) gpqa_diamond_instruct_lighteval = LightevalTaskConfig( @@ -7925,7 +7482,6 @@ # Metrics.gpqa_instruct_pass_at_1_8n, ], stop_sequence=[], # no stop sequence, will use eos token - trust_dataset=True, version=1, ) gpqa_extended_instruct_lighteval = LightevalTaskConfig( @@ -7941,7 +7497,6 @@ generation_size=32768, # needed for reasoning models like R1 metrics=[Metrics.gpqa_instruct_metric], stop_sequence=[], # no stop sequence, will use eos token - trust_dataset=True, version=0, ) gpqa_main_instruct_lighteval = LightevalTaskConfig( @@ -7957,14 +7512,13 @@ generation_size=32768, # needed for reasoning models like R1 metrics=[Metrics.gpqa_instruct_metric], 
stop_sequence=[], # no stop sequence, will use eos token - trust_dataset=True, version=0, ) gre_reading_comprehension_bigbench = LightevalTaskConfig( name="gre_reading_comprehension", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="gre_reading_comprehension", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -7973,7 +7527,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) gsm_plus = LightevalTaskConfig( @@ -7989,7 +7542,6 @@ generation_size=None, metrics=[Metrics.expr_gold_metric], stop_sequence=None, - trust_dataset=True, version=0, ) gsm8k_leaderboard = LightevalTaskConfig( @@ -8005,14 +7557,13 @@ generation_size=256, metrics=[Metrics.quasi_exact_match_gsm8k], stop_sequence=[], - trust_dataset=True, version=0, ) gsm8k_lighteval = LightevalTaskConfig( name="gsm8k", suite=["lighteval"], prompt_function=prompt.gsm8k, - hf_repo="gsm8k", + hf_repo="openai/gsm8k", hf_subset="main", hf_avail_splits=["train", "test"], evaluation_splits=["test"], @@ -8023,39 +7574,6 @@ Metrics.expr_gold_metric, ], stop_sequence=["Question:"], - trust_dataset=True, - version=0, -) -headqa_en_lighteval = LightevalTaskConfig( - name="headqa:en", - suite=["lighteval", "headqa"], - prompt_function=prompt.headqa, - hf_repo="lighteval/headqa_harness", - hf_subset="en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -headqa_es_lighteval = LightevalTaskConfig( - name="headqa:es", - suite=["lighteval", "headqa"], - prompt_function=prompt.headqa, - hf_repo="lighteval/headqa_harness", - hf_subset="es", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) hellaswag_leaderboard = LightevalTaskConfig( @@ -8071,7 +7589,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) hellaswag_generative = LightevalTaskConfig( @@ -8092,14 +7609,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) hhh_alignment_bigbench = LightevalTaskConfig( name="hhh_alignment", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="hhh_alignment", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8108,14 +7624,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) hindi_question_answering_bigbench = LightevalTaskConfig( name="hindi_question_answering", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="hindi_question_answering", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8124,14 +7639,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
hindu_knowledge_bigbench_lite = LightevalTaskConfig( name="hindu_knowledge", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="hindu_knowledge", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8140,14 +7654,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) hinglish_toxicity_bigbench = LightevalTaskConfig( name="hinglish_toxicity", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="hinglish_toxicity", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8156,14 +7669,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) human_organs_senses_bigbench = LightevalTaskConfig( name="human_organs_senses", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="human_organs_senses", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8172,14 +7684,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) hyperbaton_bigbench = LightevalTaskConfig( name="hyperbaton", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="hyperbaton", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8188,14 +7699,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) identify_math_theorems_bigbench = LightevalTaskConfig( name="identify_math_theorems", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="identify_math_theorems", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8204,14 +7714,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) identify_odd_metaphor_bigbench = LightevalTaskConfig( name="identify_odd_metaphor", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="identify_odd_metaphor", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8220,7 +7729,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) imdb_helm = LightevalTaskConfig( @@ -8243,7 +7751,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) imdb_contrastset_helm = LightevalTaskConfig( @@ -8266,14 +7773,13 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) implicatures_bigbench = LightevalTaskConfig( name="implicatures", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="implicatures", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8282,14 +7788,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) implicit_relations_bigbench = 
LightevalTaskConfig( name="implicit_relations", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="implicit_relations", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8298,14 +7803,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) intent_recognition_bigbench = LightevalTaskConfig( name="intent_recognition", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="intent_recognition", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8314,7 +7818,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) interactive_qa_mmlu_abstract_algebra_helm = LightevalTaskConfig( @@ -8335,7 +7838,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) interactive_qa_mmlu_college_chemistry_helm = LightevalTaskConfig( @@ -8356,7 +7858,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) interactive_qa_mmlu_global_facts_helm = LightevalTaskConfig( @@ -8377,7 +7878,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) interactive_qa_mmlu_miscellaneous_helm = LightevalTaskConfig( @@ -8398,7 +7898,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) interactive_qa_mmlu_nutrition_helm = LightevalTaskConfig( @@ -8419,7 +7918,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) interactive_qa_mmlu_us_foreign_policy_helm = LightevalTaskConfig( @@ -8440,14 +7938,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) international_phonetic_alphabet_nli_bigbench = LightevalTaskConfig( name="international_phonetic_alphabet_nli", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="international_phonetic_alphabet_nli", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8456,14 +7953,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) international_phonetic_alphabet_transliterate_bigbench = LightevalTaskConfig( name="international_phonetic_alphabet_transliterate", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="international_phonetic_alphabet_transliterate", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8472,14 +7968,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) intersect_geometry_bigbench = LightevalTaskConfig( name="intersect_geometry", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="intersect_geometry", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8488,14 +7983,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) irony_identification_bigbench = LightevalTaskConfig( 
name="irony_identification", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="irony_identification", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8504,7 +7998,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_ar_en_lighteval = LightevalTaskConfig( @@ -8520,7 +8013,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_de_en_lighteval = LightevalTaskConfig( @@ -8536,7 +8028,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_en_ar_lighteval = LightevalTaskConfig( @@ -8552,7 +8043,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_en_de_lighteval = LightevalTaskConfig( @@ -8568,7 +8058,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_en_fr_lighteval = LightevalTaskConfig( @@ -8584,7 +8073,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_en_ja_lighteval = LightevalTaskConfig( @@ -8600,7 +8088,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_en_ko_lighteval = LightevalTaskConfig( @@ -8616,7 +8103,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_en_zh_lighteval = LightevalTaskConfig( @@ -8632,7 +8118,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_fr_en_lighteval = LightevalTaskConfig( @@ -8648,7 +8133,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_ja_en_lighteval = LightevalTaskConfig( @@ -8664,7 +8148,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_ko_en_lighteval = LightevalTaskConfig( @@ -8680,7 +8163,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) iwslt17_zh_en_lighteval = LightevalTaskConfig( @@ -8696,7 +8178,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) jeopardy = LightevalTaskConfig( @@ -8724,7 +8205,7 @@ name="kanji_ascii", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="kanji_ascii", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8733,14 +8214,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) kannada_bigbench = LightevalTaskConfig( name="kannada", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="kannada", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8749,14 +8229,13 @@ 
generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) key_value_maps_bigbench = LightevalTaskConfig( name="key_value_maps", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="key_value_maps", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8765,14 +8244,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) known_unknowns_bigbench_lite = LightevalTaskConfig( name="known_unknowns", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="known_unknowns", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8781,7 +8259,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_standard_lighteval = LightevalTaskConfig( @@ -8797,7 +8274,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_standard_cloze_lighteval = LightevalTaskConfig( @@ -8813,7 +8289,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_lighteval = LightevalTaskConfig( @@ -8829,7 +8304,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_de_lighteval = LightevalTaskConfig( @@ -8845,7 +8319,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_en_lighteval = LightevalTaskConfig( @@ -8861,7 +8334,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_es_lighteval = LightevalTaskConfig( @@ -8877,7 +8349,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_fr_lighteval = LightevalTaskConfig( @@ -8893,7 +8364,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_it_lighteval = LightevalTaskConfig( @@ -8909,7 +8379,6 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lambada_openai_cloze_lighteval = LightevalTaskConfig( @@ -8925,14 +8394,13 @@ generation_size=10, metrics=[Metrics.target_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) language_games_bigbench = LightevalTaskConfig( name="language_games", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="language_games", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -8941,14 +8409,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) language_identification_bigbench_lite = LightevalTaskConfig( name="language_identification", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="language_identification", hf_avail_splits=["default", "train", "validation"], 
evaluation_splits=["default"], @@ -8957,7 +8424,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) legal_summarization_billsum_helm = LightevalTaskConfig( @@ -8980,7 +8446,6 @@ Metrics.bert_score, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) legal_summarization_eurlexsum_helm = LightevalTaskConfig( @@ -9003,7 +8468,6 @@ Metrics.bert_score, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) legal_summarization_multilexsum_helm = LightevalTaskConfig( @@ -9026,7 +8490,6 @@ Metrics.bert_score, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) legalsupport_helm = LightevalTaskConfig( @@ -9048,7 +8511,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_case_hold_helm = LightevalTaskConfig( @@ -9070,7 +8532,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_ecthr_a_helm = LightevalTaskConfig( @@ -9092,7 +8553,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_ecthr_b_helm = LightevalTaskConfig( @@ -9114,7 +8574,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_eurlex_helm = LightevalTaskConfig( @@ -9136,7 +8595,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_ledgar_helm = LightevalTaskConfig( @@ -9158,7 +8616,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_scotus_helm = LightevalTaskConfig( @@ -9180,7 +8637,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lexglue_unfair_tos_helm = LightevalTaskConfig( @@ -9202,7 +8658,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_brazilian_court_decisions_judgment_helm = LightevalTaskConfig( @@ -9224,7 +8679,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_brazilian_court_decisions_unanimity_helm = LightevalTaskConfig( @@ -9246,7 +8700,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_covid19_emergency_event_helm = LightevalTaskConfig( @@ -9268,7 +8721,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_german_argument_mining_helm = LightevalTaskConfig( @@ -9290,7 +8742,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_greek_legal_code_chapter_helm = LightevalTaskConfig( @@ -9312,7 +8763,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_greek_legal_code_subject_helm = LightevalTaskConfig( @@ -9334,7 +8784,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_greek_legal_code_volume_helm = LightevalTaskConfig( @@ -9350,7 +8799,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match, Metrics.f1_score], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_greek_legal_ner_helm = LightevalTaskConfig( @@ -9372,7 +8820,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_legalnero_helm = LightevalTaskConfig( @@ -9394,7 +8841,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_lener_br_helm = LightevalTaskConfig( @@ -9416,7 +8862,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - 
trust_dataset=True, version=0, ) lextreme_mapa_coarse_helm = LightevalTaskConfig( @@ -9438,7 +8883,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_mapa_fine_helm = LightevalTaskConfig( @@ -9460,7 +8904,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_multi_eurlex_level_1_helm = LightevalTaskConfig( @@ -9482,7 +8925,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_multi_eurlex_level_2_helm = LightevalTaskConfig( @@ -9504,7 +8946,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_multi_eurlex_level_3_helm = LightevalTaskConfig( @@ -9526,7 +8967,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_online_terms_of_service_clause_topics_helm = LightevalTaskConfig( @@ -9548,7 +8988,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_online_terms_of_service_unfairness_levels_helm = LightevalTaskConfig( @@ -9570,7 +9009,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lextreme_swiss_judgment_prediction_helm = LightevalTaskConfig( @@ -9592,14 +9030,13 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) linguistic_mappings_bigbench = LightevalTaskConfig( name="linguistic_mappings", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="linguistic_mappings", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9608,14 +9045,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) linguistics_puzzles_bigbench_lite = LightevalTaskConfig( name="linguistics_puzzles", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="linguistics_puzzles", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9624,14 +9060,13 @@ generation_size=100, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=None, - trust_dataset=True, version=0, ) logic_grid_puzzle_bigbench_lite = LightevalTaskConfig( name="logic_grid_puzzle", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="logic_grid_puzzle", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9640,14 +9075,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) logical_args_bigbench = LightevalTaskConfig( name="logical_args", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="logical_args", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9656,14 +9090,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) logical_deduction_bigbench_lite = LightevalTaskConfig( name="logical_deduction", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="logical_deduction", 
hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9672,14 +9105,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) logical_fallacy_detection_bigbench = LightevalTaskConfig( name="logical_fallacy_detection", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="logical_fallacy_detection", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9688,14 +9120,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) logical_sequence_bigbench = LightevalTaskConfig( name="logical_sequence", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="logical_sequence", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -9704,7 +9135,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) logiqa_lighteval = LightevalTaskConfig( @@ -9720,7 +9150,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lsat_qa_helm = LightevalTaskConfig( @@ -9741,7 +9170,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lsat_qa_assignment_helm = LightevalTaskConfig( @@ -9762,7 +9190,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lsat_qa_grouping_helm = LightevalTaskConfig( @@ -9783,7 +9210,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lsat_qa_miscellaneous_helm = LightevalTaskConfig( @@ -9804,7 +9230,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) lsat_qa_ordering_helm = LightevalTaskConfig( @@ -9825,7 +9250,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_500 = LightevalTaskConfig( @@ -9872,7 +9296,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_counting_and_probability_lighteval = LightevalTaskConfig( @@ -9888,7 +9311,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_geometry_lighteval = LightevalTaskConfig( @@ -9904,7 +9326,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_intermediate_algebra_lighteval = LightevalTaskConfig( @@ -9920,7 +9341,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_number_theory_lighteval = LightevalTaskConfig( @@ -9936,7 +9356,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_prealgebra_lighteval = LightevalTaskConfig( @@ -9952,7 +9371,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_precalculus_lighteval = LightevalTaskConfig( @@ -9968,7 +9386,6 @@ generation_size=2048, 
metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=1, ) math_cot_algebra_lighteval = LightevalTaskConfig( @@ -9984,7 +9401,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_cot_counting_and_probability_lighteval = LightevalTaskConfig( @@ -10000,7 +9416,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_cot_geometry_lighteval = LightevalTaskConfig( @@ -10016,7 +9431,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_cot_intermediate_algebra_lighteval = LightevalTaskConfig( @@ -10032,7 +9446,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_cot_number_theory_lighteval = LightevalTaskConfig( @@ -10048,7 +9461,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_cot_prealgebra_lighteval = LightevalTaskConfig( @@ -10064,7 +9476,6 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) math_cot_precalculus_lighteval = LightevalTaskConfig( @@ -10080,14 +9491,13 @@ generation_size=2048, metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mathematical_induction_bigbench = LightevalTaskConfig( name="mathematical_induction", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="mathematical_induction", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10096,14 +9506,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mathqa_lighteval = LightevalTaskConfig( name="mathqa", suite=["lighteval"], prompt_function=prompt.mathqa, - hf_repo="math_qa", + hf_repo="allenai/math_qa", hf_subset="default", hf_avail_splits=["train", "validation", "test"], evaluation_splits=["test"], @@ -10112,14 +9521,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) matrixshapes_bigbench = LightevalTaskConfig( name="matrixshapes", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="matrixshapes", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10128,7 +9536,6 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) me_q_sum_helm = LightevalTaskConfig( @@ -10151,7 +9558,6 @@ Metrics.bleu_4, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) med_dialog_healthcaremagic_helm = LightevalTaskConfig( @@ -10174,7 +9580,6 @@ Metrics.bleu_4, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) med_dialog_icliniq_helm = LightevalTaskConfig( @@ -10197,7 +9602,6 @@ Metrics.bleu_4, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) med_mcqa_helm = LightevalTaskConfig( @@ -10219,7 +9623,6 @@ Metrics.prefix_quasi_exact_match, ], 
stop_sequence=["\n"], - trust_dataset=True, version=0, ) med_paragraph_simplification_helm = LightevalTaskConfig( @@ -10242,7 +9645,6 @@ Metrics.bleu_4, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) med_qa_helm = LightevalTaskConfig( @@ -10264,14 +9666,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) metaphor_boolean_bigbench = LightevalTaskConfig( name="metaphor_boolean", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="metaphor_boolean", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10280,14 +9681,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) metaphor_understanding_bigbench = LightevalTaskConfig( name="metaphor_understanding", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="metaphor_understanding", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10296,7 +9696,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mgsm_en_lighteval = LightevalTaskConfig( @@ -10312,7 +9711,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "Question="], - trust_dataset=True, version=0, ) mgsm_es_lighteval = LightevalTaskConfig( @@ -10328,7 +9726,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "Pregunta="], - trust_dataset=True, version=0, ) mgsm_fr_lighteval = LightevalTaskConfig( @@ -10344,7 +9741,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "Question="], - trust_dataset=True, version=0, ) mgsm_de_lighteval = LightevalTaskConfig( @@ -10360,7 +9756,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "Frage="], - trust_dataset=True, version=0, ) mgsm_ru_lighteval = LightevalTaskConfig( @@ -10376,7 +9771,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="], - trust_dataset=True, version=0, ) mgsm_zh_lighteval = LightevalTaskConfig( @@ -10392,7 +9786,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "\u95ee\u9898="], - trust_dataset=True, version=0, ) mgsm_ja_lighteval = LightevalTaskConfig( @@ -10408,7 +9801,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "\u554f\u984c="], - trust_dataset=True, version=0, ) mgsm_th_lighteval = LightevalTaskConfig( @@ -10424,7 +9816,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="], - trust_dataset=True, version=0, ) mgsm_sw_lighteval = LightevalTaskConfig( @@ -10440,7 +9831,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "Swali="], - trust_dataset=True, version=0, ) mgsm_bn_lighteval = LightevalTaskConfig( @@ -10456,7 +9846,6 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="], - trust_dataset=True, version=0, ) 
mgsm_te_lighteval = LightevalTaskConfig( @@ -10472,14 +9861,13 @@ generation_size=None, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="], - trust_dataset=True, version=0, ) minute_mysteries_qa_bigbench = LightevalTaskConfig( name="minute_mysteries_qa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="minute_mysteries_qa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10488,14 +9876,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.rouge_t5], stop_sequence=["\n"], - trust_dataset=True, version=0, ) misconceptions_bigbench = LightevalTaskConfig( name="misconceptions", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="misconceptions", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10504,14 +9891,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) misconceptions_russian_bigbench_lite = LightevalTaskConfig( name="misconceptions_russian", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="misconceptions_russian", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -10520,7 +9906,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_abstract_algebra_original = LightevalTaskConfig( @@ -10536,7 +9921,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_abstract_algebra_leaderboard = LightevalTaskConfig( @@ -10552,7 +9936,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_abstract_algebra_helm = LightevalTaskConfig( @@ -10573,7 +9956,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_anatomy_original = LightevalTaskConfig( @@ -10589,7 +9971,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_anatomy_leaderboard = LightevalTaskConfig( @@ -10605,7 +9986,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_anatomy_helm = LightevalTaskConfig( @@ -10626,7 +10006,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_astronomy_original = LightevalTaskConfig( @@ -10642,7 +10021,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_astronomy_leaderboard = LightevalTaskConfig( @@ -10658,7 +10036,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_astronomy_helm = LightevalTaskConfig( @@ -10679,7 +10056,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_business_ethics_original = LightevalTaskConfig( @@ -10695,7 +10071,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_business_ethics_leaderboard = LightevalTaskConfig( @@ -10711,7 +10086,6 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_business_ethics_helm = LightevalTaskConfig( @@ -10732,7 +10106,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_clinical_knowledge_original = LightevalTaskConfig( @@ -10748,7 +10121,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_clinical_knowledge_leaderboard = LightevalTaskConfig( @@ -10764,7 +10136,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_clinical_knowledge_helm = LightevalTaskConfig( @@ -10785,7 +10156,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_biology_original = LightevalTaskConfig( @@ -10801,7 +10171,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_biology_leaderboard = LightevalTaskConfig( @@ -10817,7 +10186,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_biology_helm = LightevalTaskConfig( @@ -10838,7 +10206,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_chemistry_original = LightevalTaskConfig( @@ -10854,7 +10221,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_chemistry_leaderboard = LightevalTaskConfig( @@ -10870,7 +10236,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_chemistry_helm = LightevalTaskConfig( @@ -10891,7 +10256,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_computer_science_original = LightevalTaskConfig( @@ -10907,7 +10271,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_computer_science_leaderboard = LightevalTaskConfig( @@ -10923,7 +10286,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_computer_science_helm = LightevalTaskConfig( @@ -10944,7 +10306,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_mathematics_original = LightevalTaskConfig( @@ -10960,7 +10321,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_mathematics_leaderboard = LightevalTaskConfig( @@ -10976,7 +10336,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_mathematics_helm = LightevalTaskConfig( @@ -10997,7 +10356,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_medicine_original = LightevalTaskConfig( @@ -11013,7 +10371,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_medicine_leaderboard = LightevalTaskConfig( @@ -11029,7 +10386,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_medicine_helm = LightevalTaskConfig( @@ -11050,7 +10406,6 @@ 
Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_physics_original = LightevalTaskConfig( @@ -11066,7 +10421,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_physics_leaderboard = LightevalTaskConfig( @@ -11082,7 +10436,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_college_physics_helm = LightevalTaskConfig( @@ -11103,7 +10456,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_computer_security_original = LightevalTaskConfig( @@ -11119,7 +10471,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_computer_security_leaderboard = LightevalTaskConfig( @@ -11135,7 +10486,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_computer_security_helm = LightevalTaskConfig( @@ -11156,7 +10506,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_conceptual_physics_original = LightevalTaskConfig( @@ -11172,7 +10521,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_conceptual_physics_leaderboard = LightevalTaskConfig( @@ -11188,7 +10536,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_conceptual_physics_helm = LightevalTaskConfig( @@ -11209,7 +10556,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_econometrics_original = LightevalTaskConfig( @@ -11225,7 +10571,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_econometrics_leaderboard = LightevalTaskConfig( @@ -11241,7 +10586,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_econometrics_helm = LightevalTaskConfig( @@ -11262,7 +10606,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_electrical_engineering_original = LightevalTaskConfig( @@ -11278,7 +10621,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_electrical_engineering_leaderboard = LightevalTaskConfig( @@ -11294,7 +10636,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_electrical_engineering_helm = LightevalTaskConfig( @@ -11315,7 +10656,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_elementary_mathematics_original = LightevalTaskConfig( @@ -11331,7 +10671,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_elementary_mathematics_leaderboard = LightevalTaskConfig( @@ -11347,7 +10686,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_elementary_mathematics_helm = LightevalTaskConfig( @@ -11368,7 +10706,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_formal_logic_original = LightevalTaskConfig( @@ -11384,7 +10721,6 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_formal_logic_leaderboard = LightevalTaskConfig( @@ -11400,7 +10736,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_formal_logic_helm = LightevalTaskConfig( @@ -11421,7 +10756,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_global_facts_original = LightevalTaskConfig( @@ -11437,7 +10771,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_global_facts_leaderboard = LightevalTaskConfig( @@ -11453,7 +10786,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_global_facts_helm = LightevalTaskConfig( @@ -11474,7 +10806,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_biology_original = LightevalTaskConfig( @@ -11490,7 +10821,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_biology_leaderboard = LightevalTaskConfig( @@ -11506,7 +10836,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_biology_helm = LightevalTaskConfig( @@ -11527,7 +10856,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_chemistry_original = LightevalTaskConfig( @@ -11543,7 +10871,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_chemistry_leaderboard = LightevalTaskConfig( @@ -11559,7 +10886,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_chemistry_helm = LightevalTaskConfig( @@ -11580,7 +10906,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_computer_science_original = LightevalTaskConfig( @@ -11596,7 +10921,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_computer_science_leaderboard = LightevalTaskConfig( @@ -11612,7 +10936,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_computer_science_helm = LightevalTaskConfig( @@ -11633,7 +10956,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_european_history_original = LightevalTaskConfig( @@ -11649,7 +10971,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_european_history_leaderboard = LightevalTaskConfig( @@ -11665,7 +10986,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_european_history_helm = LightevalTaskConfig( @@ -11686,7 +11006,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_geography_original = LightevalTaskConfig( @@ -11702,7 +11021,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_geography_leaderboard = LightevalTaskConfig( 
@@ -11718,7 +11036,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_geography_helm = LightevalTaskConfig( @@ -11739,7 +11056,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_government_and_politics_original = LightevalTaskConfig( @@ -11755,7 +11071,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_government_and_politics_leaderboard = LightevalTaskConfig( @@ -11771,7 +11086,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_government_and_politics_helm = LightevalTaskConfig( @@ -11792,7 +11106,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_macroeconomics_original = LightevalTaskConfig( @@ -11808,7 +11121,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_macroeconomics_leaderboard = LightevalTaskConfig( @@ -11824,7 +11136,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_macroeconomics_helm = LightevalTaskConfig( @@ -11845,7 +11156,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_mathematics_original = LightevalTaskConfig( @@ -11861,7 +11171,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_mathematics_leaderboard = LightevalTaskConfig( @@ -11877,7 +11186,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_mathematics_helm = LightevalTaskConfig( @@ -11898,7 +11206,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_microeconomics_original = LightevalTaskConfig( @@ -11914,7 +11221,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_microeconomics_leaderboard = LightevalTaskConfig( @@ -11930,7 +11236,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_microeconomics_helm = LightevalTaskConfig( @@ -11951,7 +11256,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_physics_original = LightevalTaskConfig( @@ -11967,7 +11271,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_physics_leaderboard = LightevalTaskConfig( @@ -11983,7 +11286,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_physics_helm = LightevalTaskConfig( @@ -12004,7 +11306,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_psychology_original = LightevalTaskConfig( @@ -12020,7 +11321,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_psychology_leaderboard = LightevalTaskConfig( @@ -12036,7 +11336,6 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_psychology_helm = LightevalTaskConfig( @@ -12057,7 +11356,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_statistics_original = LightevalTaskConfig( @@ -12073,7 +11371,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_statistics_leaderboard = LightevalTaskConfig( @@ -12089,7 +11386,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_statistics_helm = LightevalTaskConfig( @@ -12110,7 +11406,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_us_history_original = LightevalTaskConfig( @@ -12126,7 +11421,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_us_history_leaderboard = LightevalTaskConfig( @@ -12142,7 +11436,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_us_history_helm = LightevalTaskConfig( @@ -12163,7 +11456,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_world_history_original = LightevalTaskConfig( @@ -12179,7 +11471,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_world_history_leaderboard = LightevalTaskConfig( @@ -12195,7 +11486,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_high_school_world_history_helm = LightevalTaskConfig( @@ -12216,7 +11506,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_human_aging_original = LightevalTaskConfig( @@ -12232,7 +11521,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_human_aging_leaderboard = LightevalTaskConfig( @@ -12248,7 +11536,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_human_aging_helm = LightevalTaskConfig( @@ -12269,7 +11556,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_human_sexuality_original = LightevalTaskConfig( @@ -12285,7 +11571,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_human_sexuality_leaderboard = LightevalTaskConfig( @@ -12301,7 +11586,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_human_sexuality_helm = LightevalTaskConfig( @@ -12322,7 +11606,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_international_law_original = LightevalTaskConfig( @@ -12338,7 +11621,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_international_law_leaderboard = LightevalTaskConfig( @@ -12354,7 +11636,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_international_law_helm = LightevalTaskConfig( @@ -12375,7 +11656,6 @@ 
Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_jurisprudence_original = LightevalTaskConfig( @@ -12391,7 +11671,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_jurisprudence_leaderboard = LightevalTaskConfig( @@ -12407,7 +11686,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_jurisprudence_helm = LightevalTaskConfig( @@ -12428,7 +11706,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_logical_fallacies_original = LightevalTaskConfig( @@ -12444,7 +11721,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_logical_fallacies_leaderboard = LightevalTaskConfig( @@ -12460,7 +11736,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_logical_fallacies_helm = LightevalTaskConfig( @@ -12481,7 +11756,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_machine_learning_original = LightevalTaskConfig( @@ -12497,7 +11771,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_machine_learning_leaderboard = LightevalTaskConfig( @@ -12513,7 +11786,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_machine_learning_helm = LightevalTaskConfig( @@ -12534,7 +11806,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_management_original = LightevalTaskConfig( @@ -12550,7 +11821,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_management_leaderboard = LightevalTaskConfig( @@ -12566,7 +11836,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_management_helm = LightevalTaskConfig( @@ -12587,7 +11856,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_marketing_original = LightevalTaskConfig( @@ -12603,7 +11871,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_marketing_leaderboard = LightevalTaskConfig( @@ -12619,7 +11886,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_marketing_helm = LightevalTaskConfig( @@ -12640,7 +11906,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_medical_genetics_original = LightevalTaskConfig( @@ -12656,7 +11921,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_medical_genetics_leaderboard = LightevalTaskConfig( @@ -12672,7 +11936,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_medical_genetics_helm = LightevalTaskConfig( @@ -12693,7 +11956,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_miscellaneous_original = LightevalTaskConfig( @@ -12709,7 +11971,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
mmlu_miscellaneous_leaderboard = LightevalTaskConfig( @@ -12725,7 +11986,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_miscellaneous_helm = LightevalTaskConfig( @@ -12746,7 +12006,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_moral_disputes_original = LightevalTaskConfig( @@ -12762,7 +12021,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_moral_disputes_leaderboard = LightevalTaskConfig( @@ -12778,7 +12036,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_moral_disputes_helm = LightevalTaskConfig( @@ -12799,7 +12056,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_moral_scenarios_original = LightevalTaskConfig( @@ -12815,7 +12071,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_moral_scenarios_leaderboard = LightevalTaskConfig( @@ -12831,7 +12086,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_moral_scenarios_helm = LightevalTaskConfig( @@ -12852,7 +12106,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_nutrition_original = LightevalTaskConfig( @@ -12868,7 +12121,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_nutrition_leaderboard = LightevalTaskConfig( @@ -12884,7 +12136,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_nutrition_helm = LightevalTaskConfig( @@ -12905,7 +12156,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_philosophy_original = LightevalTaskConfig( @@ -12921,7 +12171,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_philosophy_leaderboard = LightevalTaskConfig( @@ -12937,7 +12186,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_philosophy_helm = LightevalTaskConfig( @@ -12958,7 +12206,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_prehistory_original = LightevalTaskConfig( @@ -12974,7 +12221,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_prehistory_leaderboard = LightevalTaskConfig( @@ -12990,7 +12236,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_prehistory_helm = LightevalTaskConfig( @@ -13011,7 +12256,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_accounting_original = LightevalTaskConfig( @@ -13027,7 +12271,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_accounting_leaderboard = LightevalTaskConfig( @@ -13043,7 +12286,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_accounting_helm = LightevalTaskConfig( @@ -13064,7 +12306,6 @@ 
Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_law_original = LightevalTaskConfig( @@ -13080,7 +12321,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_law_leaderboard = LightevalTaskConfig( @@ -13096,7 +12336,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_law_helm = LightevalTaskConfig( @@ -13117,7 +12356,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_medicine_original = LightevalTaskConfig( @@ -13133,7 +12371,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_medicine_leaderboard = LightevalTaskConfig( @@ -13149,7 +12386,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_medicine_helm = LightevalTaskConfig( @@ -13170,7 +12406,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_psychology_original = LightevalTaskConfig( @@ -13186,7 +12421,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_psychology_leaderboard = LightevalTaskConfig( @@ -13202,7 +12436,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_professional_psychology_helm = LightevalTaskConfig( @@ -13223,7 +12456,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_public_relations_original = LightevalTaskConfig( @@ -13239,7 +12471,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_public_relations_leaderboard = LightevalTaskConfig( @@ -13255,7 +12486,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_public_relations_helm = LightevalTaskConfig( @@ -13276,7 +12506,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_security_studies_original = LightevalTaskConfig( @@ -13292,7 +12521,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_security_studies_leaderboard = LightevalTaskConfig( @@ -13308,7 +12536,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_security_studies_helm = LightevalTaskConfig( @@ -13329,7 +12556,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_sociology_original = LightevalTaskConfig( @@ -13345,7 +12571,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_sociology_leaderboard = LightevalTaskConfig( @@ -13361,7 +12586,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_sociology_helm = LightevalTaskConfig( @@ -13382,7 +12606,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_us_foreign_policy_original = LightevalTaskConfig( @@ -13398,7 +12621,6 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_us_foreign_policy_leaderboard = LightevalTaskConfig( @@ -13414,7 +12636,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_us_foreign_policy_helm = LightevalTaskConfig( @@ -13435,7 +12656,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_virology_original = LightevalTaskConfig( @@ -13451,7 +12671,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_virology_leaderboard = LightevalTaskConfig( @@ -13467,7 +12686,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_virology_helm = LightevalTaskConfig( @@ -13488,7 +12706,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_world_religions_original = LightevalTaskConfig( @@ -13504,7 +12721,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_world_religions_leaderboard = LightevalTaskConfig( @@ -13520,7 +12736,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mmlu_world_religions_helm = LightevalTaskConfig( @@ -13541,14 +12756,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mnist_ascii_bigbench = LightevalTaskConfig( name="mnist_ascii", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="mnist_ascii", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13557,14 +12771,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) modified_arithmetic_bigbench = LightevalTaskConfig( name="modified_arithmetic", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="modified_arithmetic", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13573,14 +12786,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) moral_permissibility_bigbench = LightevalTaskConfig( name="moral_permissibility", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="moral_permissibility", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13589,14 +12801,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) movie_dialog_same_or_different_bigbench = LightevalTaskConfig( name="movie_dialog_same_or_different", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="movie_dialog_same_or_different", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13605,14 +12816,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) movie_recommendation_bigbench = LightevalTaskConfig( name="movie_recommendation", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - 
hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="movie_recommendation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13621,7 +12831,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mtnt2019_en_fr_lighteval = LightevalTaskConfig( @@ -13637,7 +12846,6 @@ generation_size=200, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mtnt2019_en_ja_lighteval = LightevalTaskConfig( @@ -13653,7 +12861,6 @@ generation_size=200, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mtnt2019_fr_en_lighteval = LightevalTaskConfig( @@ -13669,7 +12876,6 @@ generation_size=200, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mtnt2019_ja_en_lighteval = LightevalTaskConfig( @@ -13685,14 +12891,13 @@ generation_size=200, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mult_data_wrangling_bigbench = LightevalTaskConfig( name="mult_data_wrangling", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="mult_data_wrangling", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13701,14 +12906,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) multiemo_bigbench = LightevalTaskConfig( name="multiemo", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="multiemo", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13717,7 +12921,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) musr_murder_mysteries = LightevalTaskConfig( @@ -13733,7 +12936,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) musr_object_placements = LightevalTaskConfig( @@ -13749,7 +12951,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) musr_team_allocation = LightevalTaskConfig( @@ -13765,7 +12966,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mutual_lighteval = LightevalTaskConfig( @@ -13781,7 +12981,6 @@ generation_size=-1, metrics=[Metrics.recall_at_1, Metrics.recall_at_2, Metrics.mrr], stop_sequence=["\n"], - trust_dataset=True, version=0, ) mutual_plus_lighteval = LightevalTaskConfig( @@ -13797,7 +12996,6 @@ generation_size=-1, metrics=[Metrics.recall_at_1, Metrics.recall_at_2, Metrics.mrr], stop_sequence=["\n"], - trust_dataset=True, version=0, ) narrativeqa_helm = LightevalTaskConfig( @@ -13820,14 +13018,13 @@ Metrics.bleu_4, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) natural_instructions_bigbench = LightevalTaskConfig( name="natural_instructions", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="natural_instructions", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13836,7 +13033,6 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5], stop_sequence=["\n"], - 
trust_dataset=True, version=0, ) natural_questions = LightevalTaskConfig( @@ -13861,7 +13057,7 @@ name="navigate", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="navigate", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13870,14 +13066,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) nonsense_words_grammar_bigbench = LightevalTaskConfig( name="nonsense_words_grammar", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="nonsense_words_grammar", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13886,14 +13081,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) novel_concepts_bigbench_lite = LightevalTaskConfig( name="novel_concepts", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="novel_concepts", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -13902,7 +13096,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_linear_example_helm = LightevalTaskConfig( @@ -13918,7 +13111,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_linear_standard_helm = LightevalTaskConfig( @@ -13934,7 +13126,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_parabola_example_helm = LightevalTaskConfig( @@ -13950,7 +13141,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_parabola_standard_helm = LightevalTaskConfig( @@ -13966,7 +13156,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_paraboloid_example_helm = LightevalTaskConfig( @@ -13982,7 +13171,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_paraboloid_standard_helm = LightevalTaskConfig( @@ -13998,7 +13186,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_plane_example_helm = LightevalTaskConfig( @@ -14014,7 +13201,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) numeracy_plane_standard_helm = LightevalTaskConfig( @@ -14030,14 +13216,13 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.quasi_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) object_counting_bigbench = LightevalTaskConfig( name="object_counting", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="object_counting", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14046,14 +13231,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - 
trust_dataset=True, version=0, ) odd_one_out_bigbench = LightevalTaskConfig( name="odd_one_out", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="odd_one_out", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14062,7 +13246,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) openbookqa_helm = LightevalTaskConfig( @@ -14083,7 +13266,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) openbookqa_lighteval = LightevalTaskConfig( @@ -14099,14 +13281,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) operators_bigbench_lite = LightevalTaskConfig( name="operators", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="operators", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14115,14 +13296,13 @@ generation_size=20, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) paragraph_segmentation_bigbench = LightevalTaskConfig( name="paragraph_segmentation", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="paragraph_segmentation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14131,14 +13311,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) parsinlu_qa_bigbench = LightevalTaskConfig( name="parsinlu_qa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="parsinlu_qa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14147,14 +13326,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) parsinlu_reading_comprehension_bigbench_lite = LightevalTaskConfig( name="parsinlu_reading_comprehension", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="parsinlu_reading_comprehension", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14163,14 +13341,13 @@ generation_size=100, metrics=[Metrics.perfect_exact_match], stop_sequence=None, - trust_dataset=True, version=0, ) penguins_in_a_table_bigbench = LightevalTaskConfig( name="penguins_in_a_table", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="penguins_in_a_table", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14179,14 +13356,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) periodic_elements_bigbench = LightevalTaskConfig( name="periodic_elements", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="periodic_elements", hf_avail_splits=["default", "train", 
"validation"], evaluation_splits=["default"], @@ -14195,14 +13371,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) persian_idioms_bigbench = LightevalTaskConfig( name="persian_idioms", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="persian_idioms", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14211,14 +13386,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) phrase_relatedness_bigbench = LightevalTaskConfig( name="phrase_relatedness", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="phrase_relatedness", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14227,14 +13401,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) physical_intuition_bigbench = LightevalTaskConfig( name="physical_intuition", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="physical_intuition", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14243,14 +13416,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) physics_bigbench = LightevalTaskConfig( name="physics", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="physics", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14259,14 +13431,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) physics_questions_bigbench = LightevalTaskConfig( name="physics_questions", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="physics_questions", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14275,7 +13446,6 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) piqa_lighteval = LightevalTaskConfig( @@ -14291,7 +13461,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) piqa_helm = LightevalTaskConfig( @@ -14312,14 +13481,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) play_dialog_same_or_different_bigbench_lite = LightevalTaskConfig( name="play_dialog_same_or_different", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="play_dialog_same_or_different", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14328,14 +13496,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) polish_sequence_labeling_bigbench = LightevalTaskConfig( name="polish_sequence_labeling", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + 
hf_repo="tasksource/bigbench", hf_subset="polish_sequence_labeling", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14344,14 +13511,13 @@ generation_size=1, metrics=[Metrics.f1_score], stop_sequence=["\n"], - trust_dataset=True, version=0, ) presuppositions_as_nli_bigbench = LightevalTaskConfig( name="presuppositions_as_nli", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="presuppositions_as_nli", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14360,14 +13526,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) prost_lighteval = LightevalTaskConfig( name="prost", suite=["lighteval"], prompt_function=prompt.prost, - hf_repo="corypaik/prost", + hf_repo="lighteval/prost", hf_subset="default", hf_avail_splits=["test"], evaluation_splits=["test"], @@ -14376,7 +13541,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) pubmedqa_lighteval = LightevalTaskConfig( @@ -14392,7 +13556,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) pubmedqa_helm = LightevalTaskConfig( @@ -14413,7 +13576,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) qa4mre_2011_lighteval = LightevalTaskConfig( @@ -14429,7 +13591,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) qa4mre_2012_lighteval = LightevalTaskConfig( @@ -14445,7 +13606,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) qa4mre_2013_lighteval = LightevalTaskConfig( @@ -14461,14 +13621,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) qa_wikidata_bigbench = LightevalTaskConfig( name="qa_wikidata", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="qa_wikidata", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14477,14 +13636,13 @@ generation_size=1, metrics=[Metrics.bleurt, Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) qasper_lighteval = LightevalTaskConfig( name="qasper", suite=["lighteval"], prompt_function=prompt.qasper, - hf_repo="qasper", + hf_repo="allenai/qasper", hf_subset="qasper", hf_avail_splits=["train", "validation"], evaluation_splits=["validation"], @@ -14493,14 +13651,13 @@ generation_size=20, metrics=[Metrics.f1_score_quasi], stop_sequence=["\n"], - trust_dataset=True, version=0, ) qasper_ll_lighteval = LightevalTaskConfig( name="qasper_ll", suite=["lighteval"], prompt_function=prompt.qasper_ll, - hf_repo="qasper", + hf_repo="allenai/qasper", hf_subset="qasper", hf_avail_splits=["train", "validation"], evaluation_splits=["validation"], @@ -14509,7 +13666,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) quac_helm = LightevalTaskConfig( @@ -14525,14 +13681,13 @@ generation_size=100, metrics=[Metrics.exact_match, 
Metrics.quasi_exact_match, Metrics.f1_score], stop_sequence=["\n"], - trust_dataset=True, version=0, ) question_selection_bigbench = LightevalTaskConfig( name="question_selection", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="question_selection", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14541,7 +13696,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) race_high_lighteval = LightevalTaskConfig( @@ -14557,7 +13711,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_ade_corpus_v2_helm = LightevalTaskConfig( @@ -14580,7 +13733,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_banking_77_helm = LightevalTaskConfig( @@ -14603,7 +13755,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_neurips_impact_statement_risks_helm = LightevalTaskConfig( @@ -14626,7 +13777,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_one_stop_english_helm = LightevalTaskConfig( @@ -14649,7 +13799,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_overruling_helm = LightevalTaskConfig( @@ -14672,7 +13821,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_semiconductor_org_types_helm = LightevalTaskConfig( @@ -14695,7 +13843,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_systematic_review_inclusion_helm = LightevalTaskConfig( @@ -14718,7 +13865,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_tai_safety_research_helm = LightevalTaskConfig( @@ -14741,7 +13887,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_terms_of_service_helm = LightevalTaskConfig( @@ -14764,7 +13909,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_tweet_eval_hate_helm = LightevalTaskConfig( @@ -14787,7 +13931,6 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) raft_twitter_complaints_helm = LightevalTaskConfig( @@ -14810,14 +13953,13 @@ Metrics.f1_score_micro, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) real_or_fake_text_bigbench = LightevalTaskConfig( name="real_or_fake_text", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="real_or_fake_text", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14826,7 +13968,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) real_toxicity_prompts_helm = LightevalTaskConfig( @@ -14842,14 +13983,13 @@ generation_size=20, metrics=[Metrics.prediction_perplexity], stop_sequence=["\n"], - trust_dataset=True, version=0, ) reasoning_about_colored_objects_bigbench = LightevalTaskConfig( name="reasoning_about_colored_objects", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="reasoning_about_colored_objects", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14858,14 +13998,13 @@ generation_size=1, 
metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) repeat_copy_logic_bigbench_lite = LightevalTaskConfig( name="repeat_copy_logic", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="repeat_copy_logic", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14874,14 +14013,13 @@ generation_size=100, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) rephrase_bigbench = LightevalTaskConfig( name="rephrase", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="rephrase", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14890,14 +14028,13 @@ generation_size=1, metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) rhyming_bigbench = LightevalTaskConfig( name="rhyming", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="rhyming", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14906,14 +14043,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) riddle_sense_bigbench = LightevalTaskConfig( name="riddle_sense", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="riddle_sense", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14922,14 +14058,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) ruin_names_bigbench = LightevalTaskConfig( name="ruin_names", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="ruin_names", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14938,14 +14073,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) salient_translation_error_detection_bigbench = LightevalTaskConfig( name="salient_translation_error_detection", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="salient_translation_error_detection", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14954,14 +14088,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) scientific_press_release_bigbench = LightevalTaskConfig( name="scientific_press_release", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="scientific_press_release", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -14970,7 +14103,6 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) sciq_lighteval = LightevalTaskConfig( @@ -14986,14 +14118,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], 
stop_sequence=["\n"], - trust_dataset=True, version=0, ) semantic_parsing_in_context_sparc_bigbench = LightevalTaskConfig( name="semantic_parsing_in_context_sparc", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="semantic_parsing_in_context_sparc", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15002,14 +14133,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) semantic_parsing_spider_bigbench = LightevalTaskConfig( name="semantic_parsing_spider", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="semantic_parsing_spider", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15018,14 +14148,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) sentence_ambiguity_bigbench = LightevalTaskConfig( name="sentence_ambiguity", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="sentence_ambiguity", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15034,14 +14163,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) similarities_abstraction_bigbench = LightevalTaskConfig( name="similarities_abstraction", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="similarities_abstraction", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15050,14 +14178,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simp_turing_concept_bigbench = LightevalTaskConfig( name="simp_turing_concept", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simp_turing_concept", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15066,7 +14193,6 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simpleqa = LightevalTaskConfig( @@ -15082,14 +14208,13 @@ generation_size=2048, metrics=[Metrics.simpleqa_judge], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simple_arithmetic_json_bigbench = LightevalTaskConfig( name="simple_arithmetic_json", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simple_arithmetic_json", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15098,14 +14223,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simple_arithmetic_json_multiple_choice_bigbench = LightevalTaskConfig( name="simple_arithmetic_json_multiple_choice", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simple_arithmetic_json_multiple_choice", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ 
-15114,14 +14238,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simple_arithmetic_json_subtasks_bigbench = LightevalTaskConfig( name="simple_arithmetic_json_subtasks", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simple_arithmetic_json_subtasks", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15130,14 +14253,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simple_arithmetic_multiple_targets_json_bigbench = LightevalTaskConfig( name="simple_arithmetic_multiple_targets_json", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simple_arithmetic_multiple_targets_json", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15146,14 +14268,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simple_ethical_questions_bigbench = LightevalTaskConfig( name="simple_ethical_questions", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simple_ethical_questions", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15162,14 +14283,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) simple_text_editing_bigbench = LightevalTaskConfig( name="simple_text_editing", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="simple_text_editing", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15178,14 +14298,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) siqa_helm = LightevalTaskConfig( name="siqa", suite=["helm", "commonsense_scenario"], prompt_function=prompt.siqa, - hf_repo="social_i_qa", + hf_repo="allenai/social_i_qa", hf_subset="default", hf_avail_splits=["train", "validation"], evaluation_splits=["validation"], @@ -15199,14 +14318,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) snarks_bigbench = LightevalTaskConfig( name="snarks", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="snarks", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15215,14 +14333,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) social_iqa_bigbench = LightevalTaskConfig( name="social_iqa", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="social_iqa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15231,14 +14348,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) social_support_bigbench = LightevalTaskConfig( name="social_support", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", 
hf_subset="social_support", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15247,14 +14363,13 @@ generation_size=1, metrics=[Metrics.f1_score_macro], stop_sequence=["\n"], - trust_dataset=True, version=0, ) sports_understanding_bigbench = LightevalTaskConfig( name="sports_understanding", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="sports_understanding", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15263,7 +14378,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) squad_v2 = LightevalTaskConfig( @@ -15293,7 +14407,7 @@ name="storycloze:2016", suite=["lighteval", "storycloze"], prompt_function=prompt.storycloze, - hf_repo="story_cloze", + hf_repo="MoE-UNC/story_cloze", hf_subset="2016", hf_avail_splits=["validation"], evaluation_splits=["validation"], @@ -15302,14 +14416,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) storycloze_2018_lighteval = LightevalTaskConfig( name="storycloze:2018", suite=["lighteval", "storycloze"], prompt_function=prompt.storycloze, - hf_repo="story_cloze", + hf_repo="MoE-UNC/story_cloze", hf_subset="2018", hf_avail_splits=["validation"], evaluation_splits=["validation"], @@ -15318,14 +14431,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) strange_stories_bigbench_lite = LightevalTaskConfig( name="strange_stories", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="strange_stories", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15334,14 +14446,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) strategyqa_bigbench_lite = LightevalTaskConfig( name="strategyqa", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="strategyqa", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15350,14 +14461,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) sufficient_information_bigbench = LightevalTaskConfig( name="sufficient_information", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="sufficient_information", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15366,14 +14476,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) suicide_risk_bigbench = LightevalTaskConfig( name="suicide_risk", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="suicide_risk", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15382,7 +14491,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) summarization_cnn_dm_helm = LightevalTaskConfig( @@ -15405,7 
+14513,6 @@ Metrics.bert_score, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) summarization_xsum_helm = LightevalTaskConfig( @@ -15428,7 +14535,6 @@ Metrics.bert_score, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) summarization_xsum_sampled_helm = LightevalTaskConfig( @@ -15451,7 +14557,6 @@ Metrics.bert_score, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_boolq_lighteval = LightevalTaskConfig( @@ -15467,7 +14572,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_cb_lighteval = LightevalTaskConfig( @@ -15483,7 +14587,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc, Metrics.multi_f1_numeric], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_copa_lighteval = LightevalTaskConfig( @@ -15499,7 +14602,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_rte_lighteval = LightevalTaskConfig( @@ -15515,7 +14617,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_multirc_lighteval = LightevalTaskConfig( @@ -15531,7 +14632,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_wic_lighteval = LightevalTaskConfig( @@ -15547,7 +14647,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) super_glue_wsc_lighteval = LightevalTaskConfig( @@ -15563,14 +14662,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) swahili_english_proverbs_bigbench = LightevalTaskConfig( name="swahili_english_proverbs", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="swahili_english_proverbs", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15579,7 +14677,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) swag_lighteval = LightevalTaskConfig( @@ -15595,14 +14692,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], stop_sequence=["\n"], - trust_dataset=True, version=0, ) swedish_to_german_proverbs_bigbench = LightevalTaskConfig( name="swedish_to_german_proverbs", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="swedish_to_german_proverbs", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15611,14 +14707,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) symbol_interpretation_bigbench_lite = LightevalTaskConfig( name="symbol_interpretation", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="symbol_interpretation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15627,7 +14722,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) synthetic_reasoning_induction_helm = LightevalTaskConfig( @@ -15648,7 +14742,6 @@ Metrics.prefix_quasi_exact_match, ], 
stop_sequence=["\n"], - trust_dataset=True, version=0, ) synthetic_reasoning_natural_easy_helm = LightevalTaskConfig( @@ -15664,7 +14757,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.f1_score], stop_sequence=["\n"], - trust_dataset=True, version=0, ) synthetic_reasoning_natural_hard_helm = LightevalTaskConfig( @@ -15680,7 +14772,6 @@ generation_size=20, metrics=[Metrics.exact_match, Metrics.f1_score], stop_sequence=["\n"], - trust_dataset=True, version=0, ) synthetic_reasoning_pattern_match_helm = LightevalTaskConfig( @@ -15701,7 +14792,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) synthetic_reasoning_variable_substitution_helm = LightevalTaskConfig( @@ -15722,14 +14812,13 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) tellmewhy_bigbench = LightevalTaskConfig( name="tellmewhy", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="tellmewhy", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15738,14 +14827,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5], stop_sequence=["\n"], - trust_dataset=True, version=0, ) temporal_sequences_bigbench = LightevalTaskConfig( name="temporal_sequences", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="temporal_sequences", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15754,14 +14842,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) tense_bigbench = LightevalTaskConfig( name="tense", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="tense", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -15770,23 +14857,6 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_arxiv_lighteval = LightevalTaskConfig( - name="the_pile:arxiv", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_arxiv", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_arxiv_helm = LightevalTaskConfig( @@ -15802,7 +14872,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_bibliotik_helm = LightevalTaskConfig( @@ -15818,39 +14887,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_bookcorpus2_lighteval = LightevalTaskConfig( - name="the_pile:bookcorpus2", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_bookcorpus2", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - 
stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_books3_lighteval = LightevalTaskConfig( - name="the_pile:books3", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_books3", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_commoncrawl_helm = LightevalTaskConfig( @@ -15866,23 +14902,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_dm_mathematics_lighteval = LightevalTaskConfig( - name="the_pile:dm-mathematics", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_dm-mathematics", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_dm_mathematics_helm = LightevalTaskConfig( @@ -15898,23 +14917,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_enron_lighteval = LightevalTaskConfig( - name="the_pile:enron", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_enron", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_enron_helm = LightevalTaskConfig( @@ -15930,23 +14932,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_europarl_lighteval = LightevalTaskConfig( - name="the_pile:europarl", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_europarl", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_europarl_helm = LightevalTaskConfig( @@ -15962,23 +14947,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_freelaw_lighteval = LightevalTaskConfig( - name="the_pile:freelaw", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_freelaw", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_freelaw_helm = LightevalTaskConfig( @@ -15994,23 +14962,6 @@ generation_size=-1, 
metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_github_lighteval = LightevalTaskConfig( - name="the_pile:github", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_github", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_github_helm = LightevalTaskConfig( @@ -16026,23 +14977,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_gutenberg_lighteval = LightevalTaskConfig( - name="the_pile:gutenberg", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_gutenberg", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_gutenberg_helm = LightevalTaskConfig( @@ -16058,23 +14992,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_hackernews_lighteval = LightevalTaskConfig( - name="the_pile:hackernews", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_hackernews", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_hackernews_helm = LightevalTaskConfig( @@ -16090,23 +15007,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_nih_exporter_lighteval = LightevalTaskConfig( - name="the_pile:nih-exporter", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_nih-exporter", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_nih_exporter_helm = LightevalTaskConfig( @@ -16122,23 +15022,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_opensubtitles_lighteval = LightevalTaskConfig( - name="the_pile:opensubtitles", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_opensubtitles", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, 
version=0, ) the_pile_opensubtitles_helm = LightevalTaskConfig( @@ -16154,23 +15037,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_openwebtext2_lighteval = LightevalTaskConfig( - name="the_pile:openwebtext2", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_openwebtext2", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_openwebtext2_helm = LightevalTaskConfig( @@ -16186,55 +15052,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_philpapers_lighteval = LightevalTaskConfig( - name="the_pile:philpapers", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_philpapers", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_pile_cc_lighteval = LightevalTaskConfig( - name="the_pile:pile-cc", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_pile-cc", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_pubmed_abstracts_lighteval = LightevalTaskConfig( - name="the_pile:pubmed-abstracts", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_pubmed-abstracts", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_pubmed_abstracts_helm = LightevalTaskConfig( @@ -16250,23 +15067,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_pubmed_central_lighteval = LightevalTaskConfig( - name="the_pile:pubmed-central", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_pubmed-central", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_pubmed_central_helm = LightevalTaskConfig( @@ -16282,23 +15082,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_stackexchange_lighteval = LightevalTaskConfig( - 
name="the_pile:stackexchange", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_stackexchange", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_stackexchange_helm = LightevalTaskConfig( @@ -16314,39 +15097,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_ubuntu_irc_lighteval = LightevalTaskConfig( - name="the_pile:ubuntu-irc", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_ubuntu-irc", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_uspto_lighteval = LightevalTaskConfig( - name="the_pile:uspto", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_upsto", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_upsto_helm = LightevalTaskConfig( @@ -16362,23 +15112,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_wikipedia_lighteval = LightevalTaskConfig( - name="the_pile:wikipedia", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_wikipedia", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_wikipedia_helm = LightevalTaskConfig( @@ -16394,23 +15127,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, - version=0, -) -the_pile_youtubesubtitles_lighteval = LightevalTaskConfig( - name="the_pile:youtubesubtitles", - suite=["lighteval", "pile"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile", - hf_subset="pile_youtubesubtitles", - hf_avail_splits=["validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - trust_dataset=True, version=0, ) the_pile_youtubesubtitles_helm = LightevalTaskConfig( @@ -16426,14 +15142,13 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) timedial_bigbench = LightevalTaskConfig( name="timedial", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + 
hf_repo="tasksource/bigbench", hf_subset="timedial", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16442,7 +15157,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) toxigen_lighteval = LightevalTaskConfig( @@ -16458,14 +15172,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], stop_sequence=["\n"], - trust_dataset=True, version=0, ) topical_chat_bigbench = LightevalTaskConfig( name="topical_chat", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="topical_chat", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16474,14 +15187,13 @@ generation_size=1, metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.bleurt], stop_sequence=["\n"], - trust_dataset=True, version=0, ) tracking_shuffled_objects_bigbench = LightevalTaskConfig( name="tracking_shuffled_objects", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="tracking_shuffled_objects", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16490,7 +15202,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) triviaqa_lighteval = LightevalTaskConfig( @@ -16506,7 +15217,6 @@ generation_size=20, metrics=[Metrics.quasi_exact_match_triviaqa], stop_sequence=["\n", ".", ","], - trust_dataset=True, version=0, ) truthfulqa_gen_lighteval = LightevalTaskConfig( @@ -16522,7 +15232,6 @@ generation_size=200, metrics=[Metrics.bleu, Metrics.rouge_t5], stop_sequence=["\n"], - trust_dataset=True, version=0, ) truthfulqa_mc_leaderboard = LightevalTaskConfig( @@ -16538,7 +15247,6 @@ generation_size=-1, metrics=[Metrics.truthfulqa_mc_metrics], stop_sequence=["\n"], - trust_dataset=True, version=0, ) truthfulqa_helm = LightevalTaskConfig( @@ -16560,7 +15268,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) twitterAAE_aa_helm = LightevalTaskConfig( @@ -16576,7 +15283,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) twitterAAE_white_helm = LightevalTaskConfig( @@ -16592,14 +15298,13 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) understanding_fables_bigbench = LightevalTaskConfig( name="understanding_fables", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="understanding_fables", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16608,14 +15313,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) undo_permutation_bigbench = LightevalTaskConfig( name="undo_permutation", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="undo_permutation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16624,14 +15328,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, 
version=0, ) unit_conversion_bigbench = LightevalTaskConfig( name="unit_conversion", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="unit_conversion", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16640,14 +15343,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unit_interpretation_bigbench = LightevalTaskConfig( name="unit_interpretation", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="unit_interpretation", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16656,14 +15358,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unnatural_in_context_learning_bigbench = LightevalTaskConfig( name="unnatural_in_context_learning", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="unnatural_in_context_learning", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16672,7 +15373,6 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unscramble_anagrams1_lighteval = LightevalTaskConfig( @@ -16688,7 +15388,6 @@ generation_size=5, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unscramble_anagrams2_lighteval = LightevalTaskConfig( @@ -16704,7 +15403,6 @@ generation_size=5, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unscramble_cycle_letters_lighteval = LightevalTaskConfig( @@ -16720,7 +15418,6 @@ generation_size=5, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unscramble_random_insertion_lighteval = LightevalTaskConfig( @@ -16736,7 +15433,6 @@ generation_size=5, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) unscramble_reversed_words_lighteval = LightevalTaskConfig( @@ -16752,14 +15448,13 @@ generation_size=5, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) vitaminc_fact_verification_bigbench_lite = LightevalTaskConfig( name="vitaminc_fact_verification", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="vitaminc_fact_verification", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16768,7 +15463,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) webqs_lighteval = LightevalTaskConfig( @@ -16784,14 +15478,13 @@ generation_size=-1, metrics=[Metrics.acc_golds_likelihood], stop_sequence=["\n"], - trust_dataset=True, version=0, ) what_is_the_tao_bigbench = LightevalTaskConfig( name="what_is_the_tao", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="what_is_the_tao", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16800,14 +15493,13 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
which_wiki_edit_bigbench = LightevalTaskConfig( name="which_wiki_edit", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="which_wiki_edit", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -16816,7 +15508,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_applies_to_jurisdiction_helm = LightevalTaskConfig( @@ -16837,7 +15528,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_atomic_number_helm = LightevalTaskConfig( @@ -16858,7 +15548,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_author_helm = LightevalTaskConfig( @@ -16879,7 +15568,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_award_received_helm = LightevalTaskConfig( @@ -16900,7 +15588,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_basic_form_of_government_helm = LightevalTaskConfig( @@ -16921,7 +15608,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_capital_helm = LightevalTaskConfig( @@ -16942,7 +15628,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_capital_of_helm = LightevalTaskConfig( @@ -16963,7 +15648,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_central_bank_helm = LightevalTaskConfig( @@ -16984,7 +15668,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_composer_helm = LightevalTaskConfig( @@ -17005,7 +15688,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_continent_helm = LightevalTaskConfig( @@ -17026,7 +15708,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_country_helm = LightevalTaskConfig( @@ -17047,7 +15728,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_country_of_citizenship_helm = LightevalTaskConfig( @@ -17068,7 +15748,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_country_of_origin_helm = LightevalTaskConfig( @@ -17089,7 +15768,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_creator_helm = LightevalTaskConfig( @@ -17110,7 +15788,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_currency_helm = LightevalTaskConfig( @@ -17131,7 +15808,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_defendant_helm = LightevalTaskConfig( @@ -17152,7 +15828,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_developer_helm = LightevalTaskConfig( @@ -17173,7 +15848,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_diplomatic_relation_helm = LightevalTaskConfig( @@ -17194,7 +15868,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_director_helm = LightevalTaskConfig( @@ -17215,7 +15888,6 @@ 
Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_discoverer_or_inventor_helm = LightevalTaskConfig( @@ -17236,7 +15908,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_drug_or_therapy_used_for_treatment_helm = LightevalTaskConfig( @@ -17257,7 +15928,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_educated_at_helm = LightevalTaskConfig( @@ -17278,7 +15948,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_electron_configuration_helm = LightevalTaskConfig( @@ -17299,7 +15968,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_employer_helm = LightevalTaskConfig( @@ -17320,7 +15988,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_field_of_work_helm = LightevalTaskConfig( @@ -17341,7 +16008,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_file_extension_helm = LightevalTaskConfig( @@ -17362,7 +16028,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_genetic_association_helm = LightevalTaskConfig( @@ -17383,7 +16048,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_genre_helm = LightevalTaskConfig( @@ -17404,7 +16068,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_has_part_helm = LightevalTaskConfig( @@ -17425,7 +16088,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_head_of_government_helm = LightevalTaskConfig( @@ -17446,7 +16108,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_head_of_state_helm = LightevalTaskConfig( @@ -17467,7 +16128,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_headquarters_location_helm = LightevalTaskConfig( @@ -17488,7 +16148,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_industry_helm = LightevalTaskConfig( @@ -17509,7 +16168,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_influenced_by_helm = LightevalTaskConfig( @@ -17530,7 +16188,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_instance_of_helm = LightevalTaskConfig( @@ -17551,7 +16208,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_instrument_helm = LightevalTaskConfig( @@ -17572,7 +16228,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_language_of_work_or_name_helm = LightevalTaskConfig( @@ -17593,7 +16248,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_languages_spoken_written_or_signed_helm = LightevalTaskConfig( @@ -17614,7 +16268,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_laws_applied_helm = LightevalTaskConfig( @@ -17635,7 +16288,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
wikifact_located_in_the_administrative_territorial_entity_helm = LightevalTaskConfig( @@ -17656,7 +16308,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_location_helm = LightevalTaskConfig( @@ -17677,7 +16328,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_location_of_discovery_helm = LightevalTaskConfig( @@ -17698,7 +16348,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_location_of_formation_helm = LightevalTaskConfig( @@ -17719,7 +16368,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_majority_opinion_by_helm = LightevalTaskConfig( @@ -17740,7 +16388,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_manufacturer_helm = LightevalTaskConfig( @@ -17761,7 +16408,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_measured_physical_quantity_helm = LightevalTaskConfig( @@ -17782,7 +16428,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_medical_condition_treated_helm = LightevalTaskConfig( @@ -17803,7 +16448,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_member_of_helm = LightevalTaskConfig( @@ -17824,7 +16468,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_member_of_political_party_helm = LightevalTaskConfig( @@ -17845,7 +16488,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_member_of_sports_team_helm = LightevalTaskConfig( @@ -17866,7 +16508,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_movement_helm = LightevalTaskConfig( @@ -17887,7 +16528,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_named_after_helm = LightevalTaskConfig( @@ -17908,7 +16548,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_native_language_helm = LightevalTaskConfig( @@ -17929,7 +16568,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_number_of_processor_cores_helm = LightevalTaskConfig( @@ -17950,7 +16588,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_occupation_helm = LightevalTaskConfig( @@ -17971,7 +16608,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_office_held_by_head_of_government_helm = LightevalTaskConfig( @@ -17992,7 +16628,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_office_held_by_head_of_state_helm = LightevalTaskConfig( @@ -18013,7 +16648,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_official_language_helm = LightevalTaskConfig( @@ -18034,7 +16668,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_operating_system_helm = LightevalTaskConfig( @@ -18055,7 +16688,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
wikifact_original_language_of_film_or_TV_show_helm = LightevalTaskConfig( @@ -18076,7 +16708,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_original_network_helm = LightevalTaskConfig( @@ -18097,7 +16728,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_overrules_helm = LightevalTaskConfig( @@ -18118,7 +16748,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_owned_by_helm = LightevalTaskConfig( @@ -18139,7 +16768,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_part_of_helm = LightevalTaskConfig( @@ -18160,7 +16788,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_participating_team_helm = LightevalTaskConfig( @@ -18181,7 +16808,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_place_of_birth_helm = LightevalTaskConfig( @@ -18202,7 +16828,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_place_of_death_helm = LightevalTaskConfig( @@ -18223,7 +16848,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_plaintiff_helm = LightevalTaskConfig( @@ -18244,7 +16868,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_position_held_helm = LightevalTaskConfig( @@ -18265,7 +16888,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_position_played_on_team_helm = LightevalTaskConfig( @@ -18286,7 +16908,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_programming_language_helm = LightevalTaskConfig( @@ -18307,7 +16928,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_recommended_unit_of_measurement_helm = LightevalTaskConfig( @@ -18328,7 +16948,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_record_label_helm = LightevalTaskConfig( @@ -18349,7 +16968,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_religion_helm = LightevalTaskConfig( @@ -18370,7 +16988,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_repealed_by_helm = LightevalTaskConfig( @@ -18391,7 +17008,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_shares_border_with_helm = LightevalTaskConfig( @@ -18412,7 +17028,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_solved_by_helm = LightevalTaskConfig( @@ -18433,7 +17048,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_statement_describes_helm = LightevalTaskConfig( @@ -18454,7 +17068,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_stock_exchange_helm = LightevalTaskConfig( @@ -18475,7 +17088,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_subclass_of_helm = LightevalTaskConfig( @@ -18496,7 +17108,6 @@ Metrics.prefix_quasi_exact_match, ], 
stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_subsidiary_helm = LightevalTaskConfig( @@ -18517,7 +17128,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_symptoms_and_signs_helm = LightevalTaskConfig( @@ -18538,7 +17148,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_therapeutic_area_helm = LightevalTaskConfig( @@ -18559,7 +17168,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_time_of_discovery_or_invention_helm = LightevalTaskConfig( @@ -18580,7 +17188,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_twinned_administrative_body_helm = LightevalTaskConfig( @@ -18601,7 +17208,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikifact_work_location_helm = LightevalTaskConfig( @@ -18622,7 +17228,6 @@ Metrics.prefix_quasi_exact_match, ], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikitext_2_lighteval = LightevalTaskConfig( @@ -18638,7 +17243,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikitext_103_document_level_harness = LightevalTaskConfig( @@ -18654,7 +17258,6 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wikitext_103_document_level_helm = LightevalTaskConfig( @@ -18670,14 +17273,13 @@ generation_size=-1, metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wino_x_german_bigbench = LightevalTaskConfig( name="wino_x_german", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="wino_x_german", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -18686,7 +17288,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) winogrande_leaderboard = LightevalTaskConfig( @@ -18702,14 +17303,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) winowhy_bigbench_lite = LightevalTaskConfig( name="winowhy", suite=["bigbench_lite", "bigbench", "bigbench_json"], prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="winowhy", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -18718,7 +17318,6 @@ generation_size=1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_cs_en_lighteval = LightevalTaskConfig( @@ -18734,7 +17333,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_de_en_lighteval = LightevalTaskConfig( @@ -18750,7 +17348,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_en_cs_lighteval = LightevalTaskConfig( @@ -18766,7 +17363,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_en_de_lighteval = LightevalTaskConfig( 
@@ -18782,7 +17378,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_en_es_lighteval = LightevalTaskConfig( @@ -18798,7 +17393,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_en_fr_lighteval = LightevalTaskConfig( @@ -18814,7 +17408,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_en_hu_lighteval = LightevalTaskConfig( @@ -18830,7 +17423,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_es_en_lighteval = LightevalTaskConfig( @@ -18846,7 +17438,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_fr_en_lighteval = LightevalTaskConfig( @@ -18862,7 +17453,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt08_hu_en_lighteval = LightevalTaskConfig( @@ -18878,7 +17468,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_cs_en_lighteval = LightevalTaskConfig( @@ -18894,7 +17483,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_de_en_lighteval = LightevalTaskConfig( @@ -18910,7 +17498,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_en_cs_lighteval = LightevalTaskConfig( @@ -18926,7 +17513,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_en_de_lighteval = LightevalTaskConfig( @@ -18942,7 +17528,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_en_es_lighteval = LightevalTaskConfig( @@ -18958,7 +17543,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_en_fr_lighteval = LightevalTaskConfig( @@ -18974,7 +17558,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_en_hu_lighteval = LightevalTaskConfig( @@ -18990,7 +17573,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_en_it_lighteval = LightevalTaskConfig( @@ -19006,7 +17588,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_es_en_lighteval = LightevalTaskConfig( @@ -19022,7 +17603,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_fr_en_lighteval = LightevalTaskConfig( @@ -19038,7 +17618,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt09_hu_en_lighteval = LightevalTaskConfig( @@ -19054,7 +17633,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
wmt09_it_en_lighteval = LightevalTaskConfig( @@ -19070,7 +17648,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_cs_en_lighteval = LightevalTaskConfig( @@ -19086,7 +17663,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_de_en_lighteval = LightevalTaskConfig( @@ -19102,7 +17678,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_en_cs_lighteval = LightevalTaskConfig( @@ -19118,7 +17693,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_en_de_lighteval = LightevalTaskConfig( @@ -19134,7 +17708,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_en_es_lighteval = LightevalTaskConfig( @@ -19150,7 +17723,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_en_fr_lighteval = LightevalTaskConfig( @@ -19166,7 +17738,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_es_en_lighteval = LightevalTaskConfig( @@ -19182,7 +17753,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt10_fr_en_lighteval = LightevalTaskConfig( @@ -19198,7 +17768,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_cs_en_lighteval = LightevalTaskConfig( @@ -19214,7 +17783,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_de_en_lighteval = LightevalTaskConfig( @@ -19230,7 +17798,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_en_cs_lighteval = LightevalTaskConfig( @@ -19246,7 +17813,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_en_de_lighteval = LightevalTaskConfig( @@ -19262,7 +17828,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_en_es_lighteval = LightevalTaskConfig( @@ -19278,7 +17843,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_en_fr_lighteval = LightevalTaskConfig( @@ -19294,7 +17858,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_es_en_lighteval = LightevalTaskConfig( @@ -19310,7 +17873,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt11_fr_en_lighteval = LightevalTaskConfig( @@ -19326,7 +17888,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_cs_en_lighteval = LightevalTaskConfig( @@ -19342,7 +17903,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - 
trust_dataset=True, version=0, ) wmt12_de_en_lighteval = LightevalTaskConfig( @@ -19358,7 +17918,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_en_cs_lighteval = LightevalTaskConfig( @@ -19374,7 +17933,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_en_de_lighteval = LightevalTaskConfig( @@ -19390,7 +17948,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_en_es_lighteval = LightevalTaskConfig( @@ -19406,7 +17963,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_en_fr_lighteval = LightevalTaskConfig( @@ -19422,7 +17978,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_es_en_lighteval = LightevalTaskConfig( @@ -19438,7 +17993,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt12_fr_en_lighteval = LightevalTaskConfig( @@ -19454,7 +18008,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_cs_en_lighteval = LightevalTaskConfig( @@ -19470,7 +18023,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_de_en_lighteval = LightevalTaskConfig( @@ -19486,7 +18038,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_en_cs_lighteval = LightevalTaskConfig( @@ -19502,7 +18053,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_en_de_lighteval = LightevalTaskConfig( @@ -19518,7 +18068,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_en_es_lighteval = LightevalTaskConfig( @@ -19534,7 +18083,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_en_fr_lighteval = LightevalTaskConfig( @@ -19550,7 +18098,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_en_ru_lighteval = LightevalTaskConfig( @@ -19566,7 +18113,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_es_en_lighteval = LightevalTaskConfig( @@ -19582,7 +18128,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_fr_en_lighteval = LightevalTaskConfig( @@ -19598,7 +18143,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt13_ru_en_lighteval = LightevalTaskConfig( @@ -19614,7 +18158,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_cs_en_lighteval = LightevalTaskConfig( @@ -19630,7 +18173,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, 
Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_de_en_lighteval = LightevalTaskConfig( @@ -19646,7 +18188,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_en_cs_lighteval = LightevalTaskConfig( @@ -19662,7 +18203,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_en_de_lighteval = LightevalTaskConfig( @@ -19678,7 +18218,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_en_fr_lighteval = LightevalTaskConfig( @@ -19694,7 +18233,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_en_fr_lighteval = LightevalTaskConfig( @@ -19710,7 +18248,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_en_hi_lighteval = LightevalTaskConfig( @@ -19726,7 +18263,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_en_ru_lighteval = LightevalTaskConfig( @@ -19742,7 +18278,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_fr_en_lighteval = LightevalTaskConfig( @@ -19758,7 +18293,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_fr_en_lighteval = LightevalTaskConfig( @@ -19774,7 +18308,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_hi_en_lighteval = LightevalTaskConfig( @@ -19790,7 +18323,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_ru_en_lighteval = LightevalTaskConfig( @@ -19806,14 +18338,13 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_cs_en_helm = LightevalTaskConfig( name="wmt14:cs-en", suite=["helm"], prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/wmt14", + hf_repo="lighteval/wmt_14", hf_subset="cs-en", hf_avail_splits=["train", "test", "validation"], evaluation_splits=["validation", "test"], @@ -19822,14 +18353,13 @@ generation_size=100, metrics=[Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_de_en_helm = LightevalTaskConfig( name="wmt14:de-en", suite=["helm"], prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/wmt14", + hf_repo="lighteval/wmt_14", hf_subset="de-en", hf_avail_splits=["train", "test", "validation"], evaluation_splits=["validation", "test"], @@ -19838,14 +18368,13 @@ generation_size=100, metrics=[Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_fr_en_helm = LightevalTaskConfig( name="wmt14:fr-en", suite=["helm"], prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/wmt14", + hf_repo="lighteval/wmt_14", hf_subset="fr-en", hf_avail_splits=["train", "test", "validation"], evaluation_splits=["validation", "test"], @@ -19854,14 +18383,13 @@ generation_size=100, metrics=[Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_hi_en_helm = LightevalTaskConfig( 
name="wmt14:hi-en", suite=["helm"], prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/wmt14", + hf_repo="lighteval/wmt_14", hf_subset="hi-en", hf_avail_splits=["train", "test", "validation"], evaluation_splits=["validation", "test"], @@ -19870,14 +18398,13 @@ generation_size=100, metrics=[Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt14_ru_en_helm = LightevalTaskConfig( name="wmt14:ru-en", suite=["helm"], prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/wmt14", + hf_repo="lighteval/wmt_14", hf_subset="ru-en", hf_avail_splits=["train", "test", "validation"], evaluation_splits=["validation", "test"], @@ -19886,7 +18413,6 @@ generation_size=100, metrics=[Metrics.bleu], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_cs_en_lighteval = LightevalTaskConfig( @@ -19902,7 +18428,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_de_en_lighteval = LightevalTaskConfig( @@ -19918,7 +18443,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_en_cs_lighteval = LightevalTaskConfig( @@ -19934,7 +18458,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_en_de_lighteval = LightevalTaskConfig( @@ -19950,7 +18473,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_en_fi_lighteval = LightevalTaskConfig( @@ -19966,7 +18488,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_en_fr_lighteval = LightevalTaskConfig( @@ -19982,7 +18503,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_en_ru_lighteval = LightevalTaskConfig( @@ -19998,7 +18518,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_fi_en_lighteval = LightevalTaskConfig( @@ -20014,7 +18533,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_fr_en_lighteval = LightevalTaskConfig( @@ -20030,7 +18548,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt15_ru_en_lighteval = LightevalTaskConfig( @@ -20046,7 +18563,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_cs_en_lighteval = LightevalTaskConfig( @@ -20062,7 +18578,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_de_en_lighteval = LightevalTaskConfig( @@ -20078,7 +18593,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_de_en_lighteval = LightevalTaskConfig( @@ -20094,7 +18608,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_cs_lighteval = LightevalTaskConfig( @@ -20110,7 +18623,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - 
trust_dataset=True, version=0, ) wmt16_en_de_lighteval = LightevalTaskConfig( @@ -20126,7 +18638,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_de_lighteval = LightevalTaskConfig( @@ -20142,7 +18653,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_fi_lighteval = LightevalTaskConfig( @@ -20158,7 +18668,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_ro_lighteval = LightevalTaskConfig( @@ -20174,7 +18683,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_ro_lighteval = LightevalTaskConfig( @@ -20190,7 +18698,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_ru_lighteval = LightevalTaskConfig( @@ -20206,7 +18713,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_en_tr_lighteval = LightevalTaskConfig( @@ -20222,7 +18728,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_fi_en_lighteval = LightevalTaskConfig( @@ -20238,7 +18743,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_ro_en_lighteval = LightevalTaskConfig( @@ -20254,7 +18758,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_ro_en_lighteval = LightevalTaskConfig( @@ -20270,7 +18773,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_ru_en_lighteval = LightevalTaskConfig( @@ -20286,7 +18788,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt16_tr_en_lighteval = LightevalTaskConfig( @@ -20302,7 +18803,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_cs_en_lighteval = LightevalTaskConfig( @@ -20318,7 +18818,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_de_en_lighteval = LightevalTaskConfig( @@ -20334,7 +18833,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_cs_lighteval = LightevalTaskConfig( @@ -20350,7 +18848,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_de_lighteval = LightevalTaskConfig( @@ -20366,7 +18863,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_fi_lighteval = LightevalTaskConfig( @@ -20382,7 +18878,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_lv_lighteval = LightevalTaskConfig( @@ -20398,7 +18893,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, 
Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_ru_lighteval = LightevalTaskConfig( @@ -20414,7 +18908,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_tr_lighteval = LightevalTaskConfig( @@ -20430,7 +18923,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_en_zh_lighteval = LightevalTaskConfig( @@ -20446,7 +18938,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_fi_en_lighteval = LightevalTaskConfig( @@ -20462,7 +18953,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_lv_en_lighteval = LightevalTaskConfig( @@ -20478,7 +18968,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_ru_en_lighteval = LightevalTaskConfig( @@ -20494,7 +18983,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_tr_en_lighteval = LightevalTaskConfig( @@ -20510,7 +18998,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt17_zh_en_lighteval = LightevalTaskConfig( @@ -20526,7 +19013,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_cs_en_lighteval = LightevalTaskConfig( @@ -20542,7 +19028,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_de_en_lighteval = LightevalTaskConfig( @@ -20558,7 +19043,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_cs_lighteval = LightevalTaskConfig( @@ -20574,7 +19058,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_de_lighteval = LightevalTaskConfig( @@ -20590,7 +19073,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_et_lighteval = LightevalTaskConfig( @@ -20606,7 +19088,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_fi_lighteval = LightevalTaskConfig( @@ -20622,7 +19103,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_ru_lighteval = LightevalTaskConfig( @@ -20638,7 +19118,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_tr_lighteval = LightevalTaskConfig( @@ -20654,7 +19133,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_en_zh_lighteval = LightevalTaskConfig( @@ -20670,7 +19148,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_et_en_lighteval = LightevalTaskConfig( @@ -20686,7 +19163,6 @@ generation_size=None, 
metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_fi_en_lighteval = LightevalTaskConfig( @@ -20702,7 +19178,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_ru_en_lighteval = LightevalTaskConfig( @@ -20718,7 +19193,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_tr_en_lighteval = LightevalTaskConfig( @@ -20734,7 +19208,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt18_zh_en_lighteval = LightevalTaskConfig( @@ -20750,7 +19223,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_cs_de_lighteval = LightevalTaskConfig( @@ -20766,7 +19238,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_de_cs_lighteval = LightevalTaskConfig( @@ -20782,7 +19253,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_de_en_lighteval = LightevalTaskConfig( @@ -20798,7 +19268,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_de_fr_lighteval = LightevalTaskConfig( @@ -20814,7 +19283,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_cs_lighteval = LightevalTaskConfig( @@ -20830,7 +19298,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_de_lighteval = LightevalTaskConfig( @@ -20846,7 +19313,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_fi_lighteval = LightevalTaskConfig( @@ -20862,7 +19328,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_gu_lighteval = LightevalTaskConfig( @@ -20878,7 +19343,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_kk_lighteval = LightevalTaskConfig( @@ -20894,7 +19358,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_lt_lighteval = LightevalTaskConfig( @@ -20910,7 +19373,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_ru_lighteval = LightevalTaskConfig( @@ -20926,7 +19388,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_en_zh_lighteval = LightevalTaskConfig( @@ -20942,7 +19403,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_fi_en_lighteval = LightevalTaskConfig( @@ -20958,7 +19418,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_fr_de_lighteval = LightevalTaskConfig( @@ -20974,7 
+19433,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_gu_en_lighteval = LightevalTaskConfig( @@ -20990,7 +19448,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_kk_en_lighteval = LightevalTaskConfig( @@ -21006,7 +19463,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_lt_en_lighteval = LightevalTaskConfig( @@ -21022,7 +19478,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_ru_en_lighteval = LightevalTaskConfig( @@ -21038,7 +19493,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt19_zh_en_lighteval = LightevalTaskConfig( @@ -21054,7 +19508,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_cs_en_lighteval = LightevalTaskConfig( @@ -21070,7 +19523,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_de_en_lighteval = LightevalTaskConfig( @@ -21086,7 +19538,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_de_fr_lighteval = LightevalTaskConfig( @@ -21102,7 +19553,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_cs_lighteval = LightevalTaskConfig( @@ -21118,7 +19568,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_de_lighteval = LightevalTaskConfig( @@ -21134,7 +19583,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_iu_lighteval = LightevalTaskConfig( @@ -21150,7 +19598,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_ja_lighteval = LightevalTaskConfig( @@ -21166,7 +19613,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_km_lighteval = LightevalTaskConfig( @@ -21182,7 +19628,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_pl_lighteval = LightevalTaskConfig( @@ -21198,7 +19643,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_ps_lighteval = LightevalTaskConfig( @@ -21214,7 +19658,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_ru_lighteval = LightevalTaskConfig( @@ -21230,7 +19673,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_ta_lighteval = LightevalTaskConfig( @@ -21246,7 +19688,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_en_zh_lighteval = 
LightevalTaskConfig( @@ -21262,7 +19703,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_fr_de_lighteval = LightevalTaskConfig( @@ -21278,7 +19718,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_iu_en_lighteval = LightevalTaskConfig( @@ -21294,7 +19733,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_ja_en_lighteval = LightevalTaskConfig( @@ -21310,7 +19748,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_km_en_lighteval = LightevalTaskConfig( @@ -21326,7 +19763,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_pl_en_lighteval = LightevalTaskConfig( @@ -21342,7 +19778,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_ps_en_lighteval = LightevalTaskConfig( @@ -21358,7 +19793,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_ru_en_lighteval = LightevalTaskConfig( @@ -21374,7 +19808,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_ta_en_lighteval = LightevalTaskConfig( @@ -21390,7 +19823,6 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wmt20_zh_en_lighteval = LightevalTaskConfig( @@ -21406,14 +19838,13 @@ generation_size=None, metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], stop_sequence=["\n"], - trust_dataset=True, version=0, ) word_sorting_bigbench = LightevalTaskConfig( name="word_sorting", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="word_sorting", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -21422,14 +19853,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) word_unscrambling_bigbench = LightevalTaskConfig( name="word_unscrambling", suite=["bigbench", "bigbench_json"], prompt_function=prompt.bigbench, - hf_repo="bigbench", + hf_repo="tasksource/bigbench", hf_subset="word_unscrambling", hf_avail_splits=["default", "train", "validation"], evaluation_splits=["default"], @@ -21438,14 +19868,13 @@ generation_size=1, metrics=[Metrics.perfect_exact_match], stop_sequence=["\n"], - trust_dataset=True, version=0, ) wsc273_lighteval = LightevalTaskConfig( name="wsc273", suite=["lighteval"], prompt_function=prompt.wsc273, - hf_repo="winograd_wsc", + hf_repo="lighteval/winograd_wsc", hf_subset="wsc273", hf_avail_splits=["test"], evaluation_splits=["test"], @@ -21454,14 +19883,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_en_lighteval = LightevalTaskConfig( name="xcopa:en", suite=["lighteval"], prompt_function=prompt.xcopa_en, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="default", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21470,14 +19898,13 @@ 
generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_et_lighteval = LightevalTaskConfig( name="xcopa:et", suite=["lighteval"], prompt_function=prompt.xcopa_et, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="et", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21486,14 +19913,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_ht_lighteval = LightevalTaskConfig( name="xcopa:ht", suite=["lighteval"], prompt_function=prompt.xcopa_ht, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="ht", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21502,14 +19928,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_it_lighteval = LightevalTaskConfig( name="xcopa:it", suite=["lighteval"], prompt_function=prompt.xcopa_it, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="it", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21518,14 +19943,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_id_lighteval = LightevalTaskConfig( name="xcopa:id", suite=["lighteval"], prompt_function=prompt.xcopa_id, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="id", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21534,14 +19958,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_qu_lighteval = LightevalTaskConfig( name="xcopa:qu", suite=["lighteval"], prompt_function=prompt.xcopa_qu, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="qu", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21550,14 +19973,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_sw_lighteval = LightevalTaskConfig( name="xcopa:sw", suite=["lighteval"], prompt_function=prompt.xcopa_sw, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="sw", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21566,14 +19988,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_zh_lighteval = LightevalTaskConfig( name="xcopa:zh", suite=["lighteval"], prompt_function=prompt.xcopa_zh, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="zh", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21582,14 +20003,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_ta_lighteval = LightevalTaskConfig( name="xcopa:ta", suite=["lighteval"], prompt_function=prompt.xcopa_ta, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="ta", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21598,14 +20018,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_th_lighteval = LightevalTaskConfig( name="xcopa:th", suite=["lighteval"], prompt_function=prompt.xcopa_th, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="th", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ 
-21614,14 +20033,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_tr_lighteval = LightevalTaskConfig( name="xcopa:tr", suite=["lighteval"], prompt_function=prompt.xcopa_tr, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="tr", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21630,14 +20048,13 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xcopa_vi_lighteval = LightevalTaskConfig( name="xcopa:vi", suite=["lighteval"], prompt_function=prompt.xcopa_vi, - hf_repo="xcopa", + hf_repo="cambridgeltl/xcopa", hf_subset="vi", hf_avail_splits=["test", "train", "validation"], evaluation_splits=["test"], @@ -21646,7 +20063,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_en_lighteval = LightevalTaskConfig( @@ -21662,7 +20078,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_ru_lighteval = LightevalTaskConfig( @@ -21678,7 +20093,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_zh_lighteval = LightevalTaskConfig( @@ -21694,7 +20108,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_es_lighteval = LightevalTaskConfig( @@ -21710,7 +20123,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_ar_lighteval = LightevalTaskConfig( @@ -21726,7 +20138,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_hi_lighteval = LightevalTaskConfig( @@ -21742,7 +20153,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_id_lighteval = LightevalTaskConfig( @@ -21758,7 +20168,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_te_lighteval = LightevalTaskConfig( @@ -21774,7 +20183,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_sw_lighteval = LightevalTaskConfig( @@ -21790,7 +20198,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_eu_lighteval = LightevalTaskConfig( @@ -21806,7 +20213,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xstory_cloze_my_lighteval = LightevalTaskConfig( @@ -21822,7 +20228,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xwinograd_en_lighteval = LightevalTaskConfig( @@ -21838,7 +20243,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xwinograd_fr_lighteval = LightevalTaskConfig( @@ -21854,7 +20258,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xwinograd_jp_lighteval = LightevalTaskConfig( @@ -21870,7 +20273,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) 
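Editor's note: the default_tasks.py hunks above and below are mechanical — every LightevalTaskConfig loses its trust_dataset=True line, and the few tasks that still pointed at script-based Hub datasets (bigbench, winograd_wsc, xcopa) are repointed at script-free mirrors (tasksource/bigbench, lighteval/winograd_wsc, cambridgeltl/xcopa). As a reading aid, here is a minimal sketch of what one entry looks like after the change, reassembled from the xcopa:en hunk; the few-shot fields are omitted for brevity, and the prompt-helper import path (lighteval.tasks.default_prompts) is an assumption, since only the `prompt` alias is visible in this diff.

import lighteval.tasks.default_prompts as prompt  # assumed import path behind the `prompt` alias
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig

xcopa_en_lighteval = LightevalTaskConfig(
    name="xcopa:en",
    suite=["lighteval"],
    prompt_function=prompt.xcopa_en,
    hf_repo="cambridgeltl/xcopa",  # mirror without a loading script, so nothing to "trust"
    hf_subset="default",
    hf_avail_splits=["test", "train", "validation"],
    evaluation_splits=["test"],
    generation_size=-1,
    metrics=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    version=0,  # no trust_dataset kwarg anymore; the field is removed from the config entirely
)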
xwinograd_pt_lighteval = LightevalTaskConfig( @@ -21886,7 +20288,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xwinograd_ru_lighteval = LightevalTaskConfig( @@ -21902,7 +20303,6 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) xwinograd_zh_lighteval = LightevalTaskConfig( @@ -21918,6 +20318,5 @@ generation_size=-1, metrics=[Metrics.loglikelihood_acc], stop_sequence=["\n"], - trust_dataset=True, version=0, ) diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py index 2c36607da..fcc47820f 100644 --- a/src/lighteval/tasks/extended/hle/main.py +++ b/src/lighteval/tasks/extended/hle/main.py @@ -224,7 +224,6 @@ def hle_text_only(line, task_name: str = None): generation_size=8192, metrics=[Metrics.exact_match, Metrics.hle_metrics], stop_sequence=[], - trust_dataset=True, version=0, ) diff --git a/src/lighteval/tasks/extended/lcb/main.py b/src/lighteval/tasks/extended/lcb/main.py index 66a6844c4..ad49235fb 100644 --- a/src/lighteval/tasks/extended/lcb/main.py +++ b/src/lighteval/tasks/extended/lcb/main.py @@ -36,6 +36,7 @@ from aenum import extend_enum from lighteval.metrics.metrics import Metrics, SampleLevelMetric +from lighteval.models.model_output import ModelResponse from lighteval.tasks.extended.lcb.codegen_metrics import ( codegen_metrics, extract_code, @@ -79,17 +80,20 @@ def lcb_codegeneration_prompt_fn(line, task_name: str = "lcb:codegeneration") -> ) -def codegen_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float: +def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float: """Estimates the Pass@1 metric for the code generation task. Extract the code from each prediction, Runs it for each sample and generations, and computes the Pass@1 over the outputs. 
""" + assert doc.specific is not None, "Doc specific field is required for codegen_metric" + + predictions = model_response.final_text # Extract generated code snippets generated_code_snippets = [[extract_code(pred) for pred in predictions]] # noqa: F841 evaluation_sample = { # noqa: F841 - "inputs": formatted_doc.specific["inputs"], - "outputs": formatted_doc.specific["outputs"], - "fn_name": formatted_doc.specific["fn_name"], + "inputs": doc.specific["inputs"], + "outputs": doc.specific["outputs"], + "fn_name": doc.specific["fn_name"], } # This is a list of lists because evaluation_sample = [{"input_output": json.dumps(evaluation_sample)}] @@ -149,14 +153,13 @@ def codegen_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> floa name=name, suite=["extended"], prompt_function=lcb_codegeneration_prompt_fn, - hf_repo="livecodebench/code_generation_lite", + hf_repo="lighteval/code_generation_lite", hf_subset=subset, # https://github.com/LiveCodeBench/LiveCodeBench/tree/main?tab=readme-ov-file#dataset-versions hf_avail_splits=["test"], evaluation_splits=["test"], generation_size=32768, metrics=[Metrics.lcb_codegen_metric], stop_sequence=[], # no stop sequence, will use EOS token - trust_dataset=True, version=0, ) tasks.append(task) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 42ba3408e..c146041a5 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -63,7 +63,6 @@ class LightevalTaskConfig: original_num_docs (int): Number of documents in the task effective_num_docs (int): Number of documents used in a specific evaluation truncated_num_docs (bool): Whether less than the total number of documents were used - trust_dataset (bool): Whether to trust the dataset at execution or not version (int): The version of the task. Defaults to 0. Can be increased if the underlying dataset or the prompt changes. """ @@ -80,9 +79,6 @@ class LightevalTaskConfig: hf_filter: Callable[[dict], bool] | None = None hf_avail_splits: ListLike[str] = field(default_factory=lambda: ["train", "validation", "test"]) - # We default to false, to reduce security issues - trust_dataset: bool = False - # Splits evaluation_splits: ListLike[str] = field(default_factory=lambda: ["validation"]) few_shots_split: str | None = None @@ -169,7 +165,6 @@ def __init__( self.dataset_config_name = config.hf_subset self.dataset_revision = config.hf_revision self.dataset_filter = config.hf_filter - self.trust_dataset = config.trust_dataset self.dataset: DatasetDict | None = None # Delayed download self.evaluation_split = as_list(config.evaluation_splits) self._docs = None @@ -374,10 +369,6 @@ def download_dataset_worker( dataset = load_dataset( path=task.dataset_path, name=task.dataset_config_name, - data_dir=None, - cache_dir=None, - download_mode=None, - trust_remote_code=task.trust_dataset, revision=task.dataset_revision, ) diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index d9d0fad0e..1c00060e3 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -528,8 +528,6 @@ ), hf_repo="ai4bharat/IndicCOPA", hf_subset=f"translation-{standardize_tag(language.value)}", - # Since we use trust_dataset, we have to be careful about what is inside the dataset - # script. 
We thus lock the revision to ensure that the script doesn't change hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188", evaluation_splits=["test"], hf_avail_splits=["test"], @@ -540,7 +538,6 @@ loglikelihood_acc_metric(normalization=LogProbCharNorm()), ], ), - trust_dataset=True, ) for language in [ Language.ASSAMESE, @@ -624,8 +621,6 @@ ), hf_repo="jon-tow/okapi_hellaswag", hf_subset=standardize_tag(lang.value), - # Since we use trust_dataset, we have to be careful about what is inside the dataset - # script. We thus lock the revision to ensure that the script doesn't change hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83", evaluation_splits=["validation"], hf_avail_splits=["validation"], @@ -636,7 +631,6 @@ loglikelihood_acc_metric(normalization=LogProbCharNorm()), ], ), - trust_dataset=True, ) for lang in [ Language.ARABIC, @@ -878,7 +872,6 @@ suite=("lighteval",), hf_repo="deepset/germanquad", hf_subset="plain_text", - trust_dataset=True, hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581", hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), evaluation_splits=("test",), @@ -991,7 +984,6 @@ suite=("lighteval",), hf_repo="eraldoluis/faquad", hf_subset="plain_text", - trust_dataset=True, hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546", hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), evaluation_splits=("validation",), @@ -1160,10 +1152,7 @@ hf_repo="ai4bharat/IndicQA", hf_subset=f"indicqa.{LangCodeLanguage.get(language.value).language}", hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), - # Since we use trust_dataset, we have to be careful about what is inside the dataset - # script. We thus lock the revision to ensure that the script doesn't change hf_revision="92d96092ae229950973dac3b9998f8b3a8949b0a", - trust_dataset=True, evaluation_splits=("test",), hf_avail_splits=("test",), generation_size=400, @@ -1331,13 +1320,10 @@ suite=["lighteval"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", - # Since we use trust_dataset, we have to be careful about what is inside the dataset - # script. 
We thus lock the revision to ensure that the script doesn't change hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", - trust_dataset=True, metrics=get_metrics_for_formulation( formulation, [ @@ -1397,7 +1383,6 @@ hf_repo="facebook/mlqa", hf_subset=f"mlqa.{standardize_tag(lang.value)}.{standardize_tag(lang.value)}", hf_revision="397ed406c1a7902140303e7faf60fff35b58d285", - trust_dataset=True, evaluation_splits=("test",), hf_avail_splits=["test"], generation_size=400, @@ -1736,7 +1721,6 @@ hf_subset=standardize_tag(language.value), hf_revision="refs/pr/1", hf_filter=partial(lambda subset, line: line["id"].split("/")[0] == subset, subset), - trust_dataset=True, evaluation_splits=("test",), few_shots_split="dev", metrics=get_metrics_for_formulation( @@ -2339,7 +2323,6 @@ hf_repo="jon-tow/okapi_arc_challenge", hf_subset=standardize_tag(language.value), hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4", - trust_dataset=True, evaluation_splits=("test",), few_shots_split="train", metrics=get_metrics_for_formulation( @@ -2398,7 +2381,6 @@ hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - trust_dataset=True, evaluation_splits=["test"], few_shots_split="validation", metrics=get_metrics_for_formulation( @@ -2557,7 +2539,6 @@ loglikelihood_acc_metric(normalization=LogProbCharNorm()), ], ), - trust_dataset=True, ) for formulation in [ MCFFormulation(), @@ -2645,7 +2626,6 @@ hf_repo="jon-tow/okapi_truthfulqa", hf_subset=standardize_tag(language.value), hf_revision="cdd5db1a66fd04105622109d1c2a5cbc8cde7586", - trust_dataset=True, evaluation_splits=("validation",), hf_avail_splits=["validation"], metrics=get_metrics_for_formulation( @@ -3068,7 +3048,6 @@ hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", - trust_dataset=True, metrics=get_metrics_for_formulation( formulation, [ @@ -3105,7 +3084,6 @@ hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - trust_dataset=True, evaluation_splits=["test"], few_shots_split="validation", metrics=get_metrics_for_formulation( @@ -3230,7 +3208,6 @@ loglikelihood_acc_metric(normalization=LogProbCharNorm()), ], ), - trust_dataset=True, ) for formulation in [ MCFFormulation(), @@ -3858,7 +3835,6 @@ language, subset, ), - trust_dataset=True, evaluation_splits=("train",), hf_avail_splits=["train"], stop_sequence=("\n",), @@ -4380,7 +4356,6 @@ def flores_adapter(lang1, lang2): generation_size=300, metrics=[Metrics.chrf_plus, Metrics.bleu, Metrics.bleu_1, Metrics.bleu_4], stop_sequence=["\n"], - trust_dataset=True, version=0, ) for (lang1, lang2) in permutations(flores_200_languages, 2) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 01c43e942..bdac9fb38 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -23,6 +23,7 @@ import collections import copy import importlib +import importlib.util import logging import os from functools import lru_cache @@ -30,8 +31,6 @@ from pathlib import Path from types import ModuleType -from datasets.load import dataset_module_factory - import lighteval.tasks.default_tasks as default_tasks from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig @@ -304,8 
+303,15 @@ def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleT if isinstance(custom_tasks, ModuleType): return custom_tasks if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks): - dataset_module = dataset_module_factory(str(custom_tasks), trust_remote_code=True) - return importlib.import_module(dataset_module.module_path) + module_name = os.path.splitext(os.path.basename(custom_tasks))[0] + spec = importlib.util.spec_from_file_location(module_name, custom_tasks) + + if spec is None: + raise ValueError(f"Cannot find module {module_name} at {custom_tasks}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module if isinstance(custom_tasks, (str, Path)): return importlib.import_module(str(custom_tasks)) diff --git a/tests/pipeline/test_reasoning_tags.py b/tests/pipeline/test_reasoning_tags.py index cadddd0d9..dd131e838 100644 --- a/tests/pipeline/test_reasoning_tags.py +++ b/tests/pipeline/test_reasoning_tags.py @@ -59,7 +59,6 @@ def setUp(self): few_shots_select=None, generation_size=10, stop_sequence=["\n"], - trust_dataset=True, num_fewshots=0, )
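Editor's note: the lcb/main.py hunk changes the sample-level metric signature from (predictions, formatted_doc) to (model_response: ModelResponse, doc: Doc), with the generations read from model_response.final_text. For anyone migrating their own metrics, here is a hedged sketch of a trivial metric written against that new signature; it is not part of the diff, the Doc import path and the use of doc.choices / doc.gold_index are assumptions, and it treats gold_index as a single int.

from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc  # assumed location of Doc


def my_exact_match_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
    """Hypothetical sample-level metric: 1.0 if any generation matches the gold choice."""
    predictions = model_response.final_text  # list of generated strings, as in codegen_metric
    gold = doc.choices[doc.gold_index]       # assumes gold_index is a single int
    return float(any(pred.strip() == gold.strip() for pred in predictions))

Such a function could then be wrapped in a SampleLevelMetric and registered with extend_enum, which is how the LCB task exposes Metrics.lcb_codegen_metric in the hunks above.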
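Editor's note: the registry.py change replaces datasets' dataset_module_factory — which required trust_remote_code=True just to import a local Python file — with plain importlib machinery. The sketch below restates that pattern as a standalone helper so it can be read in isolation; the spec.loader check is a small defensive addition not present in the diff, and the example path plus the TASKS_TABLE attribute are assumptions (community task files such as community_tasks/arabic_evals.py conventionally expose one).

import importlib.util
import os


def load_tasks_module(path: str):
    """Import a custom-tasks .py file by path, mirroring the new create_custom_tasks_module."""
    module_name = os.path.splitext(os.path.basename(path))[0]
    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:  # loader check added here for safety
        raise ValueError(f"Cannot find module {module_name} at {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# Hypothetical usage: list the task names a community file defines.
custom = load_tasks_module("community_tasks/arabic_evals.py")
print([cfg.name for cfg in custom.TASKS_TABLE])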