Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 0 additions & 20 deletions community_tasks/arabic_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -173,7 +172,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -238,7 +236,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -294,7 +291,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -351,7 +347,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -393,7 +388,6 @@ def arabic_exams_pfn(line, task_name: str = None):
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -451,7 +445,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand All @@ -471,7 +464,6 @@ def __init__(
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand All @@ -488,7 +480,6 @@ def __init__(
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand All @@ -505,7 +496,6 @@ def __init__(
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand All @@ -522,7 +512,6 @@ def __init__(
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand All @@ -539,7 +528,6 @@ def __init__(
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand All @@ -556,7 +544,6 @@ def __init__(
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -594,7 +581,6 @@ def boolq_arabic_pfn(line, task_name: str = None):
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -629,7 +615,6 @@ def copa_arabic_pfn(line, task_name: str = None):
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -673,7 +658,6 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -710,7 +694,6 @@ def toxigen_arabic_pfn(line, task_name: str = None):
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -761,7 +744,6 @@ def sciq_arabic_pfn(line, task_name: str = None):
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -826,7 +808,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand Down Expand Up @@ -1038,7 +1019,6 @@ def process_judge_response(response) -> float:
hf_avail_splits=["train"],
evaluation_splits=["train"],
metrics=[wrapped_judge],
trust_dataset=True,
generation_size=200,
stop_sequence=[],
version=0,
Expand Down
2 changes: 0 additions & 2 deletions community_tasks/french_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ def prompt_bac_fr(line, task_name: str = None):
generation_size=1,
metrics=[Metrics.loglikelihood_acc],
stop_sequence=["\n"],
trust_dataset=True,
version=0,
)

Expand All @@ -139,7 +138,6 @@ def prompt_bac_fr(line, task_name: str = None):
generation_size=1,
metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
stop_sequence=["\n"],
trust_dataset=True,
version=0,
)

Expand Down
3 changes: 0 additions & 3 deletions community_tasks/serbian_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,10 +283,7 @@ def create_task_config(
few_shots_select="sequential",
metric=metric,
generation_size=generation_size,
# Since we use trust_dataset, we have to be careful about what is inside the dataset
# script. We thus lock the revision to ensure that the script doesn't change
hf_revision=HFSubsets.HF_REVISION.value,
trust_dataset=True,
version=0,
)

Expand Down
1 change: 0 additions & 1 deletion community_tasks/turkic_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ def __init__(
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=False,
version=0,
)

Expand Down
1 change: 0 additions & 1 deletion docs/source/saving-and-reading-results.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ The detail file contains the following columns:
],
"original_num_docs": 1319,
"effective_num_docs": 1,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
}
Expand Down
1 change: 0 additions & 1 deletion examples/custom_tasks_templates/custom_yourbench_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,6 @@ def yourbench_prompt(line, task_name: str = ""):
generation_size=8192,
metrics=[Metrics.yourbench_metrics],
stop_sequence=[],
trust_dataset=True,
version=0,
)

Expand Down
3 changes: 1 addition & 2 deletions examples/custom_tasks_templates/custom_yourbench_task_mcq.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ def yourbench_prompt(line, task_name: str = ""):
few_shots_split=None,
few_shots_select=None,
generation_size=8192,
metric=[Metrics.yourbench_metrics],
trust_dataset=True,
metrics=[Metrics.yourbench_metrics],
version=0,
)

Expand Down
2 changes: 0 additions & 2 deletions examples/custom_tasks_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
generation_size=512,
metrics=[Metrics.expr_gold_metric],
stop_sequence=None,
trust_dataset=True,
version=0,
)

Expand All @@ -55,7 +54,6 @@
generation_size=2048,
metrics=[Metrics.gpqa_instruct_pass_at_1_1n],
stop_sequence=[], # no stop sequence, will use eos token
trust_dataset=True,
version=0,
)

Expand Down
20 changes: 0 additions & 20 deletions examples/nanotron/custom_evaluation_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ def preprocess(text):
hf_repo="hellaswag",
hf_subset="default",
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -99,7 +98,6 @@ def preprocess(text):
hf_repo="winogrande",
hf_subset="winogrande_xl",
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -108,7 +106,6 @@ def preprocess(text):
hf_repo="piqa",
hf_subset="plain_text",
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -118,7 +115,6 @@ def preprocess(text):
hf_subset="default",
hf_avail_splits=["train", "validation"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -127,7 +123,6 @@ def preprocess(text):
hf_repo="openbookqa",
hf_subset="main",
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -138,7 +133,6 @@ def preprocess(text):
evaluation_splits=["test"],
generation_size=1,
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -149,7 +143,6 @@ def preprocess(text):
evaluation_splits=["test"],
generation_size=1,
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -158,7 +151,6 @@ def preprocess(text):
hf_repo="commonsense_qa",
hf_subset="default",
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
]
Expand Down Expand Up @@ -189,7 +181,6 @@ def natural_questions_prompt(line, task_name: str = None):
hf_subset="rc.nocontext",
metric=[Metrics.quasi_exact_match],
generation_size=20,
trust_dataset=True,
stop_sequence=["\n", ".", ","],
),
LightevalTaskConfig(
Expand All @@ -199,7 +190,6 @@ def natural_questions_prompt(line, task_name: str = None):
hf_subset="default",
metric=[Metrics.quasi_exact_match],
generation_size=20,
trust_dataset=True,
stop_sequence=["\n", ".", ","],
),
]
Expand Down Expand Up @@ -228,7 +218,6 @@ def boolq_prompt(line, task_name: str = None):
hf_repo="super_glue",
hf_subset="boolq",
metric=[Metrics.target_perplexity],
trust_dataset=True,
stop_sequence=["\n"],
),
LightevalTaskConfig(
Expand All @@ -238,7 +227,6 @@ def boolq_prompt(line, task_name: str = None):
hf_subset="deault",
metric=[Metrics.quasi_exact_match],
generation_size=20,
trust_dataset=True,
stop_sequence=["\n", ".", ","],
),
]
Expand Down Expand Up @@ -266,7 +254,6 @@ def __init__(
few_shots_select=None,
suite=["custom"],
generation_size=40,
trust_dataset=True,
stop_sequence=None,
):
super().__init__(
Expand All @@ -281,7 +268,6 @@ def __init__(
few_shots_select=few_shots_select,
suite=suite,
generation_size=generation_size,
trust_dataset=trust_dataset,
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
)

Expand Down Expand Up @@ -365,7 +351,6 @@ def __init__(
few_shots_select=None,
suite=None,
generation_size=-1,
trust_dataset=True,
stop_sequence=None,
):
super().__init__(
Expand All @@ -380,7 +365,6 @@ def __init__(
few_shots_select=few_shots_select,
suite=suite,
generation_size=generation_size,
trust_dataset=trust_dataset,
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
)

Expand Down Expand Up @@ -478,7 +462,6 @@ def __init__(
few_shots_select=None,
suite=None,
generation_size=4,
trust_dataset=True,
stop_sequence=None,
):
super().__init__(
Expand All @@ -493,7 +476,6 @@ def __init__(
few_shots_select=few_shots_select,
suite=suite,
generation_size=generation_size,
trust_dataset=trust_dataset,
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
)

Expand Down Expand Up @@ -610,7 +592,6 @@ def __init__(
few_shots_select=None,
suite=None,
generation_size=-1,
trust_dataset=True,
stop_sequence=None,
):
super().__init__(
Expand All @@ -625,7 +606,6 @@ def __init__(
few_shots_select=few_shots_select,
suite=suite,
generation_size=generation_size,
trust_dataset=trust_dataset,
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
)

Expand Down
Loading
Loading