diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py
index c9745b51a..3e0b45121 100644
--- a/src/lighteval/metrics/dynamic_metrics.py
+++ b/src/lighteval/metrics/dynamic_metrics.py
@@ -61,8 +61,8 @@ def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None)
     Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization.
     """

-    normalization_str = normalization.name if normalization else ""
-    metric_name = f"acc_{normalization_str}"
+    normalization_str = f"_{normalization.name}" if normalization else ""
+    metric_name = f"acc{normalization_str}"
     return SampleLevelMetric(
         metric_name=metric_name,
         sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization).compute,
@@ -83,8 +83,8 @@ def normalized_multi_choice_prob_metric(
     Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of
     probabilities of all choices (after logprobs are normalized).
     """
-    normalization_str = normalization.name if normalization else ""
-    metric_name = "_".join(filter(None, ["normalized_mc_prob_", normalization_str]))
+    normalization_str = f"_{normalization.name}" if normalization else ""
+    metric_name = f"normalized_mc_prob{normalization_str}"

     return SampleLevelMetric(
         metric_name=metric_name,
@@ -108,8 +108,8 @@ def probability_metric(
     Creates a probability metric, which returns the probability of the gold choice given normalization.
     """

-    normalization_str = normalization.name if normalization else ""
-    metric_name = "_".join(filter(None, ["prob", normalization_str]))
+    normalization_str = f"_{normalization.name}" if normalization else ""
+    metric_name = f"prob{normalization_str}"

     return SampleLevelMetric(
         metric_name=metric_name,
@@ -188,7 +188,7 @@ def multilingual_quasi_exact_match_metric(
 def multilingual_extractive_match_metric(
     language: Language = Language.ENGLISH,
     gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),),
-    pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),),
+    pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()),
     aggregation_function: Callable[[list[float]], float] = max,
     fallback_mode: Literal["no_fallback", "first_match"] = "first_match",
     extraction_mode: Literal["first_match", "any_match"] = "any_match",
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 2745b63c5..0982bfec5 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -2774,3 +2774,10 @@ def xsum(line, task_name: str = None):
         choices=[str(line["summary"])],
         specific={"text": line["article"]},
     )
+
+
+# Utility for drop task
+def get_drop_date(x):
+    components = [x["day"], x["month"], x["year"]]
+    components = list(filter(lambda x: x, components))
+    return " ".join(components)
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index 3960e6f5c..9fb48b9e9 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -22,6 +22,8 @@
 import lighteval.tasks.default_prompts as prompt
 from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language


 abstract_narrative_understanding_bigbench = LightevalTaskConfig(
@@ -6627,21 +6629,28 @@
     trust_dataset=True,
     version=0,
 )
-coqa_lighteval = LightevalTaskConfig(
+coqa_first_question = LightevalTaskConfig(
     name="coqa",
-    suite=["lighteval"],
-    prompt_function=prompt.coqa,
-    hf_repo="coqa",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "question": line["questions"][0],
+            "context": line["story"],
+            "choices": [line["answers"]["input_text"][0]],
+        },
+    ),
+    suite=("lighteval",),
+    hf_repo="stanfordnlp/coqa",
     hf_subset="default",
     hf_avail_splits=["train", "validation"],
     evaluation_splits=["validation"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=10,
-    metric=[Metrics.perfect_exact_match, Metrics.f1_score],
-    stop_sequence=["\n"],
-    trust_dataset=True,
-    version=0,
+    stop_sequence=["\n", "Question:", "question:"],
+    generation_size=100,
+    version=1,
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
 )
 coqa_bb_lighteval = LightevalTaskConfig(
     name="coqa_bb",
@@ -6835,21 +6844,43 @@
     trust_dataset=True,
     version=0,
 )
-drop_lighteval = LightevalTaskConfig(
+drop_qa = LightevalTaskConfig(
     name="drop",
-    suite=["lighteval"],
-    prompt_function=prompt.drop,
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "context": line["passage"],
+            "question": line["question"],
+            "choices": list(
+                filter(
+                    lambda x: x,
+                    [line["answer"].get("number")]
+                    + line["answer"]["spans"]
+                    + [prompt.get_drop_date(line["answer"].get("date"))],
+                )
+            ),
+        },
+    ),
+    suite=("lighteval",),
     hf_repo="lighteval/drop_harness",
     hf_subset="default",
-    hf_avail_splits=["train", "validation"],
-    evaluation_splits=["validation"],
+    hf_filter=lambda line: list(
+        filter(
+            lambda x: x,
+            [line["answer"].get("number")]
+            + line["answer"]["spans"]
+            + [prompt.get_drop_date(line["answer"].get("date"))],
+        )
+    ),
+    evaluation_splits=("validation",),
     few_shots_split="train",
-    few_shots_select="random_sampling_from_train",
-    generation_size=None,
-    metric=[Metrics.drop],
-    stop_sequence=["."],
-    trust_dataset=True,
-    version=0,
+    generation_size=250,
+    stop_sequence=["Question:", "question:", "\n"],
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+    version=1,
 )
 dyck_language_2_helm = LightevalTaskConfig(
     name="dyck_language:2",
@@ -8581,6 +8612,27 @@
     trust_dataset=True,
     version=0,
 )
+jeopardy = LightevalTaskConfig(
+    name="jeopardy",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "question": line["question"],
+            "choices": [line["answer"]],
+        },
+    ),
+    suite=("lighteval",),
+    hf_repo="openaccess-ai-collective/jeopardy",
+    hf_subset="default",
+    evaluation_splits=("train",),
+    few_shots_split="train",
+    generation_size=250,
+    stop_sequence=["\n", "Question:", "question:"],
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+)
 kanji_ascii_bigbench = LightevalTaskConfig(
     name="kanji_ascii",
     suite=["bigbench", "bigbench_json"],
@@ -13665,6 +13717,24 @@
     trust_dataset=True,
     version=0,
 )
+natural_questions = LightevalTaskConfig(
+    name="natural_questions",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {"question": line["question"], "choices": [line["answer"]]},
+    ),
+    suite=("lighteval",),
+    hf_repo="lighteval/small_natural_questions",
+    hf_subset="default",
+    evaluation_splits=("test",),
+    few_shots_split="few_shot",
+    generation_size=250,
+    stop_sequence=["\n", "Question:", "question:"],
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+)
 navigate_bigbench = LightevalTaskConfig(
     name="navigate",
suite=["bigbench", "bigbench_json"], @@ -14885,7 +14955,7 @@ hf_subset="default", hf_avail_splits=["test"], evaluation_splits=["test"], - few_shots_split=None, + few_shots_split="few_shot", few_shots_select=None, generation_size=2048, metric=[Metrics.simpleqa_judge], @@ -15074,6 +15144,29 @@ trust_dataset=True, version=0, ) +squad_v2 = LightevalTaskConfig( + name="squad_v2", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="rajpurkar/squad_v2", + hf_subset="squad_v2", + hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0), + evaluation_splits=("validation",), + few_shots_split="train", + stop_sequence=["\n", "Question:", "question:"], + generation_size=200, + metric=( + Metrics.prefix_quasi_exact_match, + Metrics.f1_score_quasi, + ), +) storycloze_2016_lighteval = LightevalTaskConfig( name="storycloze:2016", suite=["lighteval", "storycloze"],