From a6891a53e9e3afe5e1bdf1820d177ab9ea5df32d Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Tue, 29 Apr 2025 00:56:23 +0000 Subject: [PATCH 1/5] add smolm generative tasks --- src/lighteval/metrics/dynamic_metrics.py | 14 +-- src/lighteval/tasks/default_prompts.py | 7 ++ src/lighteval/tasks/default_tasks.py | 114 ++++++++++++++++++----- 3 files changed, 104 insertions(+), 31 deletions(-) diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index 34f69a8bd..9df264a77 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -61,8 +61,8 @@ def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None) Creates a accuracy (loglikelihood) metric, which returns accuracy given normalization. """ - normalization_str = normalization.name if normalization else "" - metric_name = f"acc_{normalization_str}" + normalization_str = f"_{normalization.name}" if normalization else "" + metric_name = f"acc{normalization_str}" return SampleLevelMetric( metric_name=metric_name, sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization).compute, @@ -83,8 +83,8 @@ def normalized_multi_choice_prob_metric( Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized). """ - normalization_str = normalization.name if normalization else "" - metric_name = "_".join(filter(None, ["normalized_mc_prob_", normalization_str])) + normalization_str = f"_{normalization.name}" if normalization else "" + metric_name = f"normalized_mc_prob{normalization_str}" return SampleLevelMetric( metric_name=metric_name, @@ -108,8 +108,8 @@ def probability_metric( Creates a probability metric, which returns the probability of the gold choice given normalization. 
""" - normalization_str = normalization.name if normalization else "" - metric_name = "_".join(filter(None, ["prob", normalization_str])) + normalization_str = f"_{normalization.name}" if normalization else "" + metric_name = f"prob{normalization_str}" return SampleLevelMetric( metric_name=metric_name, @@ -188,7 +188,7 @@ def multilingual_quasi_exact_match_metric( def multilingual_extractive_match_metric( language: Language = Language.ENGLISH, gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), - pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), + pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),LatexExtractionConfig()), aggregation_function: Callable[[list[float]], float] = max, fallback_mode: Literal["no_fallback", "first_match"] = "first_match", extraction_mode: Literal["first_match", "any_match"] = "any_match", diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 2745b63c5..0982bfec5 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -2774,3 +2774,10 @@ def xsum(line, task_name: str = None): choices=[str(line["summary"])], specific={"text": line["article"]}, ) + + +# Utility for drop task +def get_drop_date(x): + components = [x["day"], x["month"], x["year"]] + components = list(filter(lambda x: x, components)) + return " ".join(components) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index f092092ce..557299ebf 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -22,6 +22,8 @@ import lighteval.tasks.default_prompts as prompt from lighteval.metrics.metrics import Metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language abstract_narrative_understanding_bigbench = LightevalTaskConfig( @@ -6619,21 +6621,27 @@ trust_dataset=True, version=0, ) -coqa_lighteval = LightevalTaskConfig( - name="coqa", - suite=["lighteval"], - prompt_function=prompt.coqa, - hf_repo="coqa", +coqa_first_question = LightevalTaskConfig( + name="coqa_first_question", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "question": line["questions"][0], + "context": line["story"], + "choices": [line["answers"]["input_text"][0]], + }, + ), + suite=("lighteval",), + hf_repo="stanfordnlp/coqa", hf_subset="default", hf_avail_splits=["train", "validation"], evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metric=[Metrics.perfect_exact_match, Metrics.f1_score], - stop_sequence=["\n"], - trust_dataset=True, - version=0, + stop_sequence=["\n", "Question:", "question:"], + generation_size=100, + metric=( + Metrics.prefix_quasi_exact_match, + Metrics.f1_score_quasi, + ), ) coqa_bb_lighteval = LightevalTaskConfig( name="coqa_bb", @@ -6827,21 +6835,38 @@ trust_dataset=True, version=0, ) -drop_lighteval = LightevalTaskConfig( - name="drop", - suite=["lighteval"], - prompt_function=prompt.drop, +drop_qa = LightevalTaskConfig( + name="drop_fixed", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "context": line["passage"], + "question": line["question"], + "choices": list( + filter( + lambda x: x, + [line["answer"].get("number")] + line["answer"]["spans"] + [prompt.get_drop_date(line["answer"].get("date"))], + ) + ), + }, + ), + 
suite=("lighteval",), hf_repo="lighteval/drop_harness", hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], + hf_filter=lambda line: list( + filter( + lambda x: x, + [line["answer"].get("number")] + line["answer"]["spans"] + [prompt.get_drop_date(line["answer"].get("date"))], + ) + ), + evaluation_splits=("validation",), few_shots_split="train", - few_shots_select="random_sampling_from_train", - generation_size=None, - metric=[Metrics.drop], - stop_sequence=["."], - trust_dataset=True, - version=0, + generation_size=250, + stop_sequence=["Question:", "Question", "question", "question:", "\n"], + metric=( + Metrics.prefix_quasi_exact_match, + Metrics.f1_score_quasi, + ), ) dyck_language_2_helm = LightevalTaskConfig( name="dyck_language:2", @@ -13648,6 +13673,24 @@ trust_dataset=True, version=0, ) +natural_questions = LightevalTaskConfig( + name="natural_questions", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: {"question": line["question"], "choices": [line["answer"]]}, + ), + suite=("lighteval",), + hf_repo="lighteval/small_natural_questions", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="few_shot", + generation_size=250, + stop_sequence=["\n", "Question:", "question:"], + metric=( + Metrics.prefix_quasi_exact_match, + Metrics.f1_score_quasi, + ), +) navigate_bigbench = LightevalTaskConfig( name="navigate", suite=["bigbench", "bigbench_json"], @@ -15057,6 +15100,29 @@ trust_dataset=True, version=0, ) +squad_v2 = LightevalTaskConfig( + name="squad_v2", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="rajpurkar/squad_v2", + hf_subset="squad_v2", + hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0), + evaluation_splits=("validation",), + few_shots_split="train", + stop_sequence=["\n", "Question:", "question:"], + generation_size=200, + metric=( + Metrics.prefix_quasi_exact_match, + Metrics.f1_score_quasi, + ), +) storycloze_2016_lighteval = LightevalTaskConfig( name="storycloze:2016", suite=["lighteval", "storycloze"], From a767b063aca11dda5316cc71fe4b44e129776d77 Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Tue, 29 Apr 2025 01:03:56 +0000 Subject: [PATCH 2/5] add jeopardy --- src/lighteval/tasks/default_tasks.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 557299ebf..3ae3a1479 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -8592,6 +8592,27 @@ trust_dataset=True, version=0, ) +jeopardy = LightevalTaskConfig( + name="jeopardy", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "question": line["question"], + "choices": [line["answer"]], + }, + ), + suite=("lighteval",), + hf_repo="openaccess-ai-collective/jeopardy", + hf_subset="default", + evaluation_splits=("train",), + few_shots_split="train", + generation_size=250, + stop_sequence=["\n", "Question:", "question:"], + metric=( + Metrics.prefix_quasi_exact_match, + Metrics.f1_score_quasi, + ), +) kanji_ascii_bigbench = LightevalTaskConfig( name="kanji_ascii", suite=["bigbench", "bigbench_json"], @@ -14911,7 +14932,7 @@ hf_subset="default", hf_avail_splits=["test"], evaluation_splits=["test"], 
- few_shots_split=None, + few_shots_split="few_shot", few_shots_select=None, generation_size=2048, metric=[Metrics.simpleqa_judge], From 0534e3064dff49f6db2c13ddcc81f732d71230af Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Mon, 12 May 2025 18:17:22 +0200 Subject: [PATCH 3/5] =?UTF-8?q?pretty=20=F0=9F=A5=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lighteval/metrics/dynamic_metrics.py | 2 +- src/lighteval/tasks/default_tasks.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index 9df264a77..e33f47e0b 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -188,7 +188,7 @@ def multilingual_quasi_exact_match_metric( def multilingual_extractive_match_metric( language: Language = Language.ENGLISH, gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), - pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),LatexExtractionConfig()), + pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()), aggregation_function: Callable[[list[float]], float] = max, fallback_mode: Literal["no_fallback", "first_match"] = "first_match", extraction_mode: Literal["first_match", "any_match"] = "any_match", diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 3ae3a1479..4de6b0873 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -6845,7 +6845,9 @@ "choices": list( filter( lambda x: x, - [line["answer"].get("number")] + line["answer"]["spans"] + [prompt.get_drop_date(line["answer"].get("date"))], + [line["answer"].get("number")] + + line["answer"]["spans"] + + [prompt.get_drop_date(line["answer"].get("date"))], ) ), }, @@ -6856,7 +6858,9 @@ hf_filter=lambda line: list( filter( lambda x: x, - [line["answer"].get("number")] + line["answer"]["spans"] + [prompt.get_drop_date(line["answer"].get("date"))], + [line["answer"].get("number")] + + line["answer"]["spans"] + + [prompt.get_drop_date(line["answer"].get("date"))], ) ), evaluation_splits=("validation",), From d91b6212d9ea49ad93e6c97e2b19ae1fb9ee54fe Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Mon, 12 May 2025 18:21:27 +0200 Subject: [PATCH 4/5] consistent stop sequences --- src/lighteval/tasks/default_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 4de6b0873..c092f286a 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -6866,7 +6866,7 @@ evaluation_splits=("validation",), few_shots_split="train", generation_size=250, - stop_sequence=["Question:", "Question", "question", "question:", "\n"], + stop_sequence=["Question:", "question:", "\n"], metric=( Metrics.prefix_quasi_exact_match, Metrics.f1_score_quasi, From 5b7e073ed2ea8f4aaf58fbf7ced3249f3b3b9536 Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Thu, 15 May 2025 13:59:58 +0200 Subject: [PATCH 5/5] add versions + change names --- src/lighteval/tasks/default_tasks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index c092f286a..318087e8e 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -6622,7 +6622,7 @@ 
version=0, ) coqa_first_question = LightevalTaskConfig( - name="coqa_first_question", + name="coqa", prompt_function=get_qa_prompt_function( Language.ENGLISH, lambda line: { @@ -6638,6 +6638,7 @@ evaluation_splits=["validation"], stop_sequence=["\n", "Question:", "question:"], generation_size=100, + version=1, metric=( Metrics.prefix_quasi_exact_match, Metrics.f1_score_quasi, @@ -6836,7 +6837,7 @@ version=0, ) drop_qa = LightevalTaskConfig( - name="drop_fixed", + name="drop", prompt_function=get_qa_prompt_function( Language.ENGLISH, lambda line: { @@ -6871,6 +6872,7 @@ Metrics.prefix_quasi_exact_match, Metrics.f1_score_quasi, ), + version=1, ) dyck_language_2_helm = LightevalTaskConfig( name="dyck_language:2",
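
Note (outside the patches above, for readers of this series): the drop_qa config builds its gold answers by flattening a DROP answer record with the same expression in both the prompt lambda and hf_filter. Below is a minimal, self-contained sketch of what that flattening computes, assuming a DROP-style answer record with "number", "spans", and a "date" dict of "day"/"month"/"year" strings; the sample record is hypothetical and the helper simply mirrors prompt.get_drop_date from the first patch.

    def get_drop_date(x):
        # Mirror of prompt.get_drop_date: join the non-empty date components.
        components = [x["day"], x["month"], x["year"]]
        components = list(filter(lambda c: c, components))
        return " ".join(components)

    def flatten_drop_answer(answer):
        # Mirror of the choices / hf_filter lambdas: keep the numeric answer,
        # all answer spans, and the formatted date, dropping empty strings.
        return list(
            filter(
                lambda x: x,
                [answer.get("number")]
                + answer["spans"]
                + [get_drop_date(answer.get("date"))],
            )
        )

    if __name__ == "__main__":
        sample = {  # hypothetical DROP-style answer record
            "number": "",
            "spans": ["Kevin Boss"],
            "date": {"day": "", "month": "", "year": ""},
        }
        print(flatten_drop_answer(sample))  # -> ['Kevin Boss']

Rows where this list is empty carry no usable gold answer, which is why the same expression doubles as the hf_filter in the drop_qa task.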