diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 3bdaefc23..786c4a0b1 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -911,6 +911,22 @@ def gpqa_instruct(line, task_name: str = None):
     )
 
 
+def gsm_plus(line, task_name: str = None):
+    # Prompt builder for GSM-Plus rows: GSM8K with 8 prompt variations per sample.
+
+    # Some prompts require critical thinking (around 1k/10k); we return None for them (to skip them), as
+    # they are a bit trickier to eval with regular text extraction. NOTE(review): confirm callers drop None docs.
+    if line["perturbation_type"] == "critical thinking":
+        return None
+
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n\nAnswer:",
+        choices=[line["answer"]],  # the row's "answer" field is the only choice
+        gold_index=0,  # presumably indexes into `choices` (its single entry) -- confirm against Doc
+    )
+
+
 def gsm8k(line, task_name: str = None):
     # Has special analysis in metric for number decomposition
     return Doc(
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index 488520bae..b77b27d52 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -7960,6 +7960,22 @@
     trust_dataset=True,
     version=0,
 )
+gsm_plus = LightevalTaskConfig(
+    name="gsm_plus",
+    suite=["lighteval"],
+    prompt_function=prompt.gsm_plus,  # rows are formatted by the gsm_plus prompt function
+    hf_repo="qintongli/GSM-Plus",
+    hf_subset="default",
+    hf_avail_splits=["test", "testmini"],
+    evaluation_splits=["test"],  # only "test" is evaluated; "testmini" is listed as available but unused here
+    few_shots_split=None,  # no few-shot split or selection strategy is configured
+    few_shots_select=None,
+    generation_size=None,  # NOTE(review): no explicit generation cap is set -- confirm the default is acceptable
+    metric=[Metrics.expr_gold_metric],
+    stop_sequence=None,  # NOTE(review): no stop sequence is set -- confirm generation terminates as intended
+    trust_dataset=True,
+    version=0,
+)
 gsm8k_leaderboard = LightevalTaskConfig(
     name="gsm8k",
     suite=["leaderboard"],