huggingface · NathanHB · May 28, 2025 · May 23, 2025 · May 23, 2025
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
@@ -911,6 +911,22 @@ def gpqa_instruct(line, task_name: str = None):
     )
 
 
+def gsm_plus(line, task_name: str = None):
+    # GSM8K with 8 prompt variations per sample; builds one Doc from one dataset row.
+
+    # Some prompts require critical thinking (around 1k/10k). We skip them by returning
+    # None instead of a Doc, as they are trickier to eval with regular text extraction.
+    if line["perturbation_type"] == "critical thinking":
+        return None
+
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n\nAnswer:",
+        choices=[line["answer"]],  # single reference answer, so the gold index is 0
+        gold_index=0,
+    )
+
+
 def gsm8k(line, task_name: str = None):
     # Has special analysis in metric for number decomposition
     return Doc(

diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
@@ -7960,6 +7960,22 @@
     trust_dataset=True,
     version=0,
 )
+gsm_plus = LightevalTaskConfig(
+    name="gsm_plus",
+    suite=["lighteval"],
+    prompt_function=prompt.gsm_plus,
+    hf_repo="qintongli/GSM-Plus",
+    hf_subset="default",
+    hf_avail_splits=["test", "testmini"],
+    evaluation_splits=["test"],  # "testmini" is listed as available but is not evaluated
+    few_shots_split=None,  # no few-shot split and no few-shot selection are set
+    few_shots_select=None,
+    generation_size=None,  # NOTE(review): no explicit generation limit is set here; confirm the default suits this task
+    metric=[Metrics.expr_gold_metric],
+    stop_sequence=None,  # NOTE(review): no stop sequence is set; confirm generation ends as intended
+    trust_dataset=True,
+    version=0,
+)
 gsm8k_leaderboard = LightevalTaskConfig(
     name="gsm8k",
     suite=["leaderboard"],