Add pass@1 for GPQA-D and MATH-500 (#698)

lewtun · web-flow · commit 743179a4ab25 · 2025-05-05T11:51:27.000+02:00
* Add pass@1 for GPQA-D and clean up AIME

* Add pass@1 for math_500

* Add pass@1 for MATH-500

* Update test

* Fix
diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py
@@ -53,7 +53,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=2048,
-    metric=[Metrics.gpqa_instruct_metric],
+    metric=[Metrics.gpqa_instruct_pass_at_1_1n],
     stop_sequence=[],  # no stop sequence, will use eos token
     trust_dataset=True,
     version=0,
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
@@ -370,6 +370,38 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    math_pass_at_1_1n = SampleLevelMetric(
+        metric_name="math_pass@1:1_samples",
+        sample_level_fn=PassAtK(
+            k=1,
+            n=1,
+            strip_strings=True,
+            # Extracting mathematical expressions and latex expressions
+            normalize_gold=lambda k: extract_target_from_pred(
+                k,
+                get_extraction_regexes(
+                    formatted_doc=None,
+                    target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
+                    language=Language.ENGLISH,
+                ),
+            ),
+            # Extracting mathematical expressions and latex expressions
+            normalize_pred=lambda k: extract_target_from_pred(
+                k,
+                get_extraction_regexes(
+                    formatted_doc=None,
+                    target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
+                    language=Language.ENGLISH,
+                ),
+            ),
+            # Uses sympy for comparison
+            sample_scoring_function=compare_gold_target,
+        ).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     math_pass_at_1_4n = SampleLevelMetric(
         metric_name="math_pass@1:4_samples",
         sample_level_fn=PassAtK(
@@ -838,6 +870,57 @@ class Metrics(Enum):
         pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
         precision=6,
     )
+    gpqa_instruct_pass_at_1_1n = SampleLevelMetric(
+        metric_name="gpqa_pass@1:1_samples",
+        sample_level_fn=PassAtK(
+            k=1,
+            n=1,
+            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                language=Language.ENGLISH,
+                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                precision=6,
+            ).sample_level_fn([ref], [pred], doc),
+        ).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    gpqa_instruct_pass_at_1_4n = SampleLevelMetric(
+        metric_name="gpqa_pass@1:4_samples",
+        sample_level_fn=PassAtK(
+            k=1,
+            n=4,
+            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                language=Language.ENGLISH,
+                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                precision=6,
+            ).sample_level_fn([ref], [pred], doc),
+        ).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    gpqa_instruct_pass_at_1_8n = SampleLevelMetric(
+        metric_name="gpqa_pass@1:8_samples",
+        sample_level_fn=PassAtK(
+            k=1,
+            n=8,
+            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                language=Language.ENGLISH,
+                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                precision=6,
+            ).sample_level_fn([ref], [pred], doc),
+        ).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
 
     def __str__(self):
         return self.name.replace("_at_", "@")
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
@@ -1163,7 +1163,9 @@ def __init__(
                 self.type_exact_match = "full"
             self.score_sample = self.default_sample_scoring
 
-    def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict[str, float]:
+    def compute(
+        self, golds: list[str], predictions: list[str], formatted_doc: Doc = None, **kwargs
+    ) -> dict[str, float]:
         """Computes the metric over a list of golds and predictions for one single item with possibly many samples.
         It applies normalisation (if needed) to model prediction and gold, computes their per prediction score,
         then aggregates the scores over the samples using a pass@k.
@@ -1189,7 +1191,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict[st
         all_scores = []
         for pred in predictions[: self.n]:
             cur_pred = self.get_processed_pred(pred=pred)
-            all_scores.append(self.score_sample(cur_pred, gold))
+            all_scores.append(self.score_sample(cur_pred, gold, formatted_doc))
 
         return self.pass_at_k(all_scores)
 
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
@@ -27,7 +27,7 @@
 from typing import Optional
 
 import torch
-from pydantic import NonNegativeFloat, PositiveInt
+from pydantic import NonNegativeFloat, NonNegativeInt, PositiveInt
 from tqdm import tqdm
 
 from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset
@@ -82,7 +82,7 @@ class VLLMModelConfig(ModelConfig):
     gpu_memory_utilization: NonNegativeFloat = 0.9  # lower this if you are running out of memory
     max_model_length: PositiveInt | None = None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
     swap_space: PositiveInt = 4  # CPU swap space size (GiB) per GPU.
-    seed: PositiveInt = 1234
+    seed: NonNegativeInt = 1234
     trust_remote_code: bool = False
     use_chat_template: bool = False
     add_special_tokens: bool = True
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
@@ -324,10 +324,14 @@
     few_shots_select=None,
     generation_size=32768,
     metric=[
-        Metrics.expr_gold_metric,
+        Metrics.math_pass_at_1_1n,
+        Metrics.math_pass_at_1_4n,
+        Metrics.math_pass_at_1_8n,
+        Metrics.math_pass_at_1_16n,
         Metrics.math_pass_at_1_32n,
+        Metrics.math_pass_at_1_64n,
     ],
-    version=1,
+    version=2,
 )
 aime24_gpassk = LightevalTaskConfig(
     name="aime24_gpassk",
@@ -355,10 +359,14 @@
     few_shots_select=None,
     generation_size=10000,
     metric=[
-        Metrics.expr_gold_metric,
+        Metrics.math_pass_at_1_1n,
+        Metrics.math_pass_at_1_4n,
+        Metrics.math_pass_at_1_8n,
+        Metrics.math_pass_at_1_16n,
         Metrics.math_pass_at_1_32n,
+        Metrics.math_pass_at_1_64n,
     ],
-    version=1,
+    version=2,
 )
 aime25_gpassk = LightevalTaskConfig(
     name="aime25_gpassk",
@@ -7809,10 +7817,14 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=32768,  # needed for reasoning models like R1
-    metric=[Metrics.gpqa_instruct_metric],
+    metric=[
+        Metrics.gpqa_instruct_pass_at_1_1n,
+        Metrics.gpqa_instruct_pass_at_1_4n,
+        Metrics.gpqa_instruct_pass_at_1_8n,
+    ],
     stop_sequence=[],  # no stop sequence, will use eos token
     trust_dataset=True,
-    version=0,
+    version=1,
 )
 gpqa_extended_instruct_lighteval = LightevalTaskConfig(
     name="gpqa:extended",
@@ -9688,8 +9700,11 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=32768,
-    metric=[Metrics.latex_gold_metric],
-    version=1,
+    metric=[
+        Metrics.math_pass_at_1_1n,
+        Metrics.math_pass_at_1_4n,
+    ],
+    version=2,
 )
 math_500_gpassk = LightevalTaskConfig(
     name="math_500_gpassk",