Add aime24, aime25 and math_500 tasks (#586)

NathanHB · web-flow · commit 25ded1566603 · 2025-02-25T18:06:17.000+01:00
* commit

* Apply suggestions from code review

* commit

* add prompt to math 500

* add prompt to math 500
diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml
@@ -1,6 +1,6 @@
 model:
   base_params:
-    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main,dtype=bfloat16" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
+    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B-Instruct,revision=main,dtype=bfloat16"
   generation:
     temperature: 0.3
     repetition_penalty: 1.0
@@ -10,5 +10,4 @@ model:
     top_k: -1
     min_p: 0.0
     top_p: 0.9
-    max_new_tokens: 256
-    stop_tokens: ["<EOS>", "<PAD>"]
+    max_new_tokens: 2048
diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py
@@ -468,8 +468,8 @@ def litellm(
     if model_args.endswith(".yaml"):
         model_config = LiteLLMModelConfig.from_path(model_args)
     else:
-        model_name = model_args.split(",")[0].strip()
-        model_config = LiteLLMModelConfig(model=model_name)
+        model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
+        model_config = LiteLLMModelConfig(**model_args_dict)
 
     pipeline_params = PipelineParameters(
         launcher_type=parallelism_manager,
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
@@ -25,7 +25,9 @@
 from aenum import Enum
 
 from lighteval.metrics.dynamic_metrics import (
+    ExprExtractionConfig,
     IndicesExtractionConfig,
+    LatexExtractionConfig,
     multilingual_extractive_match_metric,
 )
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
@@ -178,6 +180,15 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    expr_gold_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        fallback_mode="first_match",
+        precision=5,
+        gold_extraction_target=(ExprExtractionConfig(),),  # gold answers are extracted with the Expr config only
+        # Predictions are extracted with both configs; match boxed first before trying other regexes
+        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+        aggregation_function=max,
+    )
     extractiveness = SampleLevelMetricGrouping(
         metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
         sample_level_fn=Extractiveness(
@@ -238,6 +249,15 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    latex_gold_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        fallback_mode="first_match",
+        precision=5,
+        gold_extraction_target=(LatexExtractionConfig(),),  # gold answers are extracted with the LaTeX config only
+        # Predictions are extracted with both configs; match boxed first before trying other regexes
+        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+        aggregation_function=max,
+    )
     loglikelihood_acc = SampleLevelMetric(
         metric_name="acc",
         sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute,
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
@@ -247,7 +247,11 @@ def greedy_until(
                 # the case! Because of that we only use batch size of 1
                 stop_tokens = dataset[0].stop_sequence
 
-            max_new_tokens = dataset[0].generation_size  # could be none
+            max_new_tokens = (
+                dataset[0].generation_size
+                if self.sampling_params.max_tokens is None
+                else self.sampling_params.max_tokens
+            )
             returns_logits = dataset[0].use_logits
             num_samples = dataset[0].num_samples
 
@@ -321,9 +325,7 @@ def _generate(
         sampling_params = self.sampling_params.clone() or SamplingParams()
         if generate:
             sampling_params.n = num_samples
-            sampling_params.max_tokens = (
-                max_new_tokens if sampling_params.max_tokens is None else sampling_params.max_tokens
-            )
+            sampling_params.max_tokens = max_new_tokens
             sampling_params.stop = stop_tokens
             sampling_params.logprobs = 1 if returns_logits else 0
 
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
@@ -43,6 +43,24 @@
 # fmt: on
 
 
+def aime_prompt_fn(line, task_name: str = None):
+    """Build the `Doc` for one AIME problem.
+
+    The query is a step-by-step, boxed-answer instruction followed by `line["problem"]`;
+    the single choice, which is also the gold, is `line["answer"]`.
+    Template adapted from simple-evals (https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17)
+    and Llama 3 (https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details).
+    Having the final answer in a box is important for math-verify to work correctly.
+    """
+    template = """
+Solve the following math problem efficiently and clearly.  The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+
+{Question}
+""".strip()
+    prompt = template.format(Question=line["problem"])
+    return Doc(task_name=task_name, query=prompt, choices=[line["answer"]], gold_index=0)
+
+
 def anli(line, task_name: str = None):
     return Doc(
         task_name=task_name,
@@ -710,22 +728,31 @@ def ethics_virtue(line, task_name: str = None):
 
 
 def gpqa(line, task_name: str = None):
+    """Build a GPQA multiple-choice `Doc` with the correct answer inserted at a random position.
+
+    Prompt template from simple-evals:
+    https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14
+
+    The query is the instruction sentence, the question, then the four choices labelled A) to D).
+    Only the instruction sentence is passed as `Doc.instruction`, so that field does not repeat the
+    question and choices. The `Doc` choices are letters from `LETTER_INDICES`; `gold_index` marks the correct one.
+    """
+    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
+    query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
     gold_index = random.randint(0, 3)
     choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
     choices.insert(gold_index, line["Correct Answer"])
 
-    instruction = "Select the correct answer to the following questions.\n\n"
-
-    query = f"Question: {line['Question']}\n"
-    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
-    query += "Answer: "
+    query = query_template.format(
+        Instruction=instruction, A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"]
+    )
 
     return Doc(
         task_name=task_name,
-        query=f"{instruction}{query}",
+        query=query,
         choices=LETTER_INDICES[: len(choices)],
         gold_index=gold_index,
         instruction=instruction,
     )
 
 
@@ -1257,6 +1284,25 @@ def lsat_qa(line, task_name: str = None):
     )
 
 
+def math_500(line, task_name: str = None):
+    """Build the `Doc` for one MATH-500 problem.
+
+    The query is a step-by-step, boxed-answer instruction followed by `line["problem"]`;
+    the single choice, which is also the gold, is `line["solution"]`.
+
+    Template adapted from simple-evals (https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17)
+    and Llama 3 (https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details).
+    Having the final answer in a box is important for math-verify to work correctly.
+    """
+    template = """
+Solve the following math problem efficiently and clearly.  The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+
+{Question}
+""".strip()
+    prompt = template.format(Question=line["problem"])
+    return Doc(task_name=task_name, query=prompt, gold_index=0, choices=[line["solution"]])
+
+
 def math(line, task_name: str = None):
     return Doc(
         task_name=task_name,
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
@@ -312,6 +312,34 @@
     trust_dataset=True,
     version=0,
 )
+aime24 = LightevalTaskConfig(
+    name="aime24",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,  # same prompt function as aime25
+    hf_repo="HuggingFaceH4/aime_2024",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],  # evaluated on the only split listed above
+    few_shots_split=None,  # no few-shot split or selection strategy is configured
+    few_shots_select=None,
+    generation_size=32768,  # same value as math_500; aime25 uses 10000
+    metric=[Metrics.expr_gold_metric],
+    version=1,
+)
+aime25 = LightevalTaskConfig(
+    name="aime25",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,  # same prompt function as aime24
+    hf_repo="yentinglin/aime_2025",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],  # evaluated on the only split listed above
+    few_shots_split=None,  # no few-shot split or selection strategy is configured
+    few_shots_select=None,
+    generation_size=10000,  # NOTE(review): aime24 and math_500 use 32768 - confirm the smaller value is intended
+    metric=[Metrics.expr_gold_metric],
+    version=1,
+)
 anachronisms_bigbench = LightevalTaskConfig(
     name="anachronisms",
     suite=["bigbench", "bigbench_json"],
@@ -9597,6 +9625,20 @@
     trust_dataset=True,
     version=0,
 )
+math_500 = LightevalTaskConfig(
+    name="math_500",
+    suite=["lighteval"],
+    prompt_function=prompt.math_500,
+    hf_repo="HuggingFaceH4/MATH-500",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],  # evaluated on the only split listed above
+    few_shots_split=None,  # no few-shot split or selection strategy is configured
+    few_shots_select=None,
+    generation_size=32768,  # same value as aime24
+    metric=[Metrics.latex_gold_metric],  # gold is extracted with the LaTeX config, unlike the AIME tasks
+    version=1,
+)
 math_algebra_lighteval = LightevalTaskConfig(
     name="math:algebra",
     suite=["lighteval", "math"],