From b06e18e18b6068241904dc392087ae103feacf5e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 16 Apr 2024 13:36:35 +0000 Subject: [PATCH 01/13] init --- src/lighteval/evaluator.py | 6 +- src/lighteval/metrics/__init__.py | 7 +- src/lighteval/metrics/metrics.py | 17 +++++ src/lighteval/metrics/metrics_sample.py | 87 +++++++++++++++++++++++++ src/lighteval/metrics/utils.py | 3 +- src/lighteval/models/base_model.py | 37 ++++++++++- src/lighteval/tasks/lighteval_task.py | 25 ++++++- src/lighteval/tasks/requests.py | 20 ++++++ 8 files changed, 190 insertions(+), 12 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index c77c38890..5fa511edf 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -82,12 +82,14 @@ def evaluate( # noqa: C901 full_resps = lm.loglikelihood(requests, override_bs=override_bs) elif request_type == RequestType.LOGLIKELIHOOD_SINGLE_TOKEN: full_resps = lm.loglikelihood_single_token(requests, override_bs=override_bs) + elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: + full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL: full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) - elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: - full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) + elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: + full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 7ef77aefd..68727b8e9 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -91,7 +91,7 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] for metric in metrics: - if Metrics[metric].value.category == MetricCategory.GENERATIVE: + if Metrics[metric].value.category in [MetricCategory.GENERATIVE, MetricCategory.GENERATIVE_SAMPLING]: outputs.update(Metrics[metric].value.compute(golds=golds, predictions=pred, formatted_doc=formatted_doc)) return results, outputs @@ -153,10 +153,7 @@ def apply_llm_as_judge_metric(results: list[ModelReturn], formatted_doc: Doc, me predictions = results.pop(0).result for metric in metrics: - if ( - Metrics[metric].value.category == MetricCategory.LLM_AS_JUDGE_MULTI_TURN - or Metrics[metric].value.category == MetricCategory.LLM_AS_JUDGE - ): + if Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE_MULTI_TURN, MetricCategory.LLM_AS_JUDGE]: outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 4a0e367da..ef2e3ee0d 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -41,6 +41,7 @@ F1_score, JudgeLLM, LoglikelihoodAcc, + MajAtK, Recall, StringDistance, acc_golds_likelihood, @@ -326,6 +327,22 @@ class Metrics(Enum): corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) + maj_at_5 = SampleLevelMetric( + metric="maj@5", + sample_level_fn=MajAtK(k=5).compute, + 
category=MetricCategory.GENERATIVE_, + use_case=MetricUseCase.ACCURACY, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + maj_at_8 = SampleLevelMetric( + metric="maj@8", + sample_level_fn=MajAtK(k=8).compute, + category=MetricCategory.GENERATIVE_, + use_case=MetricUseCase.ACCURACY, + corpus_level_fn=np.mean, + higher_is_better=True, + ) mrr = SampleLevelMetric( metric="mrr", sample_level_fn=MRR().compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index a3809adb3..2d17c4af5 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -675,3 +675,90 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[ "user_prompt": messages[0], "judgement": judgements[0], } + + +class MajAtK: + def __init__( + self, + k: int, + normalize_gold: callable = None, + normalize_pred: callable = None, + strip_strings: bool = False, + type_exact_match: str = "full", + ): + """An exact match class. + + Args: + normalize_gold (callable, optional): Function to use to normalize the reference strings. + Defaults to None if no normalization is applied. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. + Defaults to None if no normalization is applied. + strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. + type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). + Can be any of `prefix`, `suffix` or `full`. Defaults to "full". + `prefix` checks if the prediction starts with the gold, + `suffix` if the prediction ends with the gold, + `full` if the prediction and gold are equal + """ + self.k = k + self.normalize_gold = normalize_gold + self.normalize_pred = normalize_pred + self.strip_strings = strip_strings + + if type_exact_match not in ["prefix", "suffix", "full"]: + # todo: we could add a set exact match + raise ValueError( + f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {type_exact_match} instead." + ) + self.type_exact_match = type_exact_match + + def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: + """Computes the metric over a list of golds and predictions for one single sample. + + Args: + golds (list[str]): Reference targets + predictions (list[str]): Predicted strings + + Returns: + float: Aggregated score over the current sample's items. + """ + if len(golds) > 0: + raise Exception("Cannot compute maj@k with several golds") + + gold = golds[0] + all_answers = [] + for pred in predictions[: self.k]: + all_answers.append(self.compute_one_item(gold=gold, pred=pred)) + return 1 if sum(all_answers) / len(all_answers) >= 0.5 else 0 + + def compute_one_item( + self, + gold: str, + pred: str, + ) -> float: + """Compares two strings only. + + Args: + gold (str): One of the possible references + pred (str): One of the possible predictions + + Returns: + float: The exact match score. Will be 1 for a match, 0 otherwise. 
+ """ + if not pred: + return 0 + + if self.strip_strings: + gold = gold.strip() + pred = pred.strip() + + if self.normalize_gold: + gold = self.normalize_gold(gold) + if self.normalize_pred: + pred = self.normalize_pred(pred) + + if self.type_exact_match == "prefix": + return 1 if pred.startswith(gold) else 0 + if self.type_exact_match == "suffix": + return 1 if pred.endswith(gold) else 0 + return 1 if gold == pred else 0 diff --git a/src/lighteval/metrics/utils.py b/src/lighteval/metrics/utils.py index 6c79871e2..e5ceaeb0b 100644 --- a/src/lighteval/metrics/utils.py +++ b/src/lighteval/metrics/utils.py @@ -28,9 +28,10 @@ class MetricCategory(Enum): TARGET_PERPLEXITY = auto() PERPLEXITY = auto() GENERATIVE = auto() + GENERATIVE_LOGPROB = auto() + GENERATIVE_SAMPLING = auto() LLM_AS_JUDGE_MULTI_TURN = auto() LLM_AS_JUDGE = auto() - GENERATIVE_LOGPROB = auto() MULTICHOICE = auto() MULTICHOICE_ONE_TOKEN = auto() IGNORED = auto() diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 5dbaa750a..d31c9ebda 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -47,6 +47,7 @@ GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, + GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -336,7 +337,7 @@ def greedy_until_with_logits( returning both the generated sequences and the logits. Args: - requests (list[tuple[str, dict]]): A list of input requests, + requests (list[GreedyUntilWithLogitsRequest]): A list of input requests, where each request is a tuple containing a prompt string and a dictionary of additional parameters. override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. @@ -352,6 +353,34 @@ def greedy_until_with_logits( override_bs=override_bs, ) + def greedy_until_with_sampling( + self, + requests: list[GreedyUntilWithSamplingRequest], + num_samples: int, + override_bs: Optional[int] = None, + ) -> list[GenerateReturn]: + """ + Generates sequences greedily until a stopping condition is met, + returning both the generated sequences and the logits. + + Args: + requests (list[GreedyUntilWithSamplingRequest]): A list of input requests, + where each request is a tuple containing a prompt string and a dictionary of additional parameters. + override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. + + Returns: + list[GenerateReturn]: A list of GenerateReturn objects, + where each object contains the generated sequence and the corresponding logits. + """ + + return self.greedy_until( + requests, + returns_logits=False, + disable_tqdm=self.disable_tqdm, + override_bs=override_bs, + num_samples=num_samples, + ) + def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -488,6 +517,7 @@ def greedy_until( requests: list[GreedyUntilRequest], returns_logits: bool = False, override_bs: Optional[int] = None, + num_samples: Optional[int] = None, ) -> list[GenerateReturn]: """ Generates responses using a greedy decoding strategy until certain ending conditions are met. 
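The hunks below thread `num_samples` down into `_generate`, which relies on `transformers`' `generate` returning `num_return_sequences` sampled continuations per prompt, flattened along the batch dimension. A minimal standalone sketch of that mechanism (the model name, prompt and k are placeholders for illustration; this is not lighteval code):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Question: 2 + 2 = ?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt")
k = 4  # hypothetical number of samples per prompt

outputs = model.generate(
    **inputs,
    do_sample=True,            # sample instead of decoding greedily
    num_return_sequences=k,    # k continuations per input row
    max_new_tokens=16,
    pad_token_id=tokenizer.eos_token_id,
    return_dict_in_generate=True,
)
# sequences has shape (batch_size * k, prompt_len + new_tokens): the k samples
# are flattened along the batch dimension, so they have to be sliced past the
# prompt and regrouped per example (a later patch does this with torch.reshape).
generations = outputs.sequences[:, inputs["input_ids"].shape[1]:]
generations = generations.reshape(1, k, -1)
print(tokenizer.batch_decode(generations[0]))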
@@ -596,6 +626,7 @@ def greedy_until( max_new_tokens=max_new_tokens, stop_tokens=stop_tokens, returns_logits=returns_logits, + num_samples=num_samples, ) results.extend(cur_reponses) @@ -607,6 +638,7 @@ def _generate( max_new_tokens: int, stop_tokens: list[str], returns_logits: Optional[bool] = False, + num_samples: Optional[int] = None, ) -> list[GenerateReturn]: """Contains the actual logic of the generation. First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. @@ -619,11 +651,12 @@ def _generate( attention_mask=batch.input_mask, max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria, - do_sample=False, pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, return_dict_in_generate=True, output_scores=True, eos_token_id=self.tokenizer.eos_token_id, + do_sample=num_samples is not None, + num_return_sequences=num_samples, ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 64ba9f39f..4ab6c7a63 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -48,6 +48,7 @@ GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, + GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -101,6 +102,7 @@ class LightevalTaskConfig: generation_size: int = None stop_sequence: Optional[Tuple[str]] = None output_regex: Optional[str] = None + num_samples: Optional[list[int]] = None frozen: bool = False suite: Optional[Tuple[str]] = None @@ -201,6 +203,8 @@ def __init__( # noqa: C901 hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} + # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example + self.num_samples = [int(metric.split("_")[-1]) for metric in self.metrics if "maj_at_" in metric] # Data processing # to use once prompt formatting is managed as a module @@ -394,7 +398,7 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] # Requests - def get_request_type(self) -> list[RequestType]: + def get_request_type(self) -> list[RequestType]: # noqa C901 """ Returns the request types for the task. 
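The `num_samples` parsing above, `int(metric.split("_")[-1])`, only works while the metric name ends with the number; a later patch in this series reworks it once suffixed names such as `maj_at_8_gsm8k` appear. An illustrative alternative using an anchored regex (a sketch only, not the parsing the series actually adopts):

import re

metrics = ["quasi_exact_match_gsm8k", "maj_at_8_gsm8k", "maj_at_4_math"]

# int(name.split("_")[-1]) raises on "maj_at_8_gsm8k" because "gsm8k" is not an
# integer; matching the "maj_at_<k>" prefix extracts k regardless of any suffix.
num_samples = [int(m.group(1)) for m in (re.match(r"maj_at_(\d+)", name) for name in metrics) if m]
print(num_samples)  # [8, 4]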
@@ -418,6 +422,8 @@ def get_request_type(self) -> list[RequestType]: request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: request_types.append(RequestType.GREEDY_UNTIL_WITH_LOGITS) + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + request_types.append(RequestType.GREEDY_UNTIL_WITH_SAMPLING) if self.has_metric_category[MetricCategory.MULTICHOICE]: request_types.append(RequestType.LOGLIKELIHOOD) if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: @@ -474,6 +480,18 @@ def construct_requests( generation_size=self.generation_size, ) ] + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + requests[RequestType.GREEDY_UNTIL_WITH_SAMPLING] += [ + GreedyUntilWithSamplingRequest( + task_name=current_task_name, + example_index=document_id_seed, + request_index=0, + context=context, + stop_sequence=self.stop_sequence, + generation_size=self.generation_size, + num_samples=max(self.num_samples), # If we have several samplings to apply, we use the max + ) + ] if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: requests[RequestType.GREEDY_UNTIL_WITH_LOGITS] += [ GreedyUntilWithLogitsRequest( @@ -543,7 +561,10 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + ): results, cur_outputs = apply_generative_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex ) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index c4c863359..ead236c32 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -35,6 +35,7 @@ class RequestType(Enum): GREEDY_UNTIL = auto() GREEDY_UNTIL_MULTI_TURN = auto() GREEDY_UNTIL_WITH_LOGITS = auto() + GREEDY_UNTIL_WITH_SAMPLING = auto() @dataclass @@ -155,6 +156,25 @@ class GreedyUntilWithLogitsRequest(Request): tokenized_context: list[int] = None +@dataclass +class GreedyUntilWithSamplingRequest(Request): + """ + Represents a request for generating text using the Greedy-Until strategy but + returning the logits. + + Attributes: + stop_sequence (str): The sequence of tokens that indicates when to stop generating text. + generation_size (int): The maximum number of tokens to generate. + request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). + """ + + stop_sequence: Union[str, tuple[str], list[str]] + generation_size: int + request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLES + tokenized_context: list[int] = None + num_samples: int = None + + class TaskExampleId(NamedTuple): """ Represents the identifier for an example in a task. 
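Patch 01 wires the new GENERATIVE_SAMPLING path end to end: a GreedyUntilWithSamplingRequest carrying num_samples, a sampling branch in the evaluator and base model, and a MajAtK metric over the sampled generations. A standalone sketch of the maj@k idea itself, in the majority-vote form the later patches in this series converge on (not the lighteval class):

from collections import Counter

def maj_at_k(gold: str, predictions: list[str], k: int) -> int:
    """Strip the first k sampled answers, take the most frequent one,
    and score that single answer against the gold with exact match."""
    answers = [p.strip() for p in predictions[:k]]
    majority, _ = Counter(answers).most_common(1)[0]
    return int(majority == gold.strip())

print(maj_at_k("42", ["42", "41", " 42", "forty-two"], k=4))  # 1: "42" wins the vote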
From cfdc6ee59a6f1a11ecfbce8bb51091eb8dc42358 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 16 Apr 2024 14:12:04 +0000 Subject: [PATCH 02/13] wip --- src/lighteval/evaluator.py | 2 +- src/lighteval/metrics/metrics.py | 14 ++++++++++++-- src/lighteval/models/base_model.py | 6 +++--- src/lighteval/tasks/lighteval_task.py | 4 +++- src/lighteval/tasks/requests.py | 2 +- src/lighteval/tasks/tasks_table.jsonl | 2 +- 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index 5fa511edf..c4ccc2e7d 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -89,7 +89,7 @@ def evaluate( # noqa: C901 elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: - full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) + full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) # , num_samples=self) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index ef2e3ee0d..d1d85a039 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -330,7 +330,7 @@ class Metrics(Enum): maj_at_5 = SampleLevelMetric( metric="maj@5", sample_level_fn=MajAtK(k=5).compute, - category=MetricCategory.GENERATIVE_, + category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.ACCURACY, corpus_level_fn=np.mean, higher_is_better=True, @@ -338,11 +338,21 @@ class Metrics(Enum): maj_at_8 = SampleLevelMetric( metric="maj@8", sample_level_fn=MajAtK(k=8).compute, - category=MetricCategory.GENERATIVE_, + category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.ACCURACY, corpus_level_fn=np.mean, higher_is_better=True, ) + maj_at_8_gsm8k = SampleLevelMetric( + metric="qem", + sample_level_fn=MajAtK( + k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.MATH, + corpus_level_fn=np.mean, + higher_is_better=True, + ) mrr = SampleLevelMetric( metric="mrr", sample_level_fn=MRR().compute, diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index d31c9ebda..c51b6b0d9 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -517,7 +517,7 @@ def greedy_until( requests: list[GreedyUntilRequest], returns_logits: bool = False, override_bs: Optional[int] = None, - num_samples: Optional[int] = None, + num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """ Generates responses using a greedy decoding strategy until certain ending conditions are met. @@ -638,7 +638,7 @@ def _generate( max_new_tokens: int, stop_tokens: list[str], returns_logits: Optional[bool] = False, - num_samples: Optional[int] = None, + num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """Contains the actual logic of the generation. First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. 
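The next hunk switches `do_sample` from `num_samples is not None` to `num_samples > 1`, so the new default of a single sample keeps ordinary generative tasks on deterministic greedy decoding. A small sketch of the resulting dispatch (simplified, not the actual `_generate` signature):

def generation_flags(num_samples: int = 1) -> dict:
    # Sample only when more than one return sequence is requested;
    # otherwise stay greedy so single-generation metrics are unaffected.
    return {"do_sample": num_samples > 1, "num_return_sequences": num_samples}

print(generation_flags())   # {'do_sample': False, 'num_return_sequences': 1}
print(generation_flags(8))  # {'do_sample': True, 'num_return_sequences': 8}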
@@ -655,7 +655,7 @@ def _generate( return_dict_in_generate=True, output_scores=True, eos_token_id=self.tokenizer.eos_token_id, - do_sample=num_samples is not None, + do_sample=num_samples > 1, num_return_sequences=num_samples, ) if returns_logits: diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 4ab6c7a63..015fe71a4 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -204,7 +204,9 @@ def __init__( # noqa: C901 current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example - self.num_samples = [int(metric.split("_")[-1]) for metric in self.metrics if "maj_at_" in metric] + self.num_samples = [ + int(metric.replace("maj_at_", "").split("_")[0]) for metric in self.metrics if "maj_at_" in metric + ] # Data processing # to use once prompt formatting is managed as a module diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index ead236c32..de9413b7f 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -170,7 +170,7 @@ class GreedyUntilWithSamplingRequest(Request): stop_sequence: Union[str, tuple[str], list[str]] generation_size: int - request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLES + request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLING tokenized_context: list[int] = None num_samples: int = None diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index 12e70f38e..cd7f7c8ed 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -442,7 +442,7 @@ {"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} From 98c1c12a3a7844a9ed1a12922af36004cd307922 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 17 Apr 2024 17:02:19 +0000 Subject: [PATCH 03/13] testing how to pad and gather with an added dimension for the num_samples --- src/lighteval/evaluator.py | 2 +- src/lighteval/metrics/metrics_sample.py | 2 +- src/lighteval/models/base_model.py | 74 +++++++++++-------------- src/lighteval/tasks/requests.py | 12 +--- 4 files changed, 37 insertions(+), 53 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index c4ccc2e7d..cfbfc7d0f 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -89,7 +89,7 @@ def evaluate( # noqa: C901 elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: - full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) # , num_samples=self) + full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 2d17c4af5..d55b7f83f 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -722,7 +722,7 @@ def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, Returns: float: Aggregated score over the current sample's items. 
""" - if len(golds) > 0: + if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") gold = golds[0] diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index c51b6b0d9..7cea60474 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -353,34 +353,6 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def greedy_until_with_sampling( - self, - requests: list[GreedyUntilWithSamplingRequest], - num_samples: int, - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[GreedyUntilWithSamplingRequest]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=False, - disable_tqdm=self.disable_tqdm, - override_bs=override_bs, - num_samples=num_samples, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -517,7 +489,6 @@ def greedy_until( requests: list[GreedyUntilRequest], returns_logits: bool = False, override_bs: Optional[int] = None, - num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """ Generates responses using a greedy decoding strategy until certain ending conditions are met. @@ -576,6 +547,7 @@ def greedy_until( # the case! Because of that we only use batch size of 1 stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size + num_samples = batch[0].num_samples if isinstance(batch[0], GreedyUntilWithSamplingRequest) else 1 # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk @@ -644,6 +616,7 @@ def _generate( First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. """ stopping_criteria = stop_sequences_criteria(self.tokenizer, stop_sequences=stop_tokens, batch=batch) + batch_size, _ = batch.input_ids.shape # Compute model generation outputs = self.model.generate( @@ -661,7 +634,8 @@ def _generate( if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) generations = outputs.sequences[:, batch.input_ids.size(1) :] - generations, len_gens = self.pad_and_gather(generations) + generations = torch.reshape(generations, (batch_size, num_samples, -1)) + generations, len_gens = self.pad_and_gather(generations, num_samples=num_samples) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) logits, len_logits = None, None @@ -679,20 +653,30 @@ def _generate( # We convert to GenerateReturn outputs all_responses = [] - for ix, (generation, batched_input, trunc, padded) in enumerate( + for ix, (batched_generations, batched_input, trunc, padded) in enumerate( zip(generations, batch.input_ids, batch.truncated, batch.padded) ): + result_generations = [] + decoded_generations = [] # Ensure the generated responses do not contain the stop sequences. 
- generation = generation[: len_gens[ix]] - decoded_generation = self.tok_decode([generation])[0] + for generation in batched_generations: + generation = generation[: len_gens[ix]] + result_generations.append(generation) + decoded_generation = self.tok_decode([generation])[0] - for term in stop_tokens: - decoded_generation = decoded_generation.split(term)[0] + for term in stop_tokens: + decoded_generation = decoded_generation.split(term)[0] + + decoded_generations.append(decoded_generation) + + if num_samples == 1: # We only return one item + result_generations = result_generations[0] + decoded_generations = decoded_generations[0] cur_response = GenerateReturn( - result=decoded_generation, + result=decoded_generations, logits=logits[ix][: len_logits[ix]] if returns_logits else None, - generated_tokens=generation, + generated_tokens=result_generations, input_tokens=batched_input[: len_ids[ix]], truncated_tokens_count=trunc.cpu().item(), padded_tokens_count=padded.cpu().item(), @@ -924,7 +908,9 @@ def prepare_batch_logprob( padded=padded, ) - def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = True) -> torch.Tensor: + def pad_and_gather( + self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = 1 + ) -> torch.Tensor: """ Pads the `output_tensor` to the maximum length and gathers the lengths across processes. @@ -938,15 +924,21 @@ def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = torch.Tensor: The padded output tensor and the gathered length tensor. """ # Create a tensor of size batch_size, [output_length] * batch_size, for each process - length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) + # length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) + length_tensor = torch.zeros( + [output_tensor.shape[-1]] * num_samples * output_tensor.shape[0], device=self.device + ) if self.accelerator is not None: # Gather all the lengths, we end up with a tensor of size num_processes [output_length_1, output_length_2, ...] length_tensor = self.accelerator.gather(length_tensor) # We pad the output_tensor to the max length max_length = length_tensor.max().item() - output_tensor = F.pad( - output_tensor, (0, max_length - output_tensor.shape[1], 0, 0), value=self.tokenizer.pad_token_id + padding = ( + (0, max_length - output_tensor.shape[1], 0, 0, 0, 0) + if num_samples > 1 + else (0, max_length - output_tensor.shape[1], 0, 0) ) + output_tensor = F.pad(output_tensor, padding, value=self.tokenizer.pad_token_id) if self.accelerator: if drop_last_samples: output_tensor = self.accelerator.gather_for_metrics(output_tensor) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index de9413b7f..952fd430d 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -139,25 +139,20 @@ class GreedyUntilMultiTurnRequest(Request): @dataclass -class GreedyUntilWithLogitsRequest(Request): +class GreedyUntilWithLogitsRequest(GreedyUntilRequest): """ Represents a request for generating text using the Greedy-Until strategy but returning the logits. Attributes: - stop_sequence (str): The sequence of tokens that indicates when to stop generating text. - generation_size (int): The maximum number of tokens to generate. request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). 
""" - stop_sequence: Union[str, tuple[str], list[str]] - generation_size: int request_type = RequestType.GREEDY_UNTIL_WITH_LOGITS - tokenized_context: list[int] = None @dataclass -class GreedyUntilWithSamplingRequest(Request): +class GreedyUntilWithSamplingRequest(GreedyUntilRequest): """ Represents a request for generating text using the Greedy-Until strategy but returning the logits. @@ -168,10 +163,7 @@ class GreedyUntilWithSamplingRequest(Request): request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). """ - stop_sequence: Union[str, tuple[str], list[str]] - generation_size: int request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLING - tokenized_context: list[int] = None num_samples: int = None From 1549fda239dba7402dafa1ebda18c55a09f7d0d8 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 19 Apr 2024 12:42:07 +0000 Subject: [PATCH 04/13] now working, need to check why the metric is not displayed --- src/lighteval/logging/info_loggers.py | 5 ++++- src/lighteval/models/base_model.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 83fe981e0..c211d2e4f 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -350,7 +350,10 @@ def log( ): pred_saved = True pass # should we log something? - if task.has_metric_category[MetricCategory.GENERATIVE]: + if ( + task.has_metric_category[MetricCategory.GENERATIVE] + or task.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + ): detail.gold = doc.get_golds() pred_saved = True if task.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 7cea60474..d9acf1a47 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -909,7 +909,7 @@ def prepare_batch_logprob( ) def pad_and_gather( - self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = 1 + self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = None ) -> torch.Tensor: """ Pads the `output_tensor` to the maximum length and gathers the lengths across processes. @@ -924,19 +924,17 @@ def pad_and_gather( torch.Tensor: The padded output tensor and the gathered length tensor. """ # Create a tensor of size batch_size, [output_length] * batch_size, for each process - # length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) - length_tensor = torch.zeros( - [output_tensor.shape[-1]] * num_samples * output_tensor.shape[0], device=self.device - ) + # output_tensor can be of size: batch_size * num_samples * length_item or just batch_size * length_item + length_tensor = torch.tensor([output_tensor.shape[-1]] * output_tensor.shape[0], device=self.device) if self.accelerator is not None: # Gather all the lengths, we end up with a tensor of size num_processes [output_length_1, output_length_2, ...] 
length_tensor = self.accelerator.gather(length_tensor) # We pad the output_tensor to the max length max_length = length_tensor.max().item() padding = ( - (0, max_length - output_tensor.shape[1], 0, 0, 0, 0) - if num_samples > 1 - else (0, max_length - output_tensor.shape[1], 0, 0) + (0, max_length - output_tensor.shape[-1], 0, 0, 0, 0) + if num_samples is not None + else (0, max_length - output_tensor.shape[-1], 0, 0) ) output_tensor = F.pad(output_tensor, padding, value=self.tokenizer.pad_token_id) if self.accelerator: From 8adfc0772e184555aa5ce3095102fd1fde5d3012 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 19 Apr 2024 14:00:44 +0000 Subject: [PATCH 05/13] seems to be working! --- src/lighteval/metrics/__init__.py | 50 ++++++++++++++++++++----- src/lighteval/metrics/metrics.py | 2 +- src/lighteval/metrics/metrics_sample.py | 35 ++++++++--------- src/lighteval/tasks/lighteval_task.py | 12 ++++-- 4 files changed, 66 insertions(+), 33 deletions(-) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 68727b8e9..2458496b0 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -70,12 +70,15 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr outputs = {} # Post processing prediction - pred_raw = results.pop(0).result - if output_regex is not None: - pred = next(iter(re.findall(output_regex, pred_raw)), "") - else: - pred = pred_raw - pred = as_list(pred) + preds_raw = as_list(results.pop(0).result) + preds = [] + + for pred_raw in preds_raw: + if output_regex is not None: + pred = next(iter(re.findall(output_regex, pred_raw)), "") + else: + pred = pred_raw + preds.append(pred) # Extracting gold try: @@ -87,12 +90,41 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr # if "label_to_choices" in formatted_doc: if formatted_doc.specific is not None and "label_to_choices" in formatted_doc.specific: # Helm predicts on labels keys (A/B/C/D), but computes metrics on choices - pred = [formatted_doc.specific["label_to_choices"].get(p) for p in pred] + preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] for metric in metrics: - if Metrics[metric].value.category in [MetricCategory.GENERATIVE, MetricCategory.GENERATIVE_SAMPLING]: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=pred, formatted_doc=formatted_doc)) + if Metrics[metric].value.category == MetricCategory.GENERATIVE: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) + + return results, outputs + + +def apply_generative_sampling_metric( + results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None +): + outputs = {} + + # Post processing prediction + preds_raw = as_list(results.pop(0).result) + preds = [] + + for pred_raw in preds_raw: + if output_regex is not None: + pred = next(iter(re.findall(output_regex, pred_raw)), "") + else: + pred = pred_raw + preds.append(pred) + + # Extracting gold + try: + golds = formatted_doc.get_golds() + except (KeyError, IndexError): + golds = None + + for metric in metrics: + if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/metrics/metrics.py 
b/src/lighteval/metrics/metrics.py index d1d85a039..f9430afdf 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -344,7 +344,7 @@ class Metrics(Enum): higher_is_better=True, ) maj_at_8_gsm8k = SampleLevelMetric( - metric="qem", + metric="maj@8", sample_level_fn=MajAtK( k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer ).compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index d55b7f83f..5b286afba 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -725,38 +725,35 @@ def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") - gold = golds[0] + gold = self.get_processed_gold(golds[0]) all_answers = [] for pred in predictions[: self.k]: - all_answers.append(self.compute_one_item(gold=gold, pred=pred)) - return 1 if sum(all_answers) / len(all_answers) >= 0.5 else 0 + all_answers.append(self.get_processed_pred(pred=pred)) + majority_prediction = max(all_answers, key=all_answers.count) + return self.compute_score(majority_prediction, gold) - def compute_one_item( - self, - gold: str, - pred: str, - ) -> float: - """Compares two strings only. + def get_processed_gold(self, gold: str) -> float: + if self.strip_strings: + gold = gold.strip() - Args: - gold (str): One of the possible references - pred (str): One of the possible predictions + if self.normalize_gold: + gold = self.normalize_gold(gold) - Returns: - float: The exact match score. Will be 1 for a match, 0 otherwise. - """ + return gold + + def get_processed_pred(self, pred: str) -> float: if not pred: - return 0 + return "" if self.strip_strings: - gold = gold.strip() pred = pred.strip() - if self.normalize_gold: - gold = self.normalize_gold(gold) if self.normalize_pred: pred = self.normalize_pred(pred) + return pred + + def compute_score(self, pred: str, gold: str): if self.type_exact_match == "prefix": return 1 if pred.startswith(gold) else 0 if self.type_exact_match == "suffix": diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 015fe71a4..832f5367a 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -34,6 +34,7 @@ from lighteval.metrics import ( apply_generative_logprob_metric, apply_generative_metric, + apply_generative_sampling_metric, apply_llm_as_judge_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -563,14 +564,17 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if ( - self.has_metric_category[MetricCategory.GENERATIVE] - or self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] - ): + if self.has_metric_category[MetricCategory.GENERATIVE]: results, cur_outputs = apply_generative_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex ) outputs.update(cur_outputs) + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + results, cur_outputs = apply_generative_sampling_metric( + results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex + ) + outputs.update(cur_outputs) + if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: results, cur_outputs = apply_generative_logprob_metric( 
results=results, formatted_doc=formatted_doc, metrics=self.metrics From 24d46922a5acc8e9c23e08cbc02a202774dee2ac Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 19 Apr 2024 14:15:35 +0000 Subject: [PATCH 06/13] add maj at 4 for math with preprocessing --- src/lighteval/metrics/metrics.py | 10 ++++++++++ src/lighteval/tasks/tasks_table.jsonl | 14 +++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index f9430afdf..07d5c918e 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -327,6 +327,16 @@ class Metrics(Enum): corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) + maj_at_4_math = SampleLevelMetric( + metric="maj@4", + sample_level_fn=MajAtK( + k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer_gold + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.MATH, + corpus_level_fn=np.mean, + higher_is_better=True, + ) maj_at_5 = SampleLevelMetric( metric="maj@5", sample_level_fn=MajAtK(k=5).compute, diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index cd7f7c8ed..ecaeff048 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -540,13 +540,13 @@ {"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} From 3a1c1c4d32d9ab744e8b6c75ea31623b2a3e3d63 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 09:38:05 +0000 Subject: [PATCH 07/13] Uses a homogeneized system for all greedy evaluations - we can do evals in 
one single step --- src/lighteval/data.py | 3 +- src/lighteval/evaluator.py | 4 -- src/lighteval/metrics/__init__.py | 40 +------------ src/lighteval/models/abstract_model.py | 28 ---------- src/lighteval/models/base_model.py | 35 +----------- src/lighteval/models/endpoint_model.py | 33 ++--------- src/lighteval/models/nanotron_model.py | 17 +----- src/lighteval/tasks/lighteval_task.py | 77 +++++++++----------------- src/lighteval/tasks/requests.py | 33 +---------- tests/test_unit_harness_metrics.py | 12 ++-- 10 files changed, 44 insertions(+), 238 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 711b0749b..247cff042 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -29,7 +29,6 @@ from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import ( GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -205,7 +204,7 @@ def _sorting_criteria(self, request: LoglikelihoodSingleTokenRequest) -> int: class GenerativeTaskDataset(DynamicBatchDataset): - def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsRequest) -> int: + def _sorting_criteria(self, request: GreedyUntilRequest) -> int: """ Collate function for generating batches. diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index cfbfc7d0f..e837b9225 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -86,10 +86,6 @@ def evaluate( # noqa: C901 full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL: full_resps = lm.greedy_until(requests, override_bs=override_bs) - elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: - full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) - elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: - full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 2458496b0..7577c7713 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -96,46 +96,10 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) - - return results, outputs - - -def apply_generative_sampling_metric( - results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None -): - outputs = {} - - # Post processing prediction - preds_raw = as_list(results.pop(0).result) - preds = [] - - for pred_raw in preds_raw: - if output_regex is not None: - pred = next(iter(re.findall(output_regex, pred_raw)), "") - else: - pred = pred_raw - preds.append(pred) - - # Extracting gold - try: - golds = formatted_doc.get_golds() - except (KeyError, IndexError): - golds = None - - for metric in metrics: - if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) - - return results, outputs - - -def apply_generative_logprob_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): - # Applied to no metric 
atm, but we have the model side logic - outputs = {} - - for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: outputs.update(Metrics[metric].value.compute(results=results, formatted_doc=formatted_doc)) + if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/models/abstract_model.py b/src/lighteval/models/abstract_model.py index ccc49146c..754a6144a 100644 --- a/src/lighteval/models/abstract_model.py +++ b/src/lighteval/models/abstract_model.py @@ -36,7 +36,6 @@ from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -83,31 +82,6 @@ def max_length(self) -> int: def disable_tqdm(self) -> bool: raise NotImplementedError - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - disable_tqdm (bool, optional): Whether to disable the tqdm progress bar. Defaults to False. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - return self.greedy_until( - requests=requests, - override_bs=override_bs, - returns_logits=True, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -118,7 +92,6 @@ def greedy_until_multi_turn( # noqa: C901 def greedy_until( self, requests: list[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> list[GenerateReturn]: """ @@ -126,7 +99,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - returns_logits (bool, optional): Whether to return the logits of the generated responses. Defaults to False. disable_tqdm (bool, optional): Whether to disable the progress bar. Defaults to False. override_bs (int, optional): Override the batch size for generation. Defaults to None. 
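Patch 07 collapses the separate greedy request types into one: rather than dispatching on GreedyUntilWithLogitsRequest and GreedyUntilWithSamplingRequest, the single greedy request now carries the options itself, and greedy_until reads use_logits and num_samples per batch, as the base_model.py hunk below shows. A hedged sketch of that unified shape (the class name is hypothetical; use_logits and num_samples mirror the attributes read from batch[0] below, and the remaining fields mirror the request dataclasses shown earlier in this series):

from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class UnifiedGreedyUntilRequest:  # illustrative stand-in for the merged request
    context: str
    stop_sequence: Union[str, tuple, list]
    generation_size: int
    use_logits: bool = False   # subsumes the former *_WITH_LOGITS request type
    num_samples: int = 1       # subsumes the former *_WITH_SAMPLING request type
    tokenized_context: Optional[list] = None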
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index d9acf1a47..7d9bd8d2b 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -46,8 +46,6 @@ from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, - GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -327,32 +325,6 @@ def forward_batch(batch_size): hlog(f"Determined largest batch size: {batch_size}") return batch_size - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[GreedyUntilWithLogitsRequest]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=True, - disable_tqdm=self.disable_tqdm, - override_bs=override_bs, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -487,7 +459,6 @@ def greedy_until_multi_turn( # noqa: C901 def greedy_until( self, requests: list[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> list[GenerateReturn]: """ @@ -495,7 +466,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - returns_logits (bool, optional): Whether to return the logits of the generated responses. Defaults to False. override_bs (int, optional): Override the batch size for generation. Defaults to None. Returns: @@ -543,11 +513,12 @@ def greedy_until( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens genrated) which is not necessarily + # stop_tokens and max_tokens generated) which is not necessarily # the case! 
Because of that we only use batch size of 1 stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size - num_samples = batch[0].num_samples if isinstance(batch[0], GreedyUntilWithSamplingRequest) else 1 + returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index b118a93b8..03e184bc3 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -44,7 +44,6 @@ from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn from lighteval.tasks.requests import ( GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -182,7 +181,7 @@ def __process_request(self, context: str, stop_tokens: list[str], max_tokens: in async def __async_process_batch_generate( self, - requests: list[GreedyUntilRequest | GreedyUntilWithLogitsRequest], + requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return await asyncio.gather( *[ @@ -197,7 +196,7 @@ async def __async_process_batch_generate( def __process_batch_generate( self, - requests: list[GreedyUntilRequest | GreedyUntilWithLogitsRequest], + requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return [ self.__process_request( @@ -234,35 +233,9 @@ def __process_batch_logprob( for request in requests ] - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=True, - override_bs=override_bs, - ) - def greedy_until( self, requests: List[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> List[GenerateReturn]: for request in requests: @@ -286,6 +259,8 @@ def greedy_until( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): # the `returns_logits` flag is only used to filter the results, we always request the full details. 
+ returns_logits = batch[0].use_logits + if self.use_async: responses = asyncio.run(self.__async_process_batch_generate(batch)) else: diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 69ad420f1..eecd18fb1 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -351,21 +351,6 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]: def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: return self.model(inputs) - def greedy_until_with_logits( - self, - requests: list[tuple[str, dict]], - disable_tqdm: bool = False, - override_bs=None, - dataset_splits: int = 4, - ) -> list[GenerateReturn]: - return self.greedy_until( - requests, - returns_logits=True, - disable_tqdm=disable_tqdm, - override_bs=override_bs, - dataset_splits=dataset_splits, - ) - def _encode_pair(self, context, continuation): n_spaces = len(context) - len(context.rstrip()) if n_spaces > 0: @@ -1130,7 +1115,6 @@ def _loglikelihood_tokens( def greedy_until( self, requests: List[GreedyUntilRequest], - returns_logits=False, disable_tqdm: bool = False, override_bs=None, dataset_splits: int = 1, @@ -1216,6 +1200,7 @@ def greedy_until( # the maximum allowed generation size for the batch, unless we want to force truncation # need to pass them somewhere ! stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size + returns_logits = batch[0].use_logits # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 832f5367a..cef8d5d0d 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -32,9 +32,7 @@ from lighteval.few_shot_manager import FewShotSampler from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics import ( - apply_generative_logprob_metric, apply_generative_metric, - apply_generative_sampling_metric, apply_llm_as_judge_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -48,8 +46,6 @@ Doc, GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, - GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -205,7 +201,8 @@ def __init__( # noqa: C901 current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example - self.num_samples = [ + # We assume num_samples always contains 1 (for base generative evals) + self.num_samples = [1] + [ int(metric.replace("maj_at_", "").split("_")[0]) for metric in self.metrics if "maj_at_" in metric ] @@ -415,27 +412,27 @@ def get_request_type(self) -> list[RequestType]: # noqa C901 request_types = [] if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD) + if self.has_metric_category[MetricCategory.MULTICHOICE]: + request_types.append(RequestType.LOGLIKELIHOOD) + if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: + request_types.append(RequestType.LOGLIKELIHOOD_SINGLE_TOKEN) if self.has_metric_category[MetricCategory.PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD_ROLLING) if 
self.has_metric_category[MetricCategory.GENERATIVE]: request_types.append(RequestType.GREEDY_UNTIL) - if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]: - request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) - if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]: - request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - request_types.append(RequestType.GREEDY_UNTIL_WITH_LOGITS) + request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: - request_types.append(RequestType.GREEDY_UNTIL_WITH_SAMPLING) - if self.has_metric_category[MetricCategory.MULTICHOICE]: - request_types.append(RequestType.LOGLIKELIHOOD) - if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: - request_types.append(RequestType.LOGLIKELIHOOD_SINGLE_TOKEN) + request_types.append(RequestType.GREEDY_UNTIL) + if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]: + request_types.append(RequestType.GREEDY_UNTIL) + if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]: + request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) if len(request_types) == 0: raise NotImplementedError(f"Request type not implemented for task {self.name}") - return request_types + return list(set(request_types)) def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str @@ -472,7 +469,13 @@ def construct_requests( task_name=current_task_name, example_index=document_id_seed, request_index=0, context=context ) ] - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + or self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] + ): + # All these tasks require the same generation process - we can do them in one step + use_logits = self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] requests[RequestType.GREEDY_UNTIL] += [ GreedyUntilRequest( task_name=current_task_name, @@ -481,29 +484,8 @@ def construct_requests( context=context, stop_sequence=self.stop_sequence, generation_size=self.generation_size, - ) - ] - if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: - requests[RequestType.GREEDY_UNTIL_WITH_SAMPLING] += [ - GreedyUntilWithSamplingRequest( - task_name=current_task_name, - example_index=document_id_seed, - request_index=0, - context=context, - stop_sequence=self.stop_sequence, - generation_size=self.generation_size, num_samples=max(self.num_samples), # If we have several samplings to apply, we use the max - ) - ] - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - requests[RequestType.GREEDY_UNTIL_WITH_LOGITS] += [ - GreedyUntilWithLogitsRequest( - task_name=current_task_name, - example_index=document_id_seed, - request_index=0, - context=context, - stop_sequence=self.stop_sequence, - generation_size=self.generation_size, + use_logits=use_logits, ) ] if self.has_metric_category[MetricCategory.MULTICHOICE]: @@ -564,22 +546,15 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] + ): results, 
cur_outputs = apply_generative_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: - results, cur_outputs = apply_generative_sampling_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex - ) - outputs.update(cur_outputs) - - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - results, cur_outputs = apply_generative_logprob_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics - ) - outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.MULTICHOICE]: results, cur_outputs = apply_multichoice_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 952fd430d..51abf61d6 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -34,8 +34,6 @@ class RequestType(Enum): LOGLIKELIHOOD_ROLLING = auto() GREEDY_UNTIL = auto() GREEDY_UNTIL_MULTI_TURN = auto() - GREEDY_UNTIL_WITH_LOGITS = auto() - GREEDY_UNTIL_WITH_SAMPLING = auto() @dataclass @@ -120,6 +118,8 @@ class GreedyUntilRequest(Request): generation_size: int request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None + num_samples: int = None + use_logits: bool = False @dataclass @@ -138,35 +138,6 @@ class GreedyUntilMultiTurnRequest(Request): request_type = RequestType.GREEDY_UNTIL_MULTI_TURN -@dataclass -class GreedyUntilWithLogitsRequest(GreedyUntilRequest): - """ - Represents a request for generating text using the Greedy-Until strategy but - returning the logits. - - Attributes: - request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). - """ - - request_type = RequestType.GREEDY_UNTIL_WITH_LOGITS - - -@dataclass -class GreedyUntilWithSamplingRequest(GreedyUntilRequest): - """ - Represents a request for generating text using the Greedy-Until strategy but - returning the logits. - - Attributes: - stop_sequence (str): The sequence of tokens that indicates when to stop generating text. - generation_size (int): The maximum number of tokens to generate. - request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). - """ - - request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLING - num_samples: int = None - - class TaskExampleId(NamedTuple): """ Represents the identifier for an example in a task. 
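To make the task-side change concrete, the sketch below shows how a single `GreedyUntilRequest` can now cover the GENERATIVE, GENERATIVE_LOGPROB and GENERATIVE_SAMPLING categories at once. It is illustrative only: `GreedyUntilRequestSketch` and `build_generative_request` are hypothetical names, but the fields and the `maj_at_k` parsing follow the diff above (`use_logits` is enabled when a logprob metric is present, and `num_samples` is the largest k requested by any majority-vote metric, with 1 always included for plain greedy generation).

```python
# Illustrative only: a hypothetical helper showing how the per-request fields could be
# derived from a task's metric list. Field and metric names follow this patch series,
# but `build_generative_request` itself is not part of lighteval.
from dataclasses import dataclass


@dataclass
class GreedyUntilRequestSketch:
    task_name: str
    context: str
    stop_sequence: list[str]
    generation_size: int
    num_samples: int = 1
    use_logits: bool = False


def build_generative_request(task_name: str, context: str, metrics: list[str]) -> GreedyUntilRequestSketch:
    # maj_at_k metric names drive num_samples; 1 is always included so plain greedy
    # generation keeps working when no sampling metric is requested.
    maj_ks = [int(m.replace("maj_at_", "").split("_")[0]) for m in metrics if "maj_at_" in m]
    return GreedyUntilRequestSketch(
        task_name=task_name,
        context=context,
        stop_sequence=["\n"],
        generation_size=256,
        num_samples=max([1] + maj_ks),
        # Hypothetical stand-in for checking MetricCategory.GENERATIVE_LOGPROB membership.
        use_logits=any("logprob" in m for m in metrics),
    )


if __name__ == "__main__":
    request = build_generative_request(
        task_name="gsm8k", context="Question: 2 + 2 = ?", metrics=["quasi_exact_match_gsm8k", "maj_at_8_gsm8k"]
    )
    print(request.num_samples, request.use_logits)  # -> 8 False
```

Since one request per document now serves all three categories, the dedicated GREEDY_UNTIL_WITH_LOGITS and GREEDY_UNTIL_WITH_SAMPLING request types and their dataclasses can be dropped, as shown above.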
diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py index 35f6634f8..d8a6503ac 100644 --- a/tests/test_unit_harness_metrics.py +++ b/tests/test_unit_harness_metrics.py @@ -26,7 +26,6 @@ import pytest from lighteval.metrics import ( - apply_generative_logprob_metric, apply_generative_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -129,14 +128,13 @@ def apply_metric(metric, results, formatted_doc: Doc): if Metrics[metric].value.category == MetricCategory.PERPLEXITY: _, cur_outputs = apply_perplexity_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs - if Metrics[metric].value.category == MetricCategory.GENERATIVE: + if Metrics[metric].value.category in [ + MetricCategory.GENERATIVE, + MetricCategory.GENERATIVE_LOGPROB, + MetricCategory.GENERATIVE_SAMPLING, + ]: _, cur_outputs = apply_generative_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs - if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: - _, cur_outputs = apply_generative_logprob_metric( - results=results, formatted_doc=formatted_doc, metrics=[metric] - ) - return cur_outputs if Metrics[metric].value.category == MetricCategory.MULTICHOICE: _, cur_outputs = apply_multichoice_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs From 71aa2b88608dc4fb9b90198b2a82e1fa9284f543 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 09:51:54 +0000 Subject: [PATCH 08/13] edit to prevent sampling for providing too many answers to some metrics --- src/lighteval/metrics/__init__.py | 16 +++++++++++++--- src/lighteval/tasks/lighteval_task.py | 6 +++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 7577c7713..c4e44c03c 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -66,7 +66,9 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr return results, outputs -def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None): +def apply_generative_metric( + results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None, max_num_samples=1 +): outputs = {} # Post processing prediction @@ -93,11 +95,19 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] + preds_no_sampling = preds + if max_num_samples > 1: # We want to run our evaluation on only one sample for base generative evals + preds_no_sampling = as_list(preds[0]) + for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) + outputs.update( + Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: - outputs.update(Metrics[metric].value.compute(results=results, formatted_doc=formatted_doc)) + outputs.update( + Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: 
outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index cef8d5d0d..f9df6fddb 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -552,7 +552,11 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] ): results, cur_outputs = apply_generative_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex + results=results, + formatted_doc=formatted_doc, + metrics=self.metrics, + output_regex=self.output_regex, + max_num_samples=max(self.num_samples), ) outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.MULTICHOICE]: From e36d6e091458a6e4171317bf97a9b7d7db1911c4 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 09:58:49 +0000 Subject: [PATCH 09/13] added some doc --- src/lighteval/metrics/metrics_sample.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 5b286afba..3a64f5d03 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -712,12 +712,14 @@ def __init__( ) self.type_exact_match = type_exact_match - def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict[str, float]: """Computes the metric over a list of golds and predictions for one single sample. + It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, + then compares it to the gold. Args: golds (list[str]): Reference targets - predictions (list[str]): Predicted strings + predictions (list[str]): k predicted strings Returns: float: Aggregated score over the current sample's items. From 4ef4c99779eb4a40b4b3cefcaf58c10a73d3dfe7 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 13:06:12 +0000 Subject: [PATCH 10/13] neither nanotron nor endpoints models cover sampling atm --- src/lighteval/models/endpoint_model.py | 5 +++++ src/lighteval/models/nanotron_model.py | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index 03e184bc3..d79e0f912 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -260,6 +260,11 @@ def greedy_until( ): # the `returns_logits` flag is only used to filter the results, we always request the full details. 
+ returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples + if num_samples > 1: + hlog_err( + "Inference endpoints do not allow sampling evaluations - this is likely to fail or provide problematic results" + ) if self.use_async: responses = asyncio.run(self.__async_process_batch_generate(batch)) else: diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index eecd18fb1..977b2b198 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -54,7 +54,7 @@ LoglikelihoodDataset, LoglikelihoodSingleTokenDataset, ) -from lighteval.logging.hierarchical_logger import hlog_warn +from lighteval.logging.hierarchical_logger import hlog_err, hlog_warn from lighteval.models.base_model import LightevalModel from lighteval.models.model_config import EnvConfig from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn @@ -1201,6 +1201,11 @@ def greedy_until( # need to pass them somewhere ! stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples + if num_samples > 1: + hlog_err( + "Nanotron models do not allow sampling evaluations - this is likely to fail or provide problematic results" + ) # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk diff --git a/README.md b/README.md index e7c549280..0596061f3 100644 --- a/README.md +++ b/README.md @@ -350,6 +350,7 @@ These metrics need the model to generate an output. They are therefore slower. - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation - `f1_score_macro`: Corpus level macro F1 score - `f1_score_macro`: Corpus level micro F1 score + - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent one is the actual prediction. - Summarization: - `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) - `rouge1` (HELM): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap. @@ -376,7 +377,9 @@ These metrics need the model to generate an output. They are therefore slower. - `edit_similarity`: average Levenshtein edit similarity (normalized by length of longer sequence) between model generation and reference. - Math: - `quasi_exact_match_math` (HELM): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed) + - `maj_at_4_math` (Lighteval): Majority choice evaluation, using the math normalisation for the predictions and gold - `quasi_exact_match_gsm8k` (Harness): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed) + - `maj_at_8_gsm8k` (Lighteval): Majority choice evaluation, using the gsm8k normalisation for the predictions and gold ### Metrics for specific tasks To keep compatibility with the Harness for some specific tasks, we ported their evaluations more or less as such.
They include `drop` (for the DROP dataset) and `truthfulqa_mc_metrics` (for TruthfulQA). In general, except for tasks where the dataset has a very different formatting than usual (an other language, programming language, math, ...), we want to use standard implementations of the above metrics. It makes little sense to have 10 different versions of an exact match depending on the task. However, most of the above metrics are parametrizable so that you can change the normalization applied easily for experimental purposes. From 39cf6765dd9ae1a816935fb1534e531981e09b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:20:56 +0200 Subject: [PATCH 12/13] Update src/lighteval/metrics/metrics_sample.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- src/lighteval/metrics/metrics_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 3a64f5d03..37b922dac 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -755,7 +755,7 @@ def get_processed_pred(self, pred: str) -> float: return pred - def compute_score(self, pred: str, gold: str): + def compute_score(self, pred: str, gold: str) -> int: if self.type_exact_match == "prefix": return 1 if pred.startswith(gold) else 0 if self.type_exact_match == "suffix": From 5a6988ef8cffc5d6d5d64d1960774627e4e80f48 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 23 Apr 2024 09:22:21 +0000 Subject: [PATCH 13/13] added review change --- src/lighteval/metrics/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index c4e44c03c..3d5257562 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -95,18 +95,22 @@ def apply_generative_metric( preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] - preds_no_sampling = preds - if max_num_samples > 1: # We want to run our evaluation on only one sample for base generative evals - preds_no_sampling = as_list(preds[0]) - for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: outputs.update( - Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + Metrics[metric].value.compute( + golds=golds, + predictions=as_list(preds[0]) if max_num_samples > 0 else preds, + formatted_doc=formatted_doc, + ) ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: outputs.update( - Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + Metrics[metric].value.compute( + golds=golds, + predictions=as_list(preds[0]) if max_num_samples > 0 else preds, + formatted_doc=formatted_doc, + ) ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc))
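As a closing illustration of the sampling guard introduced in the last two patches: base generative metrics are only ever given the first generation, while a sampling metric such as maj@k receives all k generations, takes the most frequent answer and compares it to the gold. The sketch below is a simplified stand-in (the `score_exact_match`, `score_majority` and `apply_metrics` helpers are hypothetical, not lighteval's metric objects), assuming the maj@k behaviour described in the README entry above.

```python
# A hedged sketch of the prediction-splitting behaviour: plain generative metrics see
# only the first sample, while maj@k sees every sampled generation. The helper names
# are illustrative, not lighteval's API.
from collections import Counter


def score_exact_match(golds: list[str], predictions: list[str]) -> int:
    # Base generative metric: only meant to see a single prediction.
    return int(predictions[0].strip() == golds[0].strip())


def score_majority(golds: list[str], predictions: list[str]) -> int:
    # Sampling metric: take the most frequent of the k predictions, then compare it to the gold.
    most_common, _ = Counter(p.strip() for p in predictions).most_common(1)[0]
    return int(most_common == golds[0].strip())


def apply_metrics(golds: list[str], preds: list[str], max_num_samples: int) -> dict[str, int]:
    # Mirrors the `as_list(preds[0]) if max_num_samples > 0 else preds` guard above:
    # the exact-match style metric gets the first sample only, maj@k gets everything.
    return {
        "em": score_exact_match(golds, [preds[0]] if max_num_samples > 0 else preds),
        "maj@k": score_majority(golds, preds),
    }


if __name__ == "__main__":
    # Three sampled generations for one document; the first one is wrong, but the
    # majority answer matches the gold.
    print(apply_metrics(golds=["4"], preds=["5", "4", "4"], max_num_samples=3))  # {'em': 0, 'maj@k': 1}
```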