Commit 16318bb

NathanHB and Copilot authored
Add auto tests for metrics (#939)
- Adds a mechanism to auto-test metrics: when creating a metric, you now create a JSON file with test cases (input, output, and expected results).
- Moves unit tests to a tests/unit folder.
- Fixes broken metrics.

Co-authored-by: Copilot <[email protected]>
1 parent 3a71f68 commit 16318bb

86 files changed: +6185 −30 lines changed


.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 *.json filter=lfs diff=lfs merge=lfs -text
+tests/unit/metrics/test_cases/*.json -filter -diff -merge text
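The exact schema of these test-case JSON files is not shown in this diff, so the following is only a rough sketch of the idea described in the commit message: a stored input plus an expected result, compared with deepdiff (added to the tests extra below). The field names ("input", "expected") are hypothetical, not the schema lighteval actually uses.

    # Hypothetical sketch: load a metric test case and compare the metric's output
    # against the stored expectation. Field names are assumptions, not lighteval's schema.
    import json
    from pathlib import Path

    from deepdiff import DeepDiff


    def check_metric_case(path: Path, compute_fn) -> None:
        case = json.loads(path.read_text())
        result = compute_fn(**case["input"])  # run the metric on the stored input
        diff = DeepDiff(case["expected"], result, significant_digits=6)
        assert not diff, f"{path.name}: {diff}"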

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ nanotron = [
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
 quality = ["ruff>=v0.11.0","pre-commit"]
-tests = ["pytest>=7.4.0","deepdiff"]
+tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [

src/lighteval/metrics/imports/summac.py

Lines changed: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ def build_image(self, original, generated):
             truncation=True,
             max_length=self.max_input_length,
             return_tensors="pt",
-            truncation_strategy="only_first",
         )
         batch_tokens = {k: v.to(self.device) for k, v in batch_tokens.items()}
         with torch.no_grad():

src/lighteval/metrics/metrics.py

Lines changed: 1 addition & 1 deletion
@@ -390,7 +390,7 @@ class Metrics(Enum):
         metric_name="mf1",
         sample_level_fn=LoglikelihoodPreparator(is_single_token=True),
         category=SamplingMethod.LOGPROBS,
-        corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
+        corpus_level_fn=CorpusLevelF1Score(average="micro", num_classes=3),
         higher_is_better=True,
     )
     pass_at_k = SampleLevelMetric(

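For context on the average change above: in scikit-learn, f1_score(..., average=None) returns one score per class rather than a single number, while a string such as "micro" aggregates them into one float, which is what a corpus-level metric is expected to return. A standalone illustration (not lighteval code):

    # Standalone illustration of sklearn's `average` argument; not lighteval code.
    from sklearn.metrics import f1_score

    golds = [0, 1, 2, 1, 0]
    preds = [0, 2, 2, 1, 0]

    print(f1_score(golds, preds, average=None))     # one F1 value per class (array)
    print(f1_score(golds, preds, average="micro"))  # a single aggregated float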
src/lighteval/metrics/metrics_corpus.py

Lines changed: 16 additions & 2 deletions
@@ -105,7 +105,11 @@ def compute_corpus(self, items: list[LogprobCorpusMetricInput]):
         # Multi f1
         f1s = []
         for i in range(self.num_classes):
-            f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
+            f1s.append(
+                sklearn.metrics.f1_score(
+                    y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average
+                )
+            )
         return float(np.mean(f1s))

@@ -122,6 +126,9 @@ def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""):

     def get_metric(self):
         if self.metric_type == "bleu":
+            import nltk
+
+            nltk.download("punkt_tab")
             return sacrebleu.BLEU(trg_lang=self.lang)
         elif self.metric_type == "chrf":
             return sacrebleu.CHRF()

@@ -144,7 +151,14 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
                     f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{metric.__name__})."
                 )
             preds.append(pred[0])
-        return float(metric.corpus_score(hypotheses=preds, references=golds).score)
+
+        if self.metric_type == "bleu":
+            golds = [[gold[0] for gold in golds]]
+
+        corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
+        score = corpus_score.score
+        results = float(score)
+        return results


 class CorpusLevelPerplexityMetric(CorpusLevelComputation):

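A note on the BLEU-specific reshaping above: sacrebleu's corpus_score expects references as a list of reference streams, where each stream holds one reference per hypothesis, so the per-sample golds are transposed into a single stream before scoring. A standalone sketch of that calling convention (not lighteval code):

    # Standalone sketch of sacrebleu's corpus-level API; not lighteval code.
    import sacrebleu

    hypotheses = ["the cat sat on the mat", "hello world"]
    # One reference *stream*: entry i is the reference for hypothesis i.
    references = [["the cat sat on the mat", "hello there world"]]

    bleu = sacrebleu.BLEU()
    print(bleu.corpus_score(hypotheses, references).score)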
src/lighteval/metrics/metrics_sample.py

Lines changed: 20 additions & 14 deletions
@@ -823,6 +823,9 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         Returns:
             float: Score over the current sample's items.
         """
+        import nltk
+
+        nltk.download("punkt_tab")
         golds = doc.get_golds()
         predictions = model_response.final_text
         return np.mean([self._bleu_score(golds, p) for p in predictions])

@@ -1122,6 +1125,7 @@ def __init__(
                 raise ValueError(f"Unknown normalization function: {normalize}")
         else:
             self.normalize = normalize
+
         self.strip_strings = strip_strings

         if callable(sample_scoring_function):

@@ -1141,6 +1145,7 @@ def __init__(
             else:
                 self.type_exact_match = "full"
             self.compute_score = self.default_sample_scoring
+            self.score_sample = self.default_sample_scoring

     def preprocess(self, text: str) -> str:
         if not text:

@@ -1194,7 +1199,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
         """
         all_scores = []
         for i in range(self.k):
-            all_scores.append(self.compute_score(doc, model_response[i]))
+            all_scores.append(self.score_sample(doc, model_response[i]))

         avg_score = np.mean(all_scores)
         return avg_score

@@ -1221,30 +1226,31 @@ def __init__(self, k: int | None = None, **kwargs):
         self.k = k
         self.attribute_must_be_set = ["k"]

-    def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         """Computes the metric over a list of golds and predictions for one single sample.
-        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
-        then compares it to the gold.
+        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold.

         Args:
+            doc (Doc): The document containing gold references.
             model_response (ModelResponse): The model's response containing predictions.
-            docs (Doc): The document containing gold references.
             **kwargs: Additional keyword arguments.

         Returns:
             float: Aggregated score over the current sample's items.
         """
         if self.k is None:
             raise Exception("You did not set the value of k")
-        golds = docs.get_golds()
+
+        golds = doc.get_golds()
+
         if len(golds) > 1:
             raise Exception("Cannot compute maj@k with several golds")

-        processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
+        processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
         new_doc = Doc(
             choices=processed_choices,
-            query=docs.query,
-            gold_index=docs.gold_index,
+            query=doc.query,
+            gold_index=list(range(len(processed_choices))),
         )
         all_answers = []
         for pred in model_response.final_text[: self.k]:

@@ -1253,7 +1259,7 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
         new_model_response = ModelResponse(
             text=[majority_prediction],
         )
-        return self.compute_score(new_model_response, new_doc)
+        return self.compute_score(new_doc, new_model_response)

     def num_samples(self):
         return self.k

@@ -1433,8 +1439,8 @@ def compute_mg_pass_at_k(n, c, k):
         metrics = {}
         for k in ks:
             for t in thresholds:
-                metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
-            metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)
+                metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
+            metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k)

         return metrics

@@ -1446,8 +1452,8 @@ def metric_names(self):
         metrics = []
         for k in ks:
             for t in thresholds:
-                metrics.append(f"{self.name}@{k}_{t}")
-            metrics.append(f"m{self.name}@{k}")
+                metrics.append(f"{self.name}{k}_{t}")
+            metrics.append(f"m{self.name}{k}")

         return metrics

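The maj@k changes above mostly swap the doc/model_response argument order and rebuild the intermediate Doc from the processed choices; the underlying idea remains a majority vote over the first k generations compared against the single gold. A simplified standalone sketch of that idea (not the lighteval implementation):

    # Simplified standalone sketch of majority voting over k generations;
    # not the lighteval implementation.
    from collections import Counter


    def maj_at_k(gold: str, predictions: list[str], k: int) -> float:
        candidates = [p.strip() for p in predictions[:k]]
        majority, _ = Counter(candidates).most_common(1)[0]
        return float(majority == gold.strip())


    print(maj_at_k("42", [" 42", "41", "42 "], k=3))  # 1.0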
src/lighteval/metrics/utils/metric_utils.py

Lines changed: 0 additions & 1 deletion
@@ -50,7 +50,6 @@ def compute_sample(
         elif isinstance(self.sample_level_fn, Preparator):
             sample_level_fn = self.sample_level_fn.prepare
         else:
-            breakpoint()
             raise ValueError(
                 f"Incorrect type for {self.sample_level_fn}, should be a SampleLevelComputation or Preparator"
             )

src/lighteval/models/model_output.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def __getitem__(self, index: int) -> "ModelResponse":
             input=self.input,
             input_tokens=self.input_tokens,
             text=[self.text[index]],
-            output_tokens=[self.output_tokens[index]],
+            output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
             logprobs=[self.logprobs[index]] if self.logprobs else [],
             argmax_logits_eq_gold=[self.argmax_logits_eq_gold[index]] if self.argmax_logits_eq_gold else [],
             logits=[self.logits[index]] if self.logits else None,

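The model_output.py fix guards the indexing the same way the neighbouring fields already do: take output_tokens[index] only when the list is populated, otherwise fall back to an empty list. A tiny standalone sketch of that pattern (values are made up):

    # Standalone sketch of the "index only if populated" guard; values are made up.
    output_tokens: list[list[int]] = []  # may be empty for some backends
    index = 0

    selected = [output_tokens[index]] if output_tokens else []
    print(selected)  # [] instead of an IndexError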
src/lighteval/tasks/extended/ifbench/instructions.py

Lines changed: 6 additions & 5 deletions
@@ -142,7 +142,7 @@ def build_description(self, *, N=None):
         """Build the instruction description.

         Args:
-          n: An integer specifying the number of unique words contained in the response.
+          N: An integer specifying the number of unique words contained in the response.

         Returns:
           A string representing the instruction description.

@@ -2113,7 +2113,7 @@ def build_description(self, *, prompt_to_repeat=None):
         """Build the instruction description.

         Args:
-          keyword: A string representing a keyword that is expected in the response.
+          prompt_to_repeat: The prompt that is meant to be repeated.

         Returns:
           A string representing the instruction description.

@@ -2187,11 +2187,12 @@ def build_description(self, prompt_to_repeat=None, n_start=None, n_end=None):
         """Build the instruction description.

         Args:
-          n_start: An integer representing the start index of the span.
-          n_end: An integer representing the end index of the span.
+          prompt_to_repeat: The prompt that is meant to be repeated.
+          n_start: An integer representing the start index of the span.
+          n_end: An integer representing the end index of the span.

         Returns:
-            A string representing the instruction description.
+          A string representing the instruction description.
         """
         if not prompt_to_repeat:
             raise ValueError("prompt_to_repeat must be set.")

src/lighteval/tasks/extended/lcb/main.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
     higher_is_better=True,
     sample_level_fn=codegen_metric,
     corpus_level_fn=np.mean,
+    batched_compute=False,
 )