
Commit acef117

chuandudx, clefourrier, and NathanHB authored
Fix BLEURT evaluation errors (#316)
These changes address the issues described in #315. I made the code changes so that they build on the BERTScore changes (#311), which haven't been merged yet, so those changes appear here as well. Please let me know if there is a preference for removing them from this PR. Thank you!

---------

Co-authored-by: Clémentine Fourrier <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
1 parent b4cafa7 · commit acef117

File tree

2 files changed: 5 additions & 5 deletions


src/lighteval/metrics/metrics.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -121,12 +121,13 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+
     bleurt = SampleLevelMetric(
         metric_name="bleurt",
-        sample_level_fn=BLEURT.compute,
+        sample_level_fn=BLEURT().compute,
         category=MetricCategory.GENERATIVE,
         use_case=MetricUseCase.TRANSLATION,
-        corpus_level_fn=lambda x: np.mean(x.flatten()),  # flatten, then average
+        corpus_level_fn=np.mean,
         higher_is_better=True,
     )
     byte_perplexity = CorpusLevelMetric(
```
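
Why the first change is needed: `compute` is an instance method that reads `self.tokenizer` and `self.model`, both created in `BLEURT.__init__`, so the enum must hold a bound method from a constructed instance. Referencing `BLEURT.compute` on the class gives a plain function whose first positional argument gets consumed as `self`. A minimal sketch of that failure mode, using a hypothetical `Scorer` class rather than the real lighteval one:

```python
# Hypothetical Scorer class, sketching why BLEURT.compute had to
# become BLEURT().compute.
class Scorer:
    def __init__(self):
        self.offset = 1.0  # stands in for the model/tokenizer built in __init__

    def compute(self, value: float) -> float:
        return value + self.offset  # relies on instance state


unbound = Scorer.compute  # plain function: unbound(0.5) raises TypeError
                          # (0.5 is consumed as `self`, `value` is missing)
bound = Scorer().compute  # bound method: __init__ has run, state exists
print(bound(0.5))         # 1.5
```

The `corpus_level_fn` simplification pairs with the `.item()` change in the next file: once each sample-level score is a plain Python float, `np.mean` over the list of scores is enough, and the old lambda's `x.flatten()` (which assumed an ndarray input) no longer applies.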

src/lighteval/metrics/metrics_sample.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -702,7 +702,7 @@ def __init__(self):
         self.model = AutoModelForSequenceClassification.from_pretrained("Elron/bleurt-tiny-512")
         self.model.eval()
 
-    def compute(self, golds: list[str], predictions: list[str]) -> float:
+    def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float:
         """Uses the stored BLEURT scorer to compute the score on the current sample.
 
         Args:
@@ -715,8 +715,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:
         if len(predictions) == 1:
             predictions = predictions * len(golds)
         scores = self.model(**self.tokenizer(golds, predictions, return_tensors="pt"))[0].squeeze()
-
-        return scores
+        return scores.item()
 
 
 class BLEU:
```