
Commit ab05ad3

Class implementations of faithfulness and extractiveness metrics (#323)
Refactored extractiveness into a class and updated its instantiation in metrics.py. Refactored faithfulness into a class and updated its instantiation in metrics.py. Enabled a configurable SummaCZS model and a configurable input_column.

Co-authored-by: Clémentine Fourrier <[email protected]>
1 parent 0ada14b commit ab05ad3

File tree: 2 files changed (+106, −22 lines)

src/lighteval/metrics/metrics.py
src/lighteval/metrics/metrics_sample.py

src/lighteval/metrics/metrics.py

Lines changed: 8 additions & 4 deletions
@@ -40,15 +40,15 @@
     ROUGE,
     BertScore,
     ExactMatches,
+    Extractiveness,
     F1_score,
+    Faithfulness,
     JudgeLLM,
     LoglikelihoodAcc,
     MajAtK,
     Recall,
     StringDistance,
     acc_golds_likelihood,
-    extractiveness,
-    faithfulness,
 )
 from lighteval.metrics.normalizations import (
     LogProbCharNorm,
@@ -175,7 +175,9 @@ class Metrics(Enum):
     )
     extractiveness = SampleLevelMetricGrouping(
         metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
-        sample_level_fn=extractiveness,
+        sample_level_fn=Extractiveness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
+        ).compute,
         category=MetricCategory.GENERATIVE,
         use_case=MetricUseCase.SUMMARIZATION,
         corpus_level_fn={
@@ -223,7 +225,9 @@ class Metrics(Enum):
     )
     faithfulness = SampleLevelMetric(
         metric_name="summac",
-        sample_level_fn=faithfulness,
+        sample_level_fn=Faithfulness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
+        ).compute,
         category=MetricCategory.GENERATIVE,
         use_case=MetricUseCase.SUMMARIZATION,
         corpus_level_fn=np.mean,
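Because the metric logic is now wrapped in classes, a task that stores its source document under a different key can reuse the same metric without code changes. A minimal sketch of that, assuming a hypothetical dataset whose source text lives in an "article" column (the import paths below match those referenced in the diff; "article" and the variable name are illustrative only):

from lighteval.metrics.metrics_sample import Extractiveness
from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip

# Same metric as above, pointed at a hypothetical "article" column
# instead of the default "text".
extractiveness_on_articles = Extractiveness(
    normalize_input=remove_braces,
    normalize_pred=remove_braces_and_strip,
    input_column="article",
).compute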

src/lighteval/metrics/metrics_sample.py

Lines changed: 98 additions & 18 deletions
@@ -593,24 +593,104 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
         return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()}
 
 
-# todo: make into clean classes with call to normalizer
-def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
-    inp = remove_braces(formatted_doc.specific["text"])
-    pred = remove_braces_and_strip(predictions[0])
-    stats = DataStatsMetric().evaluate_example(pred, inp)
-    return {
-        "summarization_coverage": stats["coverage"],
-        "summarization_density": stats["density"],
-        "summarization_compression": stats["compression"],
-    }
-
-
-# todo: make into clean classes with call to normalizer
-def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs):
-    inp = remove_braces(formatted_doc.specific["text"])
-    pred = remove_braces_and_strip(predictions[0])
-    summac = SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=False)  # , device=device)
-    return summac.score_one(inp, pred)["score"]
+class Extractiveness:
+    def __init__(
+        self,
+        normalize_input: callable = remove_braces,
+        normalize_pred: callable = remove_braces_and_strip,
+        input_column: str = "text",
+    ):
+        """
+        Extractiveness metric class.
+
+        Args:
+            normalize_input (callable, optional): Function used to normalize the input strings.
+                Defaults to remove_braces from lighteval.metrics.normalizations; pass None to skip normalization.
+            normalize_pred (callable, optional): Function used to normalize the predicted strings.
+                Defaults to remove_braces_and_strip from lighteval.metrics.normalizations; pass None to skip normalization.
+            input_column (str): Column of the formatted_doc to use as the input. Defaults to "text".
+        """
+        self.stats_metric = None
+        self.normalize_input = normalize_input
+        self.normalize_pred = normalize_pred
+        self.input_column = input_column
+
+    def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
+        """
+        Compute the extractiveness of the predictions.
+
+        This method calculates coverage, density, and compression scores for a single
+        prediction against the input text.
+
+        Args:
+            predictions (list[str]): Predicted strings, a list of length 1.
+            formatted_doc (Doc): The formatted document.
+
+        Returns:
+            dict[str, float]: The extractiveness scores.
+        """
+        if self.stats_metric is None:
+            self.stats_metric = DataStatsMetric()
+
+        inp = formatted_doc.specific[self.input_column]
+        prediction = predictions[0]
+        if self.normalize_input:
+            inp = self.normalize_input(inp)
+        if self.normalize_pred:
+            prediction = self.normalize_pred(prediction)
+
+        stats = self.stats_metric.evaluate_example(prediction, inp)
+        return {
+            "summarization_coverage": stats["coverage"],
+            "summarization_density": stats["density"],
+            "summarization_compression": stats["compression"],
+        }
+
+
+class Faithfulness:
+    def __init__(
+        self,
+        normalize_input: callable = remove_braces,
+        normalize_pred: callable = remove_braces_and_strip,
+        input_column: str = "text",
+    ):
+        """
+        Faithfulness metric class.
+
+        Args:
+            normalize_input (callable, optional): Function used to normalize the input strings.
+                Defaults to remove_braces from lighteval.metrics.normalizations; pass None to skip normalization.
+            normalize_pred (callable, optional): Function used to normalize the predicted strings.
+                Defaults to remove_braces_and_strip from lighteval.metrics.normalizations; pass None to skip normalization.
+            input_column (str): Column of the formatted_doc to use as the input. Defaults to "text".
+        """
+        self.summac = None
+        self.normalize_input = normalize_input
+        self.normalize_pred = normalize_pred
+        self.input_column = input_column
+
+    def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
+        """
+        Compute the faithfulness of the predictions.
+
+        The SummaCZS (Summary Consistency Zero-Shot) model is used to score the consistency
+        between the input text and the prediction.
+
+        Args:
+            predictions (list[str]): Predicted strings, a list of length 1.
+            formatted_doc (Doc): The formatted document.
+
+        Returns:
+            float: The faithfulness score.
+        """
+        if self.summac is None:
+            self.summac = SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=False)  # , device=device
+        inp = formatted_doc.specific[self.input_column]
+        prediction = predictions[0]
+        if self.normalize_input:
+            inp = self.normalize_input(inp)
+        if self.normalize_pred:
+            prediction = self.normalize_pred(prediction)
+        return self.summac.score_one(inp, prediction)["score"]
 
 
 class BLEURT:
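Both classes defer loading their heavy backends (DataStatsMetric, SummaCZS) until the first compute call, so merely declaring the metrics in metrics.py stays cheap. A usage sketch of the new interface follows; it assumes Doc (from lighteval.tasks.requests) accepts the query/choices/gold_index/specific fields shown and that the example strings are placeholders, so treat it as illustrative rather than exact:

from lighteval.metrics.metrics_sample import Extractiveness
from lighteval.tasks.requests import Doc  # assumed location of the Doc dataclass

# Build a document whose source text sits in the default "text" column.
doc = Doc(
    query="Summarize the text.",
    choices=[],
    gold_index=0,
    specific={"text": "{The quick brown fox jumps over the lazy dog.}"},
)

# First call lazily instantiates DataStatsMetric, then scores the prediction.
scores = Extractiveness(input_column="text").compute(
    predictions=["The quick brown fox jumps over the dog."],
    formatted_doc=doc,
)
# scores is a dict with keys "summarization_coverage",
# "summarization_density", and "summarization_compression".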
