
Commit 2dc1788

rolshoven and NathanHB authored
Multilingual extractiveness (#956)
* Added German, French, and Italian language support to Extractiveness metric
* Added minimum version for spacy dependency
* Added changes from code review
* Added missing newline

Co-authored-by: Nathan Habib <[email protected]>
1 parent 96a6882 commit 2dc1788

File tree (4 files changed: +87 −21 lines)

pyproject.toml
src/lighteval/metrics/imports/data_stats_metric.py
src/lighteval/metrics/metrics.py
src/lighteval/metrics/metrics_sample.py


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ extended_tasks = [
 s3 = ["s3fs"]
 multilingual = [
     "stanza",
-    "spacy[ja,ko,th]",
+    "spacy[ja,ko,th]>=3.8.0",
     "jieba", # for chinese tokenizer
     "pyvi", # for vietnamese tokenizer
 ]
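The only change here is a version floor on the spaCy extra. As a quick sanity check, not part of the commit, the installed version can be compared against the new constraint; this sketch assumes the packaging library is available (it ships with any environment that has pip or spaCy installed):

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# The ">=3.8.0" floor mirrors the constraint added to the "multilingual" extra above.
installed = version("spacy")
assert installed in SpecifierSet(">=3.8.0"), f"spaCy {installed} is older than 3.8.0"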

src/lighteval/metrics/imports/data_stats_metric.py

Lines changed: 29 additions & 11 deletions
@@ -27,15 +27,20 @@
 import logging
 from collections import Counter
 from multiprocessing import Pool
+from typing import Literal

 from lighteval.metrics.imports.data_stats_utils import Fragments
 from lighteval.utils.imports import NO_SPACY_ERROR_MSG, is_spacy_available


 logger = logging.getLogger(__name__)

-
-_en = None
+LANGUAGE_TO_SPACY_MODEL_MAP = {
+    "en": "en_core_web_sm",
+    "de": "de_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "it": "it_core_news_sm",
+}


 class Metric:
@@ -51,8 +56,16 @@ def find_ngrams(input_list, n):


 class DataStatsMetric(Metric):
-    def __init__(self, n_gram=3, n_workers=24, case=False, tokenize=True):
-        """Data Statistics metric
+    def __init__(
+        self,
+        n_gram: int = 3,
+        n_workers: int = 24,
+        case: bool = False,
+        tokenize: bool = True,
+        language: Literal["en", "de", "fr", "it"] = "en",
+    ):
+        """
+        Data Statistics metric
         Makes use of Newsroom code: \
             https://github.com/lil-lab/newsroom/blob/master/newsroom/analyze/fragments.py
         Calculates extractive statistics such as coverage, density, compression as
@@ -69,6 +82,9 @@ def __init__(self, n_gram=3, n_workers=24, case=False, tokenize=True):
             case (bool): whether to lowercase input before calculating statistics.
             tokenize (bool): whether to tokenize the input; otherwise assumes that the input
                 is a string of space-separated tokens.
+            language (Literal["en", "de", "fr", "it"]): the language of the input text. This
+                determines the spaCy model used for tokenization. Currently supports English,
+                German, French, and Italian.
         """
         if not is_spacy_available():
             raise ImportError(NO_SPACY_ERROR_MSG)
@@ -78,22 +94,24 @@ def __init__(self, n_gram=3, n_workers=24, case=False, tokenize=True):
         self.n_workers = n_workers
         self.case = case
         self.tokenize = tokenize
+        self.language = language
+        self.nlp = None

-        global _en
+        spacy_model = LANGUAGE_TO_SPACY_MODEL_MAP.get(self.language, "en_core_web_sm")
         try:
-            _en = spacy.load("en_core_web_sm")
+            self.nlp = spacy.load(spacy_model)
         except OSError:
-            logger.info("Downloading the spacy en_core_web_sm model\n(don't worry, this will only happen once)")
+            logger.info(f"Downloading the spacy {spacy_model} model\n(don't worry, this will only happen once)")
             from spacy.cli import download

-            download("en_core_web_sm")
-            _en = spacy.load("en_core_web_sm")
+            download(spacy_model)
+            self.nlp = spacy.load(spacy_model)

     def evaluate_example(self, summary, input_text):
         if self.tokenize:
-            input_text = _en(input_text, disable=["tagger", "parser", "ner", "textcat"])
+            input_text = self.nlp(input_text, disable=["tagger", "parser", "ner", "textcat"])
             input_text = [tok.text for tok in input_text]
-            summary = _en(summary, disable=["tagger", "parser", "ner", "textcat"])
+            summary = self.nlp(summary, disable=["tagger", "parser", "ner", "textcat"])
             summary = [tok.text for tok in summary]
         fragments = Fragments(summary, input_text, case=self.case)
         coverage = fragments.coverage()
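Taken together, these hunks replace the module-level _en pipeline with a per-instance self.nlp selected from LANGUAGE_TO_SPACY_MODEL_MAP. A minimal usage sketch, not part of the commit (the German example sentences are invented, and the first run downloads the chosen spaCy model):

from lighteval.metrics.imports.data_stats_metric import DataStatsMetric

# "de" selects de_core_news_sm; unknown codes fall back to en_core_web_sm.
metric = DataStatsMetric(language="de")
scores = metric.evaluate_example(
    summary="Der Vertrag wurde im Mai unterzeichnet.",
    input_text="Der Vertrag wurde im Mai in Berlin unterzeichnet und gilt ab Juli.",
)
print(scores)  # extractive statistics such as coverage, density, and compression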

src/lighteval/metrics/metrics.py

Lines changed: 53 additions & 8 deletions
@@ -26,9 +26,7 @@
 import numpy as np
 from aenum import Enum

-from lighteval.metrics.dynamic_metrics import (
-    MultilingualExtractiveMatchMetric,
-)
+from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric
 from lighteval.metrics.harness_compatibility.drop import DropMetrics
 from lighteval.metrics.harness_compatibility.truthful_qa import TruthfulqaMCMetrics
 from lighteval.metrics.metrics_corpus import (
@@ -57,11 +55,7 @@
     Recall,
     StringDistance,
 )
-from lighteval.metrics.normalizations import (
-    bigbench_normalizer,
-    remove_braces,
-    remove_braces_and_strip,
-)
+from lighteval.metrics.normalizations import bigbench_normalizer, remove_braces, remove_braces_and_strip
 from lighteval.metrics.sample_preparator import (
     GenerativePreparator,
     LoglikelihoodPreparator,
@@ -231,6 +225,57 @@ class Metrics(Enum):
             "summarization_compression": True,
         },
     )
+    extractiveness_de = SampleLevelMetricGrouping(
+        metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
+        sample_level_fn=Extractiveness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="de"
+        ),
+        category=SamplingMethod.GENERATIVE,
+        corpus_level_fn={
+            "summarization_coverage": np.mean,
+            "summarization_density": np.mean,
+            "summarization_compression": np.mean,
+        },
+        higher_is_better={
+            "summarization_coverage": True,
+            "summarization_density": True,
+            "summarization_compression": True,
+        },
+    )
+    extractiveness_fr = SampleLevelMetricGrouping(
+        metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
+        sample_level_fn=Extractiveness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="fr"
+        ),
+        category=SamplingMethod.GENERATIVE,
+        corpus_level_fn={
+            "summarization_coverage": np.mean,
+            "summarization_density": np.mean,
+            "summarization_compression": np.mean,
+        },
+        higher_is_better={
+            "summarization_coverage": True,
+            "summarization_density": True,
+            "summarization_compression": True,
+        },
+    )
+    extractiveness_it = SampleLevelMetricGrouping(
+        metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
+        sample_level_fn=Extractiveness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="it"
+        ),
+        category=SamplingMethod.GENERATIVE,
+        corpus_level_fn={
+            "summarization_coverage": np.mean,
+            "summarization_density": np.mean,
+            "summarization_compression": np.mean,
+        },
+        higher_is_better={
+            "summarization_coverage": True,
+            "summarization_density": True,
+            "summarization_compression": True,
+        },
+    )
     f1_score = SampleLevelMetric(
         metric_name="f1",
         sample_level_fn=F1_score(),
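The three new enum members mirror the existing English extractiveness entry, differing only in the language passed to Extractiveness. A small inspection sketch, not part of the commit, assuming (as the diff suggests) that each member's value is the SampleLevelMetricGrouping built above and that it exposes metric_name as an attribute:

from lighteval.metrics.metrics import Metrics

for member in (Metrics.extractiveness_de, Metrics.extractiveness_fr, Metrics.extractiveness_it):
    grouping = member.value  # the SampleLevelMetricGrouping defined in the diff
    print(member.name, grouping.metric_name)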

src/lighteval/metrics/metrics_sample.py

Lines changed: 4 additions & 1 deletion
@@ -664,6 +664,7 @@ def __init__(
         normalize_input: callable = remove_braces,
         normalize_pred: callable = remove_braces_and_strip,
         input_column: str = "text",
+        language: Literal["en", "de", "fr", "it"] = "en",
     ):
         """Extractiveness metric class.

@@ -673,11 +674,13 @@ def __init__(
             normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                 Defaults to remove_braces_and_strip from lighteval.metrics.normalizations if no normalization is applied.
            input_column (str): Column in the formatted_doc to use for the input. Defaults to "text".
+            language (str): Language ISO code for the input text. Defaults to "en".
         """
         self.stats_metric = None
         self.normalize_input = normalize_input
         self.normalize_pred = normalize_pred
         self.input_column = input_column
+        self.language = language

     def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str, float]:
         """Compute the extractiveness of the predictions.
@@ -694,7 +697,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str, float]:
             dict[str, float]: The extractiveness scores.
         """
         if self.stats_metric is None:
-            self.stats_metric = DataStatsMetric()
+            self.stats_metric = DataStatsMetric(language=self.language)

         inp = doc.specific[self.input_column]
         prediction = model_response.final_text[0]
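Because stats_metric is built lazily, constructing Extractiveness is cheap and does not touch spaCy; the language only takes effect on the first compute() call. A minimal sketch, not part of the commit:

from lighteval.metrics.metrics_sample import Extractiveness

extractiveness = Extractiveness(input_column="text", language="fr")
# The spaCy-backed metric is created on the first compute() call,
# as DataStatsMetric(language="fr"), so no model is downloaded here.
assert extractiveness.stats_metric is None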
