Merged
34 commits
d8cfc2e
Fix Sampling Metrics and Evals
NathanHB Aug 27, 2025
7ae5da5
remove breakpoint
NathanHB Aug 27, 2025
a00f3c0
add auto tests for metrics
NathanHB Aug 27, 2025
a892260
Merge branch 'main' into nathan-add-tests-for-metrics
NathanHB Aug 27, 2025
bf25211
Delete tests/unit/metrics/test_cases/README.md
NathanHB Aug 27, 2025
2b65d08
Delete tests/unit/metrics/test_unit_harness_metrics.py
NathanHB Aug 27, 2025
594b942
add pip as test dependency, for spacy to work correctly
NathanHB Aug 27, 2025
6db8263
Merge branch 'nathan-add-tests-for-metrics' of github.com:huggingface…
NathanHB Aug 27, 2025
9f7c2be
fix tests and reorg files
NathanHB Aug 28, 2025
e1a55ac
fix tests and reorg files
NathanHB Aug 28, 2025
c9e7243
better tests, passing
NathanHB Sep 1, 2025
e493b49
Merge remote-tracking branch 'origin/main' into nathan-add-tests-for-…
NathanHB Sep 1, 2025
5f323b7
Merge remote-tracking branch 'origin/main' into nathan-add-tests-for-…
NathanHB Sep 1, 2025
3d7b448
fix tests
NathanHB Sep 1, 2025
0c4a554
fix faithfulness metric
NathanHB Sep 2, 2025
594c269
adds corpus level metric testing
NathanHB Sep 3, 2025
fc01e6b
fix bleu metric
NathanHB Sep 3, 2025
c574035
fix bleu metric
NathanHB Sep 3, 2025
e127955
Merge branch 'main' into nathan-add-tests-for-metrics
NathanHB Sep 8, 2025
51db828
fix tests after merge
NathanHB Sep 8, 2025
70a5a10
Delete tests/slow_tests/test_sglang_model.py
NathanHB Sep 8, 2025
6384835
test simpleqa judge
NathanHB Sep 8, 2025
3c9aec6
Merge branch 'nathan-add-tests-for-metrics' of github.com:huggingface…
NathanHB Sep 8, 2025
b5b82a8
fix avg at k
NathanHB Sep 9, 2025
bf740a3
remove test files from git lfs cache
NathanHB Sep 15, 2025
ef216dc
re-add test-files to actual repo
NathanHB Sep 15, 2025
f903ee0
use SKIPPED_METRIC list instead of hardcoding all metric names
NathanHB Sep 15, 2025
86892e9
Merge remote-tracking branch 'origin/main' into nathan-add-tests-for-…
NathanHB Sep 15, 2025
23e9714
Update tests/unit/metrics/test_metrics_automated.py
NathanHB Sep 16, 2025
048b407
fix tests
NathanHB Sep 16, 2025
c4aebce
remove breakpoint
NathanHB Sep 16, 2025
432345e
remove breakpoint
NathanHB Sep 16, 2025
dab1dae
Merge branch 'main' into nathan-add-tests-for-metrics
NathanHB Sep 16, 2025
fd27034
fix quality
NathanHB Sep 16, 2025
1 change: 1 addition & 0 deletions .gitattributes
@@ -1 +1,2 @@
*.json filter=lfs diff=lfs merge=lfs -text
tests/unit/metrics/test_cases/*.json -filter -diff -merge text
Member Author (NathanHB):
do not use git-lfs for json files in the test_cases dir

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -99,7 +99,7 @@ nanotron = [
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
quality = ["ruff>=v0.11.0","pre-commit"]
tests = ["pytest>=7.4.0","deepdiff"]
tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
1 change: 0 additions & 1 deletion src/lighteval/metrics/imports/summac.py
@@ -221,7 +221,6 @@ def build_image(self, original, generated):
truncation=True,
max_length=self.max_input_length,
return_tensors="pt",
truncation_strategy="only_first",
)
batch_tokens = {k: v.to(self.device) for k, v in batch_tokens.items()}
with torch.no_grad():
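For context on the removed kwarg: `truncation_strategy` is the legacy tokenizer argument, and with current `transformers` tokenizers the `truncation` parameter already covers pair truncation. A minimal sketch, not the project's code (model name is only an example), in case only-first truncation is ever wanted again:

```python
from transformers import AutoTokenizer

# Illustrative model; any fast tokenizer accepts the same arguments here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch_tokens = tokenizer(
    ["original document chunk"],
    ["generated summary chunk"],
    padding=True,
    truncation="only_first",  # truncate only the first sequence of each pair
    max_length=512,
    return_tensors="pt",
)
```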
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
@@ -390,7 +390,7 @@ class Metrics(Enum):
metric_name="mf1",
sample_level_fn=LoglikelihoodPreparator(is_single_token=True),
category=SamplingMethod.LOGPROBS,
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
corpus_level_fn=CorpusLevelF1Score(average="micro", num_classes=3),
higher_is_better=True,
)
pass_at_k = SampleLevelMetric(
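For context on the `average=None` → `average="micro"` switch: in scikit-learn, `average=None` returns one F1 per class, while `"micro"` pools true/false positives across classes into a single score. A quick illustration with made-up labels (not lighteval code):

```python
from sklearn.metrics import f1_score

golds = [0, 1, 2, 2, 1, 0]  # made-up 3-class labels
preds = [0, 2, 2, 2, 1, 1]

f1_score(golds, preds, average=None)     # per-class F1 scores as an array
f1_score(golds, preds, average="macro")  # unweighted mean of per-class F1
f1_score(golds, preds, average="micro")  # single F1 from globally pooled TP/FP/FN
```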
18 changes: 16 additions & 2 deletions src/lighteval/metrics/metrics_corpus.py
@@ -105,7 +105,11 @@ def compute_corpus(self, items: list[LogprobCorpusMetricInput]):
# Multi f1
f1s = []
for i in range(self.num_classes):
f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
f1s.append(
sklearn.metrics.f1_score(
y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average
)
)
return float(np.mean(f1s))


@@ -122,6 +126,9 @@ def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""):

def get_metric(self):
if self.metric_type == "bleu":
import nltk

nltk.download("punkt_tab")
return sacrebleu.BLEU(trg_lang=self.lang)
elif self.metric_type == "chrf":
return sacrebleu.CHRF()
@@ -144,7 +151,14 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{metric.__name__})."
)
preds.append(pred[0])
return float(metric.corpus_score(hypotheses=preds, references=golds).score)

if self.metric_type == "bleu":
golds = [[gold[0] for gold in golds]]

corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
score = corpus_score.score
results = float(score)
return results


class CorpusLevelPerplexityMetric(CorpusLevelComputation):
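The BLEU-specific reshaping above matches how sacrebleu consumes references: `corpus_score` takes a flat list of hypotheses plus a list of reference streams, each stream aligned with the hypotheses. A minimal sketch with illustrative strings (not lighteval code):

```python
import sacrebleu

hypotheses = ["the cat sat on the mat", "hello world"]
# One reference *stream*: references[0][i] is the reference for hypotheses[i].
references = [["the cat is on the mat", "hello there world"]]

bleu = sacrebleu.BLEU()
print(bleu.corpus_score(hypotheses=hypotheses, references=references).score)
```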
34 changes: 20 additions & 14 deletions src/lighteval/metrics/metrics_sample.py
@@ -823,6 +823,9 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
Returns:
float: Score over the current sample's items.
"""
import nltk

nltk.download("punkt_tab")
golds = doc.get_golds()
predictions = model_response.final_text
return np.mean([self._bleu_score(golds, p) for p in predictions])
@@ -1122,6 +1125,7 @@ def __init__(
raise ValueError(f"Unknown normalization function: {normalize}")
else:
self.normalize = normalize

self.strip_strings = strip_strings

if callable(sample_scoring_function):
@@ -1141,6 +1145,7 @@ def __init__(
else:
self.type_exact_match = "full"
self.compute_score = self.default_sample_scoring
self.score_sample = self.default_sample_scoring

def preprocess(self, text: str) -> str:
if not text:
@@ -1194,7 +1199,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
"""
all_scores = []
for i in range(self.k):
all_scores.append(self.compute_score(doc, model_response[i]))
all_scores.append(self.score_sample(doc, model_response[i]))

avg_score = np.mean(all_scores)
return avg_score
@@ -1221,30 +1226,31 @@ def __init__(self, k: int | None = None, **kwargs):
self.k = k
self.attribute_must_be_set = ["k"]

def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
"""Computes the metric over a list of golds and predictions for one single sample.
It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
then compares it to the gold.
It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold.

Args:
doc (Doc): The document containing gold references.
model_response (ModelResponse): The model's response containing predictions.
docs (Doc): The document containing gold references.
**kwargs: Additional keyword arguments.

Returns:
float: Aggregated score over the current sample's items.
"""
if self.k is None:
raise Exception("You did not set the value of k")
golds = docs.get_golds()

golds = doc.get_golds()

if len(golds) > 1:
raise Exception("Cannot compute maj@k with several golds")

processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
new_doc = Doc(
choices=processed_choices,
query=docs.query,
gold_index=docs.gold_index,
query=doc.query,
gold_index=list(range(len(processed_choices))),
)
all_answers = []
for pred in model_response.final_text[: self.k]:
@@ -1253,7 +1259,7 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
new_model_response = ModelResponse(
text=[majority_prediction],
)
return self.compute_score(new_model_response, new_doc)
return self.compute_score(new_doc, new_model_response)

def num_samples(self):
return self.k
Expand Down Expand Up @@ -1433,8 +1439,8 @@ def compute_mg_pass_at_k(n, c, k):
metrics = {}
for k in ks:
for t in thresholds:
metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)
metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k)

return metrics

@@ -1446,8 +1452,8 @@ def metric_names(self):
metrics = []
for k in ks:
for t in thresholds:
metrics.append(f"{self.name}@{k}_{t}")
metrics.append(f"m{self.name}@{k}")
metrics.append(f"{self.name}{k}_{t}")
metrics.append(f"m{self.name}{k}")

return metrics

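The maj@k changes keep the same core idea: normalise the first k predictions, keep the most frequent answer, and score that single answer against the gold. A standalone sketch of the majority-vote step (not the lighteval helper itself):

```python
from collections import Counter

def majority_vote(predictions: list[str]) -> str:
    # Most frequent answer; ties resolve to the first answer seen.
    return Counter(predictions).most_common(1)[0][0]

majority_vote(["42", "41", "42"])  # -> "42"
```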
1 change: 0 additions & 1 deletion src/lighteval/metrics/utils/metric_utils.py
@@ -50,7 +50,6 @@ def compute_sample(
elif isinstance(self.sample_level_fn, Preparator):
sample_level_fn = self.sample_level_fn.prepare
else:
breakpoint()
raise ValueError(
f"Incorrect type for {self.sample_level_fn}, should be a SampleLevelComputation or Preparator"
)
2 changes: 1 addition & 1 deletion src/lighteval/models/model_output.py
@@ -149,7 +149,7 @@ def __getitem__(self, index: int) -> "ModelResponse":
input=self.input,
input_tokens=self.input_tokens,
text=[self.text[index]],
output_tokens=[self.output_tokens[index]],
output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
logprobs=[self.logprobs[index]] if self.logprobs else [],
argmax_logits_eq_gold=[self.argmax_logits_eq_gold[index]] if self.argmax_logits_eq_gold else [],
logits=[self.logits[index]] if self.logits else None,
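This one-line change guards indexing against responses that never populated `output_tokens`. The pattern, shown on a simplified stand-in dataclass (not the actual `ModelResponse`):

```python
from dataclasses import dataclass, field

@dataclass
class Response:  # hypothetical stand-in, not the real ModelResponse
    text: list[str]
    output_tokens: list[list[int]] = field(default_factory=list)

    def __getitem__(self, index: int) -> "Response":
        return Response(
            text=[self.text[index]],
            # Avoid an IndexError when output_tokens was never populated.
            output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
        )
```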
11 changes: 6 additions & 5 deletions src/lighteval/tasks/extended/ifbench/instructions.py
@@ -142,7 +142,7 @@ def build_description(self, *, N=None):
"""Build the instruction description.

Args:
n: An integer specifying the number of unique words contained in the response.
N: An integer specifying the number of unique words contained in the response.

Returns:
A string representing the instruction description.
@@ -2113,7 +2113,7 @@ def build_description(self, *, prompt_to_repeat=None):
"""Build the instruction description.

Args:
keyword: A string representing a keyword that is expected in the response.
prompt_to_repeat: The prompt that is meant to be repeated.

Returns:
A string representing the instruction description.
@@ -2187,11 +2187,12 @@ def build_description(self, prompt_to_repeat=None, n_start=None, n_end=None):
"""Build the instruction description.

Args:
n_start: An integer representing the start index of the span.
n_end: An integer representing the end index of the span.
prompt_to_repeat: The prompt that is meant to be repeated.
n_start: An integer representing the start index of the span.
n_end: An integer representing the end index of the span.

Returns:
A string representing the instruction description.
A string representing the instruction description.
"""
if not prompt_to_repeat:
raise ValueError("prompt_to_repeat must be set.")
1 change: 1 addition & 0 deletions src/lighteval/tasks/extended/lcb/main.py
@@ -113,6 +113,7 @@ def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
higher_is_better=True,
sample_level_fn=codegen_metric,
corpus_level_fn=np.mean,
batched_compute=False,
)


18 changes: 18 additions & 0 deletions tests/unit/metrics/pytest.ini
@@ -0,0 +1,18 @@
[tool:pytest]
testpaths = .
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts =
-v
--tb=short
--strict-markers
--disable-warnings
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
unit: marks tests as unit tests
integration: marks tests as integration tests
automated: marks tests as automated metric tests
filterwarnings =
ignore::DeprecationWarning
ignore::PendingDeprecationWarning
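Because `--strict-markers` is set, only the markers declared above may be used; a test opts in like this (minimal example, test name is hypothetical):

```python
import pytest

@pytest.mark.automated
@pytest.mark.slow
def test_expensive_metric_suite():
    # Deselect at the command line with: pytest -m "not slow"
    assert True
```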
104 changes: 104 additions & 0 deletions tests/unit/metrics/test_automated_metrics_pytest.py
@@ -0,0 +1,104 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Pytest integration for the automated metric testing framework.

This module provides pytest fixtures and test functions that can load and run
test cases from JSON files.
"""

import json
from pathlib import Path
from typing import List

import pytest
from test_metrics_automated import AutomatedMetricTester, MetricTestSuite


@pytest.fixture
def metric_tester():
"""Fixture providing an AutomatedMetricTester instance."""
return AutomatedMetricTester()


def load_test_suite_from_file(file_path: str) -> MetricTestSuite:
"""Load a test suite from a JSON file."""
with open(file_path, "r") as f:
data = json.load(f)
return MetricTestSuite(**data)


def get_test_suite_files() -> List[str]:
"""Get all test suite JSON files from the test_cases directory."""
test_cases_dir = Path(__file__).parent / "test_cases"
if not test_cases_dir.exists():
return []

json_files = list(test_cases_dir.glob("*.json"))
return [str(f) for f in json_files]


def parametrize_test_suites():
"""Create parametrized test cases for all test suite files."""
test_files = get_test_suite_files()
if not test_files:
pytest.skip("No test suite files found")

return test_files


class TestAutomatedMetrics:
"""Test class for automated metric testing with pytest."""

@pytest.mark.parametrize("test_file", parametrize_test_suites())
def test_metric_suite(self, metric_tester, test_file):
"""Test a complete metric test suite from a JSON file."""
test_suite = load_test_suite_from_file(test_file)

# Run all test cases in the suite
results = metric_tester.run_test_suite(test_suite)

# Separate failed tests from skipped tests
failed_tests = [r for r in results if not r["success"] and not r.get("skipped", False)]
skipped_tests = [r for r in results if r.get("skipped", False)]

if failed_tests:
# Create detailed error message
error_msg = f"Test suite '{test_suite.name}' failed with {len(failed_tests)} failed tests:\n"
for result in failed_tests:
error_msg += f"\n - {result['test_case']}: "
if result["error"]:
error_msg += f"Error: {result['error']}"
else:
error_msg += f"Expected {result['expected']}, got {result['actual']}"

pytest.fail(error_msg)

# Log skipped tests
if skipped_tests:
print(f"\nSkipped {len(skipped_tests)} tests in '{test_suite.name}':")
for result in skipped_tests:
print(f" - {result['test_case']}: {result.get('skip_reason', 'Unknown reason')}")

# All non-skipped tests passed
assert len(failed_tests) == 0, f"Expected all non-skipped tests to pass, but {len(failed_tests)} failed"
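To run only this suite (for example while iterating on a new JSON test case), pytest's Python entry point also works; the path assumes the layout added in this PR:

```python
import pytest

# Run just the automated metric suite with verbose output.
if __name__ == "__main__":
    raise SystemExit(pytest.main(["-v", "tests/unit/metrics/test_automated_metrics_pytest.py"]))
```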