Merged
34 commits
d8cfc2e
Fix Sampling Metrics and Evals
NathanHB Aug 27, 2025
7ae5da5
remove breakpoint
NathanHB Aug 27, 2025
a00f3c0
add auto tests for metrics
NathanHB Aug 27, 2025
a892260
Merge branch 'main' into nathan-add-tests-for-metrics
NathanHB Aug 27, 2025
bf25211
Delete tests/unit/metrics/test_cases/README.md
NathanHB Aug 27, 2025
2b65d08
Delete tests/unit/metrics/test_unit_harness_metrics.py
NathanHB Aug 27, 2025
594b942
add pip as test dependency, for spacy to work correctly
NathanHB Aug 27, 2025
6db8263
Merge branch 'nathan-add-tests-for-metrics' of github.com:huggingface…
NathanHB Aug 27, 2025
9f7c2be
fix tests and reorg files
NathanHB Aug 28, 2025
e1a55ac
fix tests and reorg files
NathanHB Aug 28, 2025
c9e7243
better tests, passing
NathanHB Sep 1, 2025
e493b49
Merge remote-tracking branch 'origin/main' into nathan-add-tests-for-…
NathanHB Sep 1, 2025
5f323b7
Merge remote-tracking branch 'origin/main' into nathan-add-tests-for-…
NathanHB Sep 1, 2025
3d7b448
fix tests
NathanHB Sep 1, 2025
0c4a554
fix faithfulness metric
NathanHB Sep 2, 2025
594c269
adds corpus level metric testing
NathanHB Sep 3, 2025
fc01e6b
fix bleu metric
NathanHB Sep 3, 2025
c574035
fix bleu metric
NathanHB Sep 3, 2025
e127955
Merge branch 'main' into nathan-add-tests-for-metrics
NathanHB Sep 8, 2025
51db828
fix tests after merge
NathanHB Sep 8, 2025
70a5a10
Delete tests/slow_tests/test_sglang_model.py
NathanHB Sep 8, 2025
6384835
test simpleqa judge
NathanHB Sep 8, 2025
3c9aec6
Merge branch 'nathan-add-tests-for-metrics' of github.com:huggingface…
NathanHB Sep 8, 2025
b5b82a8
fix avg at k
NathanHB Sep 9, 2025
bf740a3
remove test files from git lfs cache
NathanHB Sep 15, 2025
ef216dc
re-add test-files to actual repo
NathanHB Sep 15, 2025
f903ee0
use SKIPPED_METRIC list instead of hardcoding all metric names
NathanHB Sep 15, 2025
86892e9
Merge remote-tracking branch 'origin/main' into nathan-add-tests-for-…
NathanHB Sep 15, 2025
23e9714
Update tests/unit/metrics/test_metrics_automated.py
NathanHB Sep 16, 2025
048b407
fix tests
NathanHB Sep 16, 2025
c4aebce
remove breakpoint
NathanHB Sep 16, 2025
432345e
remove breakpoint
NathanHB Sep 16, 2025
dab1dae
Merge branch 'main' into nathan-add-tests-for-metrics
NathanHB Sep 16, 2025
fd27034
fix quality
NathanHB Sep 16, 2025
1 change: 1 addition & 0 deletions .gitattributes
@@ -1 +1,2 @@
*.json filter=lfs diff=lfs merge=lfs -text
tests/unit/metrics/test_cases/*.json -filter -diff -merge text
Member Author (NathanHB):
do not use git-lfs for json files in the test_cases dir

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -99,7 +99,7 @@ nanotron = [
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
quality = ["ruff>=v0.11.0","pre-commit"]
tests = ["pytest>=7.4.0","deepdiff"]
tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
1 change: 0 additions & 1 deletion src/lighteval/metrics/imports/summac.py
@@ -221,7 +221,6 @@ def build_image(self, original, generated):
truncation=True,
max_length=self.max_input_length,
return_tensors="pt",
truncation_strategy="only_first",
)
batch_tokens = {k: v.to(self.device) for k, v in batch_tokens.items()}
with torch.no_grad():
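For context on the removed kwarg: `truncation_strategy` is the legacy tokenizer argument, and with current `transformers` tokenizers the `truncation` parameter already covers pair truncation. A minimal sketch, not the project's code (model name is only an example), in case only-first truncation is ever wanted again:

```python
from transformers import AutoTokenizer

# Illustrative model; any fast tokenizer accepts the same arguments here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch_tokens = tokenizer(
    ["original document chunk"],
    ["generated summary chunk"],
    padding=True,
    truncation="only_first",  # truncate only the first sequence of each pair
    max_length=512,
    return_tensors="pt",
)
```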
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
@@ -390,7 +390,7 @@ class Metrics(Enum):
metric_name="mf1",
sample_level_fn=LoglikelihoodPreparator(is_single_token=True),
category=SamplingMethod.LOGPROBS,
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
corpus_level_fn=CorpusLevelF1Score(average="micro", num_classes=3),
higher_is_better=True,
)
pass_at_k = SampleLevelMetric(
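For context on the `average=None` → `average="micro"` switch: in scikit-learn, `average=None` returns one F1 per class, while `"micro"` pools true/false positives across classes into a single score. A quick illustration with made-up labels (not lighteval code):

```python
from sklearn.metrics import f1_score

golds = [0, 1, 2, 2, 1, 0]  # made-up 3-class labels
preds = [0, 2, 2, 2, 1, 1]

f1_score(golds, preds, average=None)     # per-class F1 scores as an array
f1_score(golds, preds, average="macro")  # unweighted mean of per-class F1
f1_score(golds, preds, average="micro")  # single F1 from globally pooled TP/FP/FN
```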
18 changes: 16 additions & 2 deletions src/lighteval/metrics/metrics_corpus.py
@@ -105,7 +105,11 @@ def compute_corpus(self, items: list[LogprobCorpusMetricInput]):
# Multi f1
f1s = []
for i in range(self.num_classes):
f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
f1s.append(
sklearn.metrics.f1_score(
y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average
)
)
return float(np.mean(f1s))


@@ -122,6 +126,9 @@ def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""):

def get_metric(self):
if self.metric_type == "bleu":
import nltk

nltk.download("punkt_tab")
return sacrebleu.BLEU(trg_lang=self.lang)
elif self.metric_type == "chrf":
return sacrebleu.CHRF()
@@ -144,7 +151,14 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{metric.__name__})."
)
preds.append(pred[0])
return float(metric.corpus_score(hypotheses=preds, references=golds).score)

if self.metric_type == "bleu":
golds = [[gold[0] for gold in golds]]

corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
score = corpus_score.score
results = float(score)
return results


class CorpusLevelPerplexityMetric(CorpusLevelComputation):
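The BLEU-specific reshaping above matches how sacrebleu consumes references: `corpus_score` takes a flat list of hypotheses plus a list of reference streams, each stream aligned with the hypotheses. A minimal sketch with illustrative strings (not lighteval code):

```python
import sacrebleu

hypotheses = ["the cat sat on the mat", "hello world"]
# One reference *stream*: references[0][i] is the reference for hypotheses[i].
references = [["the cat is on the mat", "hello there world"]]

bleu = sacrebleu.BLEU()
print(bleu.corpus_score(hypotheses=hypotheses, references=references).score)
```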
34 changes: 20 additions & 14 deletions src/lighteval/metrics/metrics_sample.py
@@ -823,6 +823,9 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
Returns:
float: Score over the current sample's items.
"""
import nltk

nltk.download("punkt_tab")
golds = doc.get_golds()
predictions = model_response.final_text
return np.mean([self._bleu_score(golds, p) for p in predictions])
@@ -1122,6 +1125,7 @@ def __init__(
raise ValueError(f"Unknown normalization function: {normalize}")
else:
self.normalize = normalize

self.strip_strings = strip_strings

if callable(sample_scoring_function):
@@ -1141,6 +1145,7 @@ def __init__(
else:
self.type_exact_match = "full"
self.compute_score = self.default_sample_scoring
self.score_sample = self.default_sample_scoring

def preprocess(self, text: str) -> str:
if not text:
@@ -1194,7 +1199,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
"""
all_scores = []
for i in range(self.k):
all_scores.append(self.compute_score(doc, model_response[i]))
all_scores.append(self.score_sample(doc, model_response[i]))

avg_score = np.mean(all_scores)
return avg_score
@@ -1221,30 +1226,31 @@ def __init__(self, k: int | None = None, **kwargs):
self.k = k
self.attribute_must_be_set = ["k"]

def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
"""Computes the metric over a list of golds and predictions for one single sample.
It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
then compares it to the gold.
It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold.

Args:
doc (Doc): The document containing gold references.
model_response (ModelResponse): The model's response containing predictions.
docs (Doc): The document containing gold references.
**kwargs: Additional keyword arguments.

Returns:
float: Aggregated score over the current sample's items.
"""
if self.k is None:
raise Exception("You did not set the value of k")
golds = docs.get_golds()

golds = doc.get_golds()

if len(golds) > 1:
raise Exception("Cannot compute maj@k with several golds")

processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
new_doc = Doc(
choices=processed_choices,
query=docs.query,
gold_index=docs.gold_index,
query=doc.query,
gold_index=list(range(len(processed_choices))),
)
all_answers = []
for pred in model_response.final_text[: self.k]:
@@ -1253,7 +1259,7 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
new_model_response = ModelResponse(
text=[majority_prediction],
)
return self.compute_score(new_model_response, new_doc)
return self.compute_score(new_doc, new_model_response)

def num_samples(self):
return self.k
Expand Down Expand Up @@ -1433,8 +1439,8 @@ def compute_mg_pass_at_k(n, c, k):
metrics = {}
for k in ks:
for t in thresholds:
metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)
metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k)

return metrics

@@ -1446,8 +1452,8 @@ def metric_names(self):
metrics = []
for k in ks:
for t in thresholds:
metrics.append(f"{self.name}@{k}_{t}")
metrics.append(f"m{self.name}@{k}")
metrics.append(f"{self.name}{k}_{t}")
metrics.append(f"m{self.name}{k}")

return metrics

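The maj@k changes keep the same core idea: normalise the first k predictions, keep the most frequent answer, and score that single answer against the gold. A standalone sketch of the majority-vote step (not the lighteval helper itself):

```python
from collections import Counter

def majority_vote(predictions: list[str]) -> str:
    # Most frequent answer; ties resolve to the first answer seen.
    return Counter(predictions).most_common(1)[0][0]

majority_vote(["42", "41", "42"])  # -> "42"
```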
1 change: 0 additions & 1 deletion src/lighteval/metrics/utils/metric_utils.py
@@ -50,7 +50,6 @@ def compute_sample(
elif isinstance(self.sample_level_fn, Preparator):
sample_level_fn = self.sample_level_fn.prepare
else:
breakpoint()
raise ValueError(
f"Incorrect type for {self.sample_level_fn}, should be a SampleLevelComputation or Preparator"
)
2 changes: 1 addition & 1 deletion src/lighteval/models/model_output.py
@@ -149,7 +149,7 @@ def __getitem__(self, index: int) -> "ModelResponse":
input=self.input,
input_tokens=self.input_tokens,
text=[self.text[index]],
output_tokens=[self.output_tokens[index]],
output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
logprobs=[self.logprobs[index]] if self.logprobs else [],
argmax_logits_eq_gold=[self.argmax_logits_eq_gold[index]] if self.argmax_logits_eq_gold else [],
logits=[self.logits[index]] if self.logits else None,
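This one-line change guards indexing against responses that never populated `output_tokens`. The pattern, shown on a simplified stand-in dataclass (not the actual `ModelResponse`):

```python
from dataclasses import dataclass, field

@dataclass
class Response:  # hypothetical stand-in, not the real ModelResponse
    text: list[str]
    output_tokens: list[list[int]] = field(default_factory=list)

    def __getitem__(self, index: int) -> "Response":
        return Response(
            text=[self.text[index]],
            # Avoid an IndexError when output_tokens was never populated.
            output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
        )
```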
11 changes: 6 additions & 5 deletions src/lighteval/tasks/extended/ifbench/instructions.py
@@ -142,7 +142,7 @@ def build_description(self, *, N=None):
"""Build the instruction description.

Args:
n: An integer specifying the number of unique words contained in the response.
N: An integer specifying the number of unique words contained in the response.

Returns:
A string representing the instruction description.
@@ -2113,7 +2113,7 @@ def build_description(self, *, prompt_to_repeat=None):
"""Build the instruction description.

Args:
keyword: A string representing a keyword that is expected in the response.
prompt_to_repeat: The prompt that is meant to be repeated.

Returns:
A string representing the instruction description.
@@ -2187,11 +2187,12 @@ def build_description(self, prompt_to_repeat=None, n_start=None, n_end=None):
"""Build the instruction description.

Args:
n_start: An integer representing the start index of the span.
n_end: An integer representing the end index of the span.
prompt_to_repeat: The prompt that is meant to be repeated.
n_start: An integer representing the start index of the span.
n_end: An integer representing the end index of the span.

Returns:
A string representing the instruction description.
A string representing the instruction description.
"""
if not prompt_to_repeat:
raise ValueError("prompt_to_repeat must be set.")
1 change: 1 addition & 0 deletions src/lighteval/tasks/extended/lcb/main.py
@@ -113,6 +113,7 @@ def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
higher_is_better=True,
sample_level_fn=codegen_metric,
corpus_level_fn=np.mean,
batched_compute=False,
)


18 changes: 18 additions & 0 deletions tests/unit/metrics/pytest.ini
@@ -0,0 +1,18 @@
[tool:pytest]
testpaths = .
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts =
-v
--tb=short
--strict-markers
--disable-warnings
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
unit: marks tests as unit tests
integration: marks tests as integration tests
automated: marks tests as automated metric tests
filterwarnings =
ignore::DeprecationWarning
ignore::PendingDeprecationWarning
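Because `--strict-markers` is set, only the markers declared above may be used; a test opts in like this (minimal example, test name is hypothetical):

```python
import pytest

@pytest.mark.automated
@pytest.mark.slow
def test_expensive_metric_suite():
    # Deselect at the command line with: pytest -m "not slow"
    assert True
```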
104 changes: 104 additions & 0 deletions tests/unit/metrics/test_automated_metrics_pytest.py
@@ -0,0 +1,104 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Pytest integration for the automated metric testing framework.

This module provides pytest fixtures and test functions that can load and run
test cases from JSON files.
"""

import json
from pathlib import Path
from typing import List

import pytest
from test_metrics_automated import AutomatedMetricTester, MetricTestSuite


@pytest.fixture
def metric_tester():
"""Fixture providing an AutomatedMetricTester instance."""
return AutomatedMetricTester()


def load_test_suite_from_file(file_path: str) -> MetricTestSuite:
"""Load a test suite from a JSON file."""
with open(file_path, "r") as f:
data = json.load(f)
return MetricTestSuite(**data)


def get_test_suite_files() -> List[str]:
"""Get all test suite JSON files from the test_cases directory."""
test_cases_dir = Path(__file__).parent / "test_cases"
if not test_cases_dir.exists():
return []

json_files = list(test_cases_dir.glob("*.json"))
return [str(f) for f in json_files]


def parametrize_test_suites():
"""Create parametrized test cases for all test suite files."""
test_files = get_test_suite_files()
if not test_files:
pytest.skip("No test suite files found")

return test_files


class TestAutomatedMetrics:
"""Test class for automated metric testing with pytest."""

@pytest.mark.parametrize("test_file", parametrize_test_suites())
def test_metric_suite(self, metric_tester, test_file):
"""Test a complete metric test suite from a JSON file."""
test_suite = load_test_suite_from_file(test_file)

# Run all test cases in the suite
results = metric_tester.run_test_suite(test_suite)

# Separate failed tests from skipped tests
failed_tests = [r for r in results if not r["success"] and not r.get("skipped", False)]
skipped_tests = [r for r in results if r.get("skipped", False)]

if failed_tests:
# Create detailed error message
error_msg = f"Test suite '{test_suite.name}' failed with {len(failed_tests)} failed tests:\n"
for result in failed_tests:
error_msg += f"\n - {result['test_case']}: "
if result["error"]:
error_msg += f"Error: {result['error']}"
else:
error_msg += f"Expected {result['expected']}, got {result['actual']}"

pytest.fail(error_msg)

# Log skipped tests
if skipped_tests:
print(f"\nSkipped {len(skipped_tests)} tests in '{test_suite.name}':")
for result in skipped_tests:
print(f" - {result['test_case']}: {result.get('skip_reason', 'Unknown reason')}")

# All non-skipped tests passed
assert len(failed_tests) == 0, f"Expected all non-skipped tests to pass, but {len(failed_tests)} failed"
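To run only this suite (for example while iterating on a new JSON test case), pytest's Python entry point also works; the path assumes the layout added in this PR:

```python
import pytest

# Run just the automated metric suite with verbose output.
if __name__ == "__main__":
    raise SystemExit(pytest.main(["-v", "tests/unit/metrics/test_automated_metrics_pytest.py"]))
```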