Commit 16318bb

NathanHB and Copilot authored
Add auto tests for metrics (#939)
- Adds a mechanism to auto-test metrics: when creating a metric, you now create a JSON file with test cases (input, output, and expected results).
- Moves unit tests to a tests/unit folder.
- Fixes broken metrics.

Co-authored-by: Copilot <[email protected]>
1 parent 3a71f68 commit 16318bb

86 files changed: +6185 −30 lines changed


.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 *.json filter=lfs diff=lfs merge=lfs -text
+tests/unit/metrics/test_cases/*.json -filter -diff -merge text
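The exact schema of these test-case JSON files is not shown in this diff, so the following is only a rough sketch of the idea described in the commit message: a stored input plus an expected result, compared with deepdiff (added to the tests extra below). The field names ("input", "expected") are hypothetical, not the schema lighteval actually uses.

    # Hypothetical sketch: load a metric test case and compare the metric's output
    # against the stored expectation. Field names are assumptions, not lighteval's schema.
    import json
    from pathlib import Path

    from deepdiff import DeepDiff


    def check_metric_case(path: Path, compute_fn) -> None:
        case = json.loads(path.read_text())
        result = compute_fn(**case["input"])  # run the metric on the stored input
        diff = DeepDiff(case["expected"], result, significant_digits=6)
        assert not diff, f"{path.name}: {diff}"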

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ nanotron = [
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
 quality = ["ruff>=v0.11.0","pre-commit"]
-tests = ["pytest>=7.4.0","deepdiff"]
+tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [

src/lighteval/metrics/imports/summac.py

Lines changed: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ def build_image(self, original, generated):
             truncation=True,
             max_length=self.max_input_length,
             return_tensors="pt",
-            truncation_strategy="only_first",
         )
         batch_tokens = {k: v.to(self.device) for k, v in batch_tokens.items()}
         with torch.no_grad():

src/lighteval/metrics/metrics.py

Lines changed: 1 addition & 1 deletion
@@ -390,7 +390,7 @@ class Metrics(Enum):
         metric_name="mf1",
         sample_level_fn=LoglikelihoodPreparator(is_single_token=True),
         category=SamplingMethod.LOGPROBS,
-        corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
+        corpus_level_fn=CorpusLevelF1Score(average="micro", num_classes=3),
         higher_is_better=True,
     )
     pass_at_k = SampleLevelMetric(

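For context on the average change above: in scikit-learn, f1_score(..., average=None) returns one score per class rather than a single number, while a string such as "micro" aggregates them into one float, which is what a corpus-level metric is expected to return. A standalone illustration (not lighteval code):

    # Standalone illustration of sklearn's `average` argument; not lighteval code.
    from sklearn.metrics import f1_score

    golds = [0, 1, 2, 1, 0]
    preds = [0, 2, 2, 1, 0]

    print(f1_score(golds, preds, average=None))     # one F1 value per class (array)
    print(f1_score(golds, preds, average="micro"))  # a single aggregated float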
src/lighteval/metrics/metrics_corpus.py

Lines changed: 16 additions & 2 deletions
@@ -105,7 +105,11 @@ def compute_corpus(self, items: list[LogprobCorpusMetricInput]):
         # Multi f1
         f1s = []
         for i in range(self.num_classes):
-            f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
+            f1s.append(
+                sklearn.metrics.f1_score(
+                    y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average
+                )
+            )
         return float(np.mean(f1s))

@@ -122,6 +126,9 @@ def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""):

     def get_metric(self):
         if self.metric_type == "bleu":
+            import nltk
+
+            nltk.download("punkt_tab")
             return sacrebleu.BLEU(trg_lang=self.lang)
         elif self.metric_type == "chrf":
             return sacrebleu.CHRF()

@@ -144,7 +151,14 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
                     f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{metric.__name__})."
                 )
             preds.append(pred[0])
-        return float(metric.corpus_score(hypotheses=preds, references=golds).score)
+
+        if self.metric_type == "bleu":
+            golds = [[gold[0] for gold in golds]]
+
+        corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
+        score = corpus_score.score
+        results = float(score)
+        return results


 class CorpusLevelPerplexityMetric(CorpusLevelComputation):

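A note on the BLEU-specific reshaping above: sacrebleu's corpus_score expects references as a list of reference streams, where each stream holds one reference per hypothesis, so the per-sample golds are transposed into a single stream before scoring. A standalone sketch of that calling convention (not lighteval code):

    # Standalone sketch of sacrebleu's corpus-level API; not lighteval code.
    import sacrebleu

    hypotheses = ["the cat sat on the mat", "hello world"]
    # One reference *stream*: entry i is the reference for hypothesis i.
    references = [["the cat sat on the mat", "hello there world"]]

    bleu = sacrebleu.BLEU()
    print(bleu.corpus_score(hypotheses, references).score)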
src/lighteval/metrics/metrics_sample.py

Lines changed: 20 additions & 14 deletions
@@ -823,6 +823,9 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         Returns:
             float: Score over the current sample's items.
         """
+        import nltk
+
+        nltk.download("punkt_tab")
         golds = doc.get_golds()
         predictions = model_response.final_text
         return np.mean([self._bleu_score(golds, p) for p in predictions])

@@ -1122,6 +1125,7 @@ def __init__(
                 raise ValueError(f"Unknown normalization function: {normalize}")
         else:
             self.normalize = normalize
+
         self.strip_strings = strip_strings

         if callable(sample_scoring_function):

@@ -1141,6 +1145,7 @@ def __init__(
             else:
                 self.type_exact_match = "full"
             self.compute_score = self.default_sample_scoring
+            self.score_sample = self.default_sample_scoring

     def preprocess(self, text: str) -> str:
         if not text:

@@ -1194,7 +1199,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
         """
         all_scores = []
         for i in range(self.k):
-            all_scores.append(self.compute_score(doc, model_response[i]))
+            all_scores.append(self.score_sample(doc, model_response[i]))

         avg_score = np.mean(all_scores)
         return avg_score

@@ -1221,30 +1226,31 @@ def __init__(self, k: int | None = None, **kwargs):
         self.k = k
         self.attribute_must_be_set = ["k"]

-    def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         """Computes the metric over a list of golds and predictions for one single sample.
-        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
-        then compares it to the gold.
+        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold.

         Args:
+            doc (Doc): The document containing gold references.
             model_response (ModelResponse): The model's response containing predictions.
-            docs (Doc): The document containing gold references.
             **kwargs: Additional keyword arguments.

         Returns:
             float: Aggregated score over the current sample's items.
         """
         if self.k is None:
             raise Exception("You did not set the value of k")
-        golds = docs.get_golds()
+
+        golds = doc.get_golds()
+
         if len(golds) > 1:
             raise Exception("Cannot compute maj@k with several golds")

-        processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
+        processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
         new_doc = Doc(
             choices=processed_choices,
-            query=docs.query,
-            gold_index=docs.gold_index,
+            query=doc.query,
+            gold_index=list(range(len(processed_choices))),
         )
         all_answers = []
         for pred in model_response.final_text[: self.k]:

@@ -1253,7 +1259,7 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
         new_model_response = ModelResponse(
             text=[majority_prediction],
         )
-        return self.compute_score(new_model_response, new_doc)
+        return self.compute_score(new_doc, new_model_response)

     def num_samples(self):
         return self.k

@@ -1433,8 +1439,8 @@ def compute_mg_pass_at_k(n, c, k):
         metrics = {}
         for k in ks:
             for t in thresholds:
-                metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
-            metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)
+                metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
+            metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k)

         return metrics

@@ -1446,8 +1452,8 @@ def metric_names(self):
         metrics = []
         for k in ks:
             for t in thresholds:
-                metrics.append(f"{self.name}@{k}_{t}")
-            metrics.append(f"m{self.name}@{k}")
+                metrics.append(f"{self.name}{k}_{t}")
+            metrics.append(f"m{self.name}{k}")

         return metrics

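The maj@k changes above mostly swap the doc/model_response argument order and rebuild the intermediate Doc from the processed choices; the underlying idea remains a majority vote over the first k generations compared against the single gold. A simplified standalone sketch of that idea (not the lighteval implementation):

    # Simplified standalone sketch of majority voting over k generations;
    # not the lighteval implementation.
    from collections import Counter


    def maj_at_k(gold: str, predictions: list[str], k: int) -> float:
        candidates = [p.strip() for p in predictions[:k]]
        majority, _ = Counter(candidates).most_common(1)[0]
        return float(majority == gold.strip())


    print(maj_at_k("42", [" 42", "41", "42 "], k=3))  # 1.0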
src/lighteval/metrics/utils/metric_utils.py

Lines changed: 0 additions & 1 deletion
@@ -50,7 +50,6 @@ def compute_sample(
         elif isinstance(self.sample_level_fn, Preparator):
             sample_level_fn = self.sample_level_fn.prepare
         else:
-            breakpoint()
             raise ValueError(
                 f"Incorrect type for {self.sample_level_fn}, should be a SampleLevelComputation or Preparator"
             )

src/lighteval/models/model_output.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def __getitem__(self, index: int) -> "ModelResponse":
             input=self.input,
             input_tokens=self.input_tokens,
             text=[self.text[index]],
-            output_tokens=[self.output_tokens[index]],
+            output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
             logprobs=[self.logprobs[index]] if self.logprobs else [],
             argmax_logits_eq_gold=[self.argmax_logits_eq_gold[index]] if self.argmax_logits_eq_gold else [],
             logits=[self.logits[index]] if self.logits else None,

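The model_output.py fix guards the indexing the same way the neighbouring fields already do: take output_tokens[index] only when the list is populated, otherwise fall back to an empty list. A tiny standalone sketch of that pattern (values are made up):

    # Standalone sketch of the "index only if populated" guard; values are made up.
    output_tokens: list[list[int]] = []  # may be empty for some backends
    index = 0

    selected = [output_tokens[index]] if output_tokens else []
    print(selected)  # [] instead of an IndexError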
src/lighteval/tasks/extended/ifbench/instructions.py

Lines changed: 6 additions & 5 deletions
@@ -142,7 +142,7 @@ def build_description(self, *, N=None):
         """Build the instruction description.

         Args:
-          n: An integer specifying the number of unique words contained in the response.
+          N: An integer specifying the number of unique words contained in the response.

         Returns:
           A string representing the instruction description.

@@ -2113,7 +2113,7 @@ def build_description(self, *, prompt_to_repeat=None):
         """Build the instruction description.

         Args:
-          keyword: A string representing a keyword that is expected in the response.
+          prompt_to_repeat: The prompt that is meant to be repeated.

         Returns:
           A string representing the instruction description.

@@ -2187,11 +2187,12 @@ def build_description(self, prompt_to_repeat=None, n_start=None, n_end=None):
         """Build the instruction description.

         Args:
-          n_start: An integer representing the start index of the span.
-          n_end: An integer representing the end index of the span.
+          prompt_to_repeat: The prompt that is meant to be repeated.
+          n_start: An integer representing the start index of the span.
+          n_end: An integer representing the end index of the span.

         Returns:
-            A string representing the instruction description.
+          A string representing the instruction description.
         """
         if not prompt_to_repeat:
             raise ValueError("prompt_to_repeat must be set.")

src/lighteval/tasks/extended/lcb/main.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
     higher_is_better=True,
     sample_level_fn=codegen_metric,
     corpus_level_fn=np.mean,
+    batched_compute=False,
 )