
Commit 1dac455

Merge pull request #2 from huggingface/clem_doc_readme
Doc metrics + README
2 parents: 3295b4f + eff1f9d · commit 1dac455

14 files changed: +400, -919 lines changed

README.md

Lines changed: 50 additions & 49 deletions
Large diffs are not rendered by default.

src/lighteval/metrics/metrics.py

Lines changed: 2 additions & 2 deletions
@@ -13,6 +13,7 @@
 from lighteval.metrics.metrics_sample import (
     BLEU,
     BLEURT,
+    MRR,
     ROUGE,
     BertScore,
     ExactMatches,
@@ -23,7 +24,6 @@
     acc_golds_likelihood,
     extractiveness,
     faithfulness,
-    mrr,
 )
 from lighteval.metrics.normalizations import (
     bigbench_normalizer,
@@ -277,7 +277,7 @@ class Metrics(Enum):
     )
     mrr = SampleLevelMetric(
         metric="mrr",
-        sample_level_fn=mrr,
+        sample_level_fn=MRR().compute,
         category=MetricCategory.MULTICHOICE,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,
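Note on the hunk above: `mrr` moves from a standalone function to the `MRR` class, whose `compute` method is plugged in as the sample-level function, while the corpus-level aggregation stays a plain `np.mean`. As a rough standalone sketch of what a mean-reciprocal-rank sample metric computes (the function name and its `choices_logprobs`/`gold_index` arguments are illustrative assumptions, not lighteval's actual `MRR.compute` signature):

```python
import numpy as np


def mrr_sample(choices_logprobs: list[float], gold_index: int) -> float:
    """Reciprocal rank of the gold choice once choices are sorted by log-probability.

    Hypothetical stand-in: the signature is an assumption, not lighteval's MRR.compute.
    """
    # Sort choice indices from most to least likely.
    ranking = np.argsort(choices_logprobs)[::-1]
    # 1-based rank of the gold choice, then its reciprocal.
    rank = int(np.where(ranking == gold_index)[0][0]) + 1
    return 1.0 / rank


# Corpus-level aggregation is a plain mean, matching corpus_level_fn=np.mean above.
scores = [
    mrr_sample([-1.2, -0.3, -2.5], gold_index=1),  # gold ranked 1st -> 1.0
    mrr_sample([-0.9, -1.1, -0.2], gold_index=0),  # gold ranked 2nd -> 0.5
]
print(np.mean(scores))  # 0.75
```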

src/lighteval/metrics/metrics_corpus.py

Lines changed: 38 additions & 9 deletions
@@ -1,4 +1,4 @@
-"""This module manages all the score aggregations and computations occurring at the corpus level.
+"""This module manages all the metrics occurring at the corpus level.
 Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
 A number of these aggregations come from the EleutherAIHarness
 """
@@ -10,6 +10,7 @@
 
 from lighteval.metrics.sample_preparator import (
     GenerativeCorpusMetricInput,
+    LogprobCorpusMetricInput,
     PerplexityCorpusMetricInput,
 )
 from lighteval.utils import as_list
@@ -20,7 +21,7 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:
     """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)).
 
     Args:
-        items (list[dict]): List of the correctly formatted dictionarinput
+        items (list[dict]): List of GenerativeCorpusMetricInput
 
     Returns:
         float: Score
@@ -32,13 +33,23 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:
 
 class CorpusLevelF1Score:
     def __init__(self, average: str, num_classes: int = 2):
-        # If num_classes > 2, we compute multi_f1_corpus_aggregation
-        self.average = average  # weighted, macro, micro
+        """Stores the relevant parameters for the task's corpus level f1 score.
+
+        Args:
+            average (str): Method to use to compute the f1 score. Can be weighted, macro, micro.
+            num_classes (int, optional): Num of possible choice classes. Defaults to 2. If this parameter is above 2, we'll compute multi f1 corpus score
+        """
+        if average not in ["weighted", "macro", "micro", None]:
+            raise ValueError(
+                f"A CorpusLevelF1Score must be initialized with weighted, macro, micro, or None as an average function. {average} was used."
+            )
+        self.average = average
         self.num_classes = num_classes
 
-    def compute(self, items):
-        golds = [i["golds"] for i in items]
-        preds = [i["preds"] for i in items]
+    def compute(self, items: list[LogprobCorpusMetricInput]):
+        """Computes the metric score over all the corpus generated items, by using the scikit learn implementation."""
+        golds = [i.golds for i in items]
+        preds = [i.preds for i in items]
         # Single f1
         if self.num_classes == 2:
             fscore = sklearn.metrics.f1_score(golds, preds, average=self.average)
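The new docstring spells out the two paths of `compute`: the single-F1 branch above and the multi-class branch in the next hunk. A minimal standalone sketch of both, using `SimpleNamespace` stand-ins for the `LogprobCorpusMetricInput` items, since only the `.golds`/`.preds` attributes read by `compute` are assumed here:

```python
from types import SimpleNamespace

import numpy as np
import sklearn.metrics

# Stand-ins for LogprobCorpusMetricInput items: only the .golds/.preds
# attributes that compute() reads are assumed.
items = [SimpleNamespace(golds=g, preds=p) for g, p in [(0, 0), (1, 1), (1, 0), (0, 0)]]
golds = np.asarray([i.golds for i in items])
preds = np.asarray([i.preds for i in items])

# num_classes == 2: a single sklearn F1 with the chosen average ("weighted", "macro" or "micro").
print(sklearn.metrics.f1_score(golds, preds, average="macro"))

# num_classes > 2 (the "multi f1 corpus score" of the docstring): one-vs-rest F1 per class, then the mean.
num_classes = 2  # would be larger in the multi-class case; the aggregation has the same shape
f1s = [sklearn.metrics.f1_score(y_true=golds == c, y_pred=preds == c) for c in range(num_classes)]
print(float(np.mean(f1s)))
```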
@@ -48,11 +59,16 @@ def compute(self, items):
         f1s = []
         for i in range(self.num_classes):
             f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
-        return np.mean(f1s)
+        return float(np.mean(f1s))
 
 
 class CorpusLevelTranslationMetric:
     def __init__(self, metric_type: str):
+        """Stores the relevant parameters for a corpus level translation metric.
+
+        Args:
+            metric_type (str): Can be any of bleu, chrf, or ter depending on the metric to use.
+        """
+        """
         if metric_type == "bleu":
             self.metric = sacrebleu.corpus_bleu
         elif metric_type == "chrf":
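The translation metric simply dispatches to sacrebleu's corpus-level scorers (`corpus_bleu` and `corpus_chrf` are visible here; the remaining branches and the `compute` call follow in the next hunk). A minimal standalone sketch of what those sacrebleu calls look like for a single-reference corpus; the example sentences are made up, and the single-reference-stream layout is an assumption about how golds and preds arrive:

```python
import sacrebleu

# Made-up hypotheses and references, one reference per hypothesis.
hyps = ["the cat sat on the mat", "a quick brown fox"]
refs = ["the cat sat on the mat", "the quick brown fox"]

# sacrebleu's corpus scorers take the hypotheses plus a list of reference
# streams; a single stream here, aligned with the hypotheses.
bleu = sacrebleu.corpus_bleu(hypotheses=hyps, references=[refs])
chrf = sacrebleu.corpus_chrf(hypotheses=hyps, references=[refs])

# compute() returns the .score attribute of whichever scorer was selected.
print(float(bleu.score), float(chrf.score))
```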
@@ -63,19 +79,32 @@ def __init__(self, metric_type: str):
             raise ValueError(f"Unknown corpus level translation metric type : {metric_type}")
 
     def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
+        """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
         golds = [i.golds for i in items]
         preds = [as_list(i.preds) for i in items]
-        return self.metric(hypotheses=preds, references=golds).score
+        return float(self.metric(hypotheses=preds, references=golds).score)
 
 
 class CorpusLevelPerplexityMetric:
     def __init__(self, metric_type: str):
+        """Stores the relevant parameter for a corpus level perplexity metric.
+        Perplexity metrics compute more or less the same thing, which is a variation on the
+        average of log-probabilities over a sequence, but the normalization and processing applied
+        is different depending on the metric type.
+        Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
+        and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
+        for normalization and divides the results by log(2).
+
+        Args:
+            metric_type (str): Can be any of `perplexity`, `weighted_perplexity` or `bits_per_byte`
+        """
         if metric_type not in ["perplexity", "weighted_perplexity", "bits_per_byte"]:
             raise ValueError(f"Unknown corpus level perplexity metric type : {metric_type}")
 
         self.metric_type = metric_type
 
     def compute(self, items: list[PerplexityCorpusMetricInput]):
+        """Computes the metric score over all the corpus generated items."""
         logprobs = [i.logprobs for i in items]
         weights = [i.weights for i in items]
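The new docstring describes three aggregation flavours, while the hunk ends before the branching on `self.metric_type`. As a standalone sketch of what those descriptions amount to, the formulas below follow the docstring (exponentiated unweighted mean, exponentiated length-weighted mean, and a division by log(2) for bits per byte); they are an illustration under those assumptions, not a copy of lighteval's implementation:

```python
import math

import numpy as np


def aggregate_perplexity(logprobs: list[float], weights: list[int], metric_type: str) -> float:
    """Sketch of the three corpus-level aggregations described in the docstring.

    logprobs: one summed log-probability per document; weights: the matching
    word/byte counts (names mirror the attributes read by compute()).
    """
    if metric_type == "perplexity":
        # Exponential of the unweighted average of log-probs.
        return math.exp(-np.mean(logprobs))
    if metric_type == "weighted_perplexity":
        # Exponential of the log-prob average weighted by sequence length.
        return math.exp(-np.sum(logprobs) / np.sum(weights))
    if metric_type == "bits_per_byte":
        # Length-normalised negative log-prob, converted from nats to bits.
        return -np.sum(logprobs) / (np.sum(weights) * math.log(2))
    raise ValueError(f"Unknown corpus level perplexity metric type : {metric_type}")


# e.g. two documents with summed log-probs -120 and -80 over 60 + 40 tokens:
print(aggregate_perplexity([-120.0, -80.0], [60, 40], "weighted_perplexity"))  # exp(2) ~ 7.39
```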
