@@ -1,4 +1,4 @@
-"""This module manages all the score aggregations and computations occurring at the corpus level.
+"""This module manages all the metrics occurring at the corpus level.
 Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
 A number of these aggregations come from the EleutherAIHarness
 """
@@ -10,6 +10,7 @@

 from lighteval.metrics.sample_preparator import (
     GenerativeCorpusMetricInput,
+    LogprobCorpusMetricInput,
     PerplexityCorpusMetricInput,
 )
 from lighteval.utils import as_list
@@ -20,7 +21,7 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:
     """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)).

     Args:
-        items (list[dict]): List of the correctly formatted dictionarinput
+        items (list[dict]): List of GenerativeCorpusMetricInput

     Returns:
         float: Score
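A quick illustration of what this corpus-level aggregation computes (a minimal sketch, not the function's actual body): gold and predicted labels are pooled over the whole corpus and scored once with scikit-learn. The label values below are hypothetical.

import sklearn.metrics

# Hypothetical pooled corpus labels; in practice these come from the items' golds/preds.
golds = [1, 0, 1, 1, 0]
preds = [1, 0, 0, 1, 0]

# Matthews Correlation Coefficient over the whole corpus, a value in [-1, 1].
score = sklearn.metrics.matthews_corrcoef(golds, preds)
print(score)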
@@ -32,13 +33,23 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:

 class CorpusLevelF1Score:
     def __init__(self, average: str, num_classes: int = 2):
-        # If num_classes > 2, we compute multi_f1_corpus_aggregation
-        self.average = average  # weighted, macro, micro
+        """Stores the relevant parameters for the task's corpus level f1 score.
+
+        Args:
+            average (str): Method to use to compute the f1 score. Can be weighted, macro, micro.
+            num_classes (int, optional): Num of possible choice classes. Defaults to 2. If this parameter is above 2, we'll compute multi f1 corpus score
+        """
+        if average not in ["weighted", "macro", "micro", None]:
+            raise ValueError(
+                f"A CorpusLevelF1Score must be initialized with weighted, macro, micro, or None as an average function. {average} was used."
+            )
+        self.average = average
         self.num_classes = num_classes

-    def compute(self, items):
-        golds = [i["golds"] for i in items]
-        preds = [i["preds"] for i in items]
+    def compute(self, items: list[LogprobCorpusMetricInput]):
+        """Computes the metric score over all the corpus generated items, by using the scikit learn implementation."""
+        golds = [i.golds for i in items]
+        preds = [i.preds for i in items]
         # Single f1
         if self.num_classes == 2:
             fscore = sklearn.metrics.f1_score(golds, preds, average=self.average)
@@ -48,11 +59,16 @@ def compute(self, items):
         f1s = []
         for i in range(self.num_classes):
             f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
-        return np.mean(f1s)
+        return float(np.mean(f1s))


 class CorpusLevelTranslationMetric:
     def __init__(self, metric_type: str):
+        """Stores the relevant parameters for a corpus level translation metric.
+
+        Args:
+            metric_type (str): Can be any of bleu, chrf, or ter depending on the metric to use.
+        """
         if metric_type == "bleu":
             self.metric = sacrebleu.corpus_bleu
         elif metric_type == "chrf":
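The multi-class branch of CorpusLevelF1Score.compute shown above averages one binary F1 per class (class i vs. the rest). A minimal sketch of that aggregation, assuming golds and preds are numpy arrays of class indices; the sample values are made up.

import numpy as np
import sklearn.metrics

num_classes = 3
golds = np.array([0, 1, 2, 2, 1, 0])  # hypothetical gold class indices
preds = np.array([0, 2, 2, 2, 1, 0])  # hypothetical predicted class indices

# One binary F1 per class (class i vs. rest), then the unweighted mean,
# mirroring the loop in the diff above.
f1s = [
    sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i)
    for i in range(num_classes)
]
print(float(np.mean(f1s)))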
@@ -63,19 +79,32 @@ def __init__(self, metric_type: str):
             raise ValueError(f"Unknown corpus level translation metric type: {metric_type}")

     def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
+        """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
         golds = [i.golds for i in items]
         preds = [as_list(i.preds) for i in items]
-        return self.metric(hypotheses=preds, references=golds).score
+        return float(self.metric(hypotheses=preds, references=golds).score)


 class CorpusLevelPerplexityMetric:
     def __init__(self, metric_type: str):
+        """Stores the relevant parameter for a corpus level perplexity metric.
+        Perplexity metrics compute more or less the same thing, which is a variation on the
+        average of log-probabilities over a sequence, but the normalization and processing applied
+        is different depending on the metric type.
+        Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
+        and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
+        for normalization and divides the results by log(2).
+
+        Args:
+            metric_type (str): Can be any of `perplexity`, `weighted_perplexity` or `bits_per_byte`
+        """
         if metric_type not in ["perplexity", "weighted_perplexity", "bits_per_byte"]:
             raise ValueError(f"Unknown corpus level perplexity metric type: {metric_type}")

         self.metric_type = metric_type

     def compute(self, items: list[PerplexityCorpusMetricInput]):
+        """Computes the metric score over all the corpus generated items."""
         logprobs = [i.logprobs for i in items]
         weights = [i.weights for i in items]
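The rest of the compute body is cut off by the diff, but the new docstring spells out the three normalizations. As a hedged sketch of what it describes (illustrative only; the function and variable names below are assumptions, not the module's API): perplexity exponentiates the unweighted average negative log-probability, weighted perplexity divides by the summed weights (e.g. word counts) before exponentiating, and bits per byte divides the summed negative log-probabilities by the byte count times log(2).

import math

def corpus_perplexity_sketch(metric_type: str, logprobs: list[float], weights: list[int]) -> float:
    # logprobs: summed log-probability per item; weights: per-item normalizer
    # (word or byte count), matching the fields gathered from the items above.
    if metric_type == "perplexity":
        # exponential of the unweighted average negative log-probability
        return math.exp(-sum(logprobs) / len(logprobs))
    if metric_type == "weighted_perplexity":
        # exponential of the log-prob average weighted by e.g. word counts
        return math.exp(-sum(logprobs) / sum(weights))
    if metric_type == "bits_per_byte":
        # normalize by the number of bytes and convert from nats to bits
        return -sum(logprobs) / sum(weights) / math.log(2)
    raise ValueError(f"Unknown corpus level perplexity metric type: {metric_type}")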