
Commit aaf7e8a

clefourrier, hynky1999, and NathanHB authored

Add metrics as functions (#214)

Co-authored-by: Hynek Kydlíček <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>

1 parent 733257f commit aaf7e8a

File tree

15 files changed (+3021 / -1683 lines)


community_tasks/_template.py

Lines changed: 3 additions & 5 deletions
@@ -68,7 +68,7 @@ def prompt_fn(line, task_name: str = None):
     evaluation_splits=[],
     few_shots_split="",
     few_shots_select="",
-    metric=[""],
+    metric=[],  # select your metric in Metrics
 )

 # EVALS WITH SUBSET
@@ -91,7 +91,7 @@ def __init__(
     hf_subset=hf_subset,
     prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
     hf_repo="",
-    metric=[""],
+    metric=[custom_metric],  # select your metric in Metrics or use your custom_metric
     hf_avail_splits=[],
     evaluation_splits=[],
     few_shots_split="",
@@ -111,16 +111,14 @@ def __init__(

 # CUSTOM METRIC IF NEEDED
 custom_metric = SampleLevelMetric(
-    metric="my_custom_metric_name",
+    metric_name="my_custom_metric_name",
     higher_is_better=True,
     category=MetricCategory.IGNORED,
     use_case=MetricUseCase.NONE,
     sample_level_fn=lambda x: x,  # how to compute score for one sample
     corpus_level_fn=np.mean,  # aggregation
 )

-extend_enum(Metrics, "my_custom_metric_name", custom_metric)
-
 # MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
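
Note on the template change above: a custom metric is now handed to the task config as an object instead of being registered on the Metrics enum via extend_enum. A minimal sketch of the new flow, assuming the lighteval.metrics.utils import path for the metric classes and a placeholder dataset, prompt function, and task name that are not part of this diff:

import numpy as np

from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def prompt_fn(line, task_name: str = None):
    # Hypothetical prompt formatter; adapt the column names to your dataset.
    return Doc(task_name=task_name, query=line["question"], choices=[line["answer"]], gold_index=0)


# Custom metric, as in the template but with the renamed `metric_name` field;
# it scores one sample and aggregates sample scores with np.mean.
custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=MetricCategory.IGNORED,
    use_case=MetricUseCase.NONE,
    sample_level_fn=lambda x: x,
    corpus_level_fn=np.mean,
)

# The metric object goes straight into `metric=[...]`; no extend_enum(Metrics, ...) step.
TASK = LightevalTaskConfig(
    name="mytask",  # placeholder task name
    suite=["community"],
    prompt_function=prompt_fn,
    hf_repo="my_org/my_dataset",  # placeholder dataset repo
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    metric=[custom_metric],
)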

community_tasks/aimo_evals.py

Lines changed: 2 additions & 1 deletion
@@ -25,6 +25,7 @@
 Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
 """

+from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc

@@ -48,7 +49,7 @@ def aimo_prompt(line, task_name: str = None):
     evaluation_splits=["train"],
     few_shots_split="train",
     few_shots_select="sequential",
-    metric=["quasi_exact_match_math"],
+    metric=[Metrics.quasi_exact_match_math],
     generation_size=2048,
     stop_sequence=None,
 )
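
The same one-line migration recurs throughout the task files below: the string metric name is replaced by the matching member of the Metrics enum. A quick sketch of the before/after, assuming only that the enum member keeps the old string as its name:

from lighteval.metrics.metrics import Metrics

# before this commit: metrics were referenced by string name
old_style = ["quasi_exact_match_math"]
# after: the enum member itself is passed in the task config
new_style = [Metrics.quasi_exact_match_math]

# the member's name still matches the old string, so logged results stay readable
assert Metrics.quasi_exact_match_math.name == old_style[0]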

community_tasks/arabic_evals.py

Lines changed: 16 additions & 15 deletions
@@ -29,6 +29,7 @@
 import random
 import re

+from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
@@ -86,7 +87,7 @@ def __init__(
     hf_subset=hf_subset,
     prompt_function=mmlu_arabic,
     hf_repo="OALL/Arabic_MMLU",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     hf_avail_splits=["test", "dev"],
     evaluation_splits=["test"],
     few_shots_split="dev",
@@ -143,7 +144,7 @@ def __init__(
     hf_subset=hf_subset,
     prompt_function=acva,
     hf_repo="OALL/ACVA",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     hf_avail_splits=["test", "validation"],
     evaluation_splits=["test"],
     few_shots_split="validation",
@@ -195,7 +196,7 @@ def arabic_exams(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -245,7 +246,7 @@ def __init__(
     hf_subset=hf_subset,
     prompt_function=alghafa_prompt,
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     hf_avail_splits=["test", "validation"],
     evaluation_splits=["test"],
     few_shots_split="validation",
@@ -273,7 +274,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -290,7 +291,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -307,7 +308,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -324,7 +325,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -341,7 +342,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -358,7 +359,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -400,7 +401,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -436,7 +437,7 @@ def copa_prompt_arabic(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -481,7 +482,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -519,7 +520,7 @@ def toxigen_prompt_arabic(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -571,7 +572,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc_norm"],
+    metric=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )

community_tasks/german_rag_evals.py

Lines changed: 5 additions & 4 deletions
@@ -30,6 +30,7 @@
 See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval
 """

+from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc

@@ -161,7 +162,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="test",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc"],
+    metric=[Metrics.loglikelihood_acc],
     version=1,
 )

@@ -178,7 +179,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="test",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc"],
+    metric=[Metrics.loglikelihood_acc],
     version=1,
 )

@@ -196,7 +197,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="test",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc"],
+    metric=[Metrics.loglikelihood_acc],
     version=1,
 )

@@ -213,7 +214,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="test",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc"],
+    metric=[Metrics.loglikelihood_acc],
     version=1,
 )

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 9 additions & 9 deletions
@@ -88,7 +88,7 @@ def preprocess(text):
     prompt_function=hellaswag_prompt,
     hf_repo="hellaswag",
     hf_subset="default",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -97,7 +97,7 @@ def preprocess(text):
     prompt_function=prompt.winogrande,
     hf_repo="winogrande",
     hf_subset="winogrande_xl",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -106,7 +106,7 @@ def preprocess(text):
     prompt_function=prompt.piqa_harness,
     hf_repo="piqa",
     hf_subset="plain_text",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -116,7 +116,7 @@ def preprocess(text):
     hf_repo="lighteval/siqa",
     hf_subset="default",
     hf_avail_splits=["train", "validation"],
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -125,7 +125,7 @@ def preprocess(text):
     prompt_function=prompt.openbookqa,
     hf_repo="openbookqa",
     hf_subset="main",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -136,7 +136,7 @@ def preprocess(text):
     hf_subset="ARC-Easy",
     evaluation_splits=["test"],
     generation_size=1,
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -147,7 +147,7 @@ def preprocess(text):
     hf_subset="ARC-Challenge",
     evaluation_splits=["test"],
     generation_size=1,
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -156,7 +156,7 @@ def preprocess(text):
     prompt_function=commonsense_qa_prompt,
     hf_repo="commonsense_qa",
     hf_subset="default",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),
@@ -226,7 +226,7 @@ def boolq_prompt(line, task_name: str = None):
     prompt_function=boolq_prompt,
     hf_repo="super_glue",
     hf_subset="boolq",
-    metric=["target_perplexity"],
+    metric=[Metrics.target_perplexity],
     trust_dataset=True,
     stop_sequence=["\n"],
 ),

examples/nanotron/custom_task.py

Lines changed: 3 additions & 2 deletions
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+from lighteval.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig


@@ -79,7 +80,7 @@ def mmlu_anatomy(line):
     few_shots_split="dev",
     few_shots_select="sequential",
     generation_size=5,
-    metric=["loglikelihood_acc_single_token"],
+    metric=[Metrics.loglikelihood_acc_single_token],
     stop_sequence=["\n"],
     output_regex=None,
     frozen=False,
@@ -95,7 +96,7 @@ def mmlu_anatomy(line):
     few_shots_split="dev",
     few_shots_select="sequential",
     generation_size=5,
-    metric=["loglikelihood_acc_single_token"],
+    metric=[Metrics.loglikelihood_acc_single_token],
     stop_sequence=["\n"],
     output_regex=None,
     frozen=False,

src/lighteval/logging/evaluation_tracker.py

Lines changed: 3 additions & 0 deletions
@@ -27,6 +27,7 @@
 import time
 from dataclasses import asdict, is_dataclass
 from datetime import datetime
+from enum import Enum
 from pathlib import Path

 from datasets import Dataset, load_dataset
@@ -59,6 +60,8 @@ def default(self, o):
             return asdict(o)
         if callable(o):
             return o.__name__
+        if isinstance(o, Enum):
+            return o.name
         return super().default(o)
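
The encoder change above is needed because Metrics members are plain Enum values, which json.dumps cannot serialize on its own; the tracker now falls back to the member's name. A standalone sketch of the same pattern (the encoder and enum below are illustrative stand-ins, not lighteval's actual classes):

import json
from dataclasses import asdict, is_dataclass
from enum import Enum


class SketchEncoder(json.JSONEncoder):
    # Mirrors the branches in the tracker's encoder: dataclasses, callables, then enums.
    def default(self, o):
        if is_dataclass(o):
            return asdict(o)
        if callable(o):
            return o.__name__
        if isinstance(o, Enum):
            return o.name  # e.g. a Metrics member serializes as its name
        return super().default(o)


class FakeMetric(Enum):
    quasi_exact_match_math = 1


print(json.dumps({"metric": FakeMetric.quasi_exact_match_math}, cls=SketchEncoder))
# -> {"metric": "quasi_exact_match_math"}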
