29
29
import random
30
30
import re
31
31
32
+ from lighteval .metrics .metrics import Metrics
32
33
from lighteval .tasks .lighteval_task import LightevalTaskConfig
33
34
from lighteval .tasks .requests import Doc
34
35
from lighteval .tasks .tasks_prompt_formatting import LETTER_INDICES
@@ -86,7 +87,7 @@ def __init__(
86
87
hf_subset = hf_subset ,
87
88
prompt_function = mmlu_arabic ,
88
89
hf_repo = "OALL/Arabic_MMLU" ,
89
- metric = [" loglikelihood_acc_norm" ],
90
+ metric = [Metrics . loglikelihood_acc_norm ],
90
91
hf_avail_splits = ["test" , "dev" ],
91
92
evaluation_splits = ["test" ],
92
93
few_shots_split = "dev" ,
@@ -143,7 +144,7 @@ def __init__(
143
144
hf_subset = hf_subset ,
144
145
prompt_function = acva ,
145
146
hf_repo = "OALL/ACVA" ,
146
- metric = [" loglikelihood_acc_norm" ],
147
+ metric = [Metrics . loglikelihood_acc_norm ],
147
148
hf_avail_splits = ["test" , "validation" ],
148
149
evaluation_splits = ["test" ],
149
150
few_shots_split = "validation" ,
@@ -195,7 +196,7 @@ def arabic_exams(line, task_name: str = None):
195
196
evaluation_splits = ["test" ],
196
197
few_shots_split = "validation" ,
197
198
few_shots_select = "sequential" ,
198
- metric = [" loglikelihood_acc_norm" ],
199
+ metric = [Metrics . loglikelihood_acc_norm ],
199
200
trust_dataset = True ,
200
201
version = 0 ,
201
202
)
@@ -245,7 +246,7 @@ def __init__(
245
246
hf_subset = hf_subset ,
246
247
prompt_function = alghafa_prompt ,
247
248
hf_repo = "OALL/AlGhafa-Arabic-LLM-Benchmark-Native" ,
248
- metric = [" loglikelihood_acc_norm" ],
249
+ metric = [Metrics . loglikelihood_acc_norm ],
249
250
hf_avail_splits = ["test" , "validation" ],
250
251
evaluation_splits = ["test" ],
251
252
few_shots_split = "validation" ,
@@ -273,7 +274,7 @@ def __init__(
273
274
evaluation_splits = ["test" ],
274
275
few_shots_split = "validation" ,
275
276
few_shots_select = "sequential" ,
276
- metric = [" loglikelihood_acc_norm" ],
277
+ metric = [Metrics . loglikelihood_acc_norm ],
277
278
trust_dataset = True ,
278
279
version = 0 ,
279
280
)
@@ -290,7 +291,7 @@ def __init__(
290
291
evaluation_splits = ["test" ],
291
292
few_shots_split = "validation" ,
292
293
few_shots_select = "sequential" ,
293
- metric = [" loglikelihood_acc_norm" ],
294
+ metric = [Metrics . loglikelihood_acc_norm ],
294
295
trust_dataset = True ,
295
296
version = 0 ,
296
297
)
@@ -307,7 +308,7 @@ def __init__(
307
308
evaluation_splits = ["test" ],
308
309
few_shots_split = "validation" ,
309
310
few_shots_select = "sequential" ,
310
- metric = [" loglikelihood_acc_norm" ],
311
+ metric = [Metrics . loglikelihood_acc_norm ],
311
312
trust_dataset = True ,
312
313
version = 0 ,
313
314
)
@@ -324,7 +325,7 @@ def __init__(
324
325
evaluation_splits = ["test" ],
325
326
few_shots_split = "validation" ,
326
327
few_shots_select = "sequential" ,
327
- metric = [" loglikelihood_acc_norm" ],
328
+ metric = [Metrics . loglikelihood_acc_norm ],
328
329
trust_dataset = True ,
329
330
version = 0 ,
330
331
)
@@ -341,7 +342,7 @@ def __init__(
341
342
evaluation_splits = ["test" ],
342
343
few_shots_split = "validation" ,
343
344
few_shots_select = "sequential" ,
344
- metric = [" loglikelihood_acc_norm" ],
345
+ metric = [Metrics . loglikelihood_acc_norm ],
345
346
trust_dataset = True ,
346
347
version = 0 ,
347
348
)
@@ -358,7 +359,7 @@ def __init__(
358
359
evaluation_splits = ["test" ],
359
360
few_shots_split = "validation" ,
360
361
few_shots_select = "sequential" ,
361
- metric = [" loglikelihood_acc_norm" ],
362
+ metric = [Metrics . loglikelihood_acc_norm ],
362
363
trust_dataset = True ,
363
364
version = 0 ,
364
365
)
@@ -400,7 +401,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
400
401
evaluation_splits = ["test" ],
401
402
few_shots_split = "validation" ,
402
403
few_shots_select = "sequential" ,
403
- metric = [" loglikelihood_acc_norm" ],
404
+ metric = [Metrics . loglikelihood_acc_norm ],
404
405
trust_dataset = True ,
405
406
version = 0 ,
406
407
)
@@ -436,7 +437,7 @@ def copa_prompt_arabic(line, task_name: str = None):
436
437
evaluation_splits = ["test" ],
437
438
few_shots_split = "validation" ,
438
439
few_shots_select = "sequential" ,
439
- metric = [" loglikelihood_acc_norm" ],
440
+ metric = [Metrics . loglikelihood_acc_norm ],
440
441
trust_dataset = True ,
441
442
version = 0 ,
442
443
)
@@ -481,7 +482,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
481
482
evaluation_splits = ["test" ],
482
483
few_shots_split = "validation" ,
483
484
few_shots_select = "sequential" ,
484
- metric = [" loglikelihood_acc_norm" ],
485
+ metric = [Metrics . loglikelihood_acc_norm ],
485
486
trust_dataset = True ,
486
487
version = 0 ,
487
488
)
@@ -519,7 +520,7 @@ def toxigen_prompt_arabic(line, task_name: str = None):
519
520
evaluation_splits = ["test" ],
520
521
few_shots_split = "validation" ,
521
522
few_shots_select = "sequential" ,
522
- metric = [" loglikelihood_acc_norm" ],
523
+ metric = [Metrics . loglikelihood_acc_norm ],
523
524
trust_dataset = True ,
524
525
version = 0 ,
525
526
)
@@ -571,7 +572,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
571
572
evaluation_splits = ["test" ],
572
573
few_shots_split = "validation" ,
573
574
few_shots_select = "sequential" ,
574
- metric = [" loglikelihood_acc_norm" ],
575
+ metric = [Metrics . loglikelihood_acc_norm ],
575
576
trust_dataset = True ,
576
577
version = 0 ,
577
578
)
0 commit comments