Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
ead5bdb
defined a sampling type for metrics, works for cli, now needs to port…
clefourrier Aug 18, 2025
8eece33
rm useless case
clefourrier Aug 18, 2025
8c5e5fb
updated tests
clefourrier Aug 18, 2025
ed0a02b
fix test
clefourrier Aug 19, 2025
7394706
added conversion for normalizations
clefourrier Aug 19, 2025
732c488
first pass transforming Hynek's metric functions into classes like th…
clefourrier Aug 19, 2025
c0654c7
imports
clefourrier Aug 19, 2025
a6e271a
removed single token evals since we no longer have the feature, added…
clefourrier Aug 19, 2025
511d0e6
keep on making metrics more adjustable
clefourrier Aug 19, 2025
bc4bb7e
updating test suite given the new names
clefourrier Aug 19, 2025
5d85a6e
manual update of file
clefourrier Aug 19, 2025
917fb79
manual update of file
clefourrier Aug 19, 2025
a367d73
fix mcc single token
clefourrier Aug 19, 2025
404d00f
now metrics are em
clefourrier Aug 19, 2025
f905750
some fixes for tests
clefourrier Aug 19, 2025
82b3fe9
rm trivia qa outdated
clefourrier Aug 19, 2025
5c4f9ab
removed dumdum enum overwrite
clefourrier Aug 19, 2025
c31a39f
fix test
clefourrier Aug 19, 2025
75419c1
rm a space
clefourrier Aug 19, 2025
915943f
cleaner loop
clefourrier Aug 19, 2025
d047766
test
clefourrier Aug 19, 2025
9e29510
better json encoder + a small naming fix
clefourrier Aug 20, 2025
054c6d5
new names
clefourrier Aug 20, 2025
cc30581
fix test
clefourrier Aug 20, 2025
8fd32a2
Merge branch 'main' into clem-fix-870
clefourrier Aug 20, 2025
3dd79e2
up doc
clefourrier Aug 20, 2025
a07cde1
reorg
clefourrier Aug 20, 2025
6150691
enforce correct classes
clefourrier Aug 20, 2025
5805b30
fix
clefourrier Aug 20, 2025
ac5e042
forgot to update extended tasks
clefourrier Aug 20, 2025
2cde901
fix multilingual again
clefourrier Aug 20, 2025
e8274c9
updated
clefourrier Aug 20, 2025
e043030
fix
clefourrier Aug 20, 2025
31d7d78
Apply suggestions from code review
clefourrier Aug 20, 2025
aeabbf9
review comments
clefourrier Aug 20, 2025
5ae3a8a
Merge branch 'main' into clem-fix-870
clefourrier Aug 20, 2025
90f6b37
fix dco
clefourrier Aug 20, 2025
af2cd81
style
clefourrier Aug 20, 2025
2f2dcfb
doc
clefourrier Aug 20, 2025
b1039e3
updated quick tour
clefourrier Aug 20, 2025
98c80d4
Merge branch 'main' into clem-fix-870
clefourrier Aug 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion community_tasks/aimo_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"""

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import math_normalizer
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

Expand All @@ -49,7 +50,9 @@ def aimo_prompt(line, task_name: str = None):
evaluation_splits=["train"],
few_shots_split="train",
few_shots_select="sequential",
metrics=[Metrics.quasi_exact_match_math],
metrics=[
Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
],
generation_size=2048,
stop_sequence=None,
)
Expand Down
44 changes: 23 additions & 21 deletions community_tasks/arabic_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
import re
from typing import Any, Dict, List, Optional, Union

from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import Metric, Metrics
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import LogProbCharNorm
from lighteval.metrics.utils.llm_as_judge import JudgeLM
from lighteval.metrics.utils.metric_utils import Metric
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod
Expand Down Expand Up @@ -103,7 +105,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_pfn,
hf_repo="MBZUAI/ArabicMMLU",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=["dev"],
Expand Down Expand Up @@ -164,7 +166,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_ht_pfn,
hf_repo="MBZUAI/human_translated_arabic_mmlu",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=None,
Expand Down Expand Up @@ -228,7 +230,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_mt_pfn,
hf_repo="OALL/Arabic_MMLU",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split="dev",
Expand Down Expand Up @@ -283,7 +285,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=acva_pfn,
hf_repo="OALL/ACVA",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
Expand Down Expand Up @@ -339,7 +341,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=aratrust_pfn,
hf_repo="asas-ai/AraTrust-categorized",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split=None,
Expand Down Expand Up @@ -387,7 +389,7 @@ def arabic_exams_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -437,7 +439,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=alghafa_pfn,
hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
Expand All @@ -463,7 +465,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand All @@ -479,7 +481,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand All @@ -495,7 +497,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand All @@ -511,7 +513,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand All @@ -527,7 +529,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand All @@ -543,7 +545,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -580,7 +582,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -614,7 +616,7 @@ def copa_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -657,7 +659,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -693,7 +695,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -743,7 +745,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)

Expand Down Expand Up @@ -800,7 +802,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=madinah_qa_pfn,
hf_repo="MBZUAI/MadinahQA",
metrics=[Metrics.loglikelihood_acc_norm],
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=["dev"],
Expand Down
6 changes: 5 additions & 1 deletion community_tasks/french_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import random

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import math_normalizer
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.extended.ifeval.main import ifeval_metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
Expand Down Expand Up @@ -136,7 +137,10 @@ def prompt_bac_fr(line, task_name: str = None):
few_shots_split=None,
few_shots_select="random_sampling",
generation_size=1,
metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
metrics=[
Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
Metrics.exact_match,
],
stop_sequence=["\n"],
version=0,
)
Expand Down
Loading
Loading