diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 39a7458ad..7895cabff 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -26,6 +26,7 @@ """ from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import math_normalizer from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -49,7 +50,9 @@ def aimo_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="train", few_shots_select="sequential", - metrics=[Metrics.quasi_exact_match_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) + ], generation_size=2048, stop_sequence=None, ) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 074ac4c56..cb977eae3 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -31,8 +31,10 @@ import re from typing import Any, Dict, List, Optional, Union -from lighteval.metrics.llm_as_judge import JudgeLM -from lighteval.metrics.metrics import Metric, Metrics +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.metrics.utils.llm_as_judge import JudgeLM +from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc, SamplingMethod @@ -103,7 +105,7 @@ def __init__( hf_subset=hf_subset, prompt_function=arabic_mmlu_pfn, hf_repo="MBZUAI/ArabicMMLU", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=["dev"], @@ -164,7 +166,7 @@ def __init__( hf_subset=hf_subset, prompt_function=arabic_mmlu_ht_pfn, hf_repo="MBZUAI/human_translated_arabic_mmlu", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=None, @@ -228,7 +230,7 @@ def __init__( hf_subset=hf_subset, prompt_function=arabic_mmlu_mt_pfn, hf_repo="OALL/Arabic_MMLU", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split="dev", @@ -283,7 +285,7 @@ def __init__( hf_subset=hf_subset, prompt_function=acva_pfn, hf_repo="OALL/ACVA", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -339,7 +341,7 @@ def __init__( hf_subset=hf_subset, prompt_function=aratrust_pfn, hf_repo="asas-ai/AraTrust-categorized", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split=None, @@ -387,7 +389,7 @@ def arabic_exams_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + 
metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -437,7 +439,7 @@ def __init__( hf_subset=hf_subset, prompt_function=alghafa_pfn, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -463,7 +465,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -479,7 +481,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -495,7 +497,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -511,7 +513,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -527,7 +529,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -543,7 +545,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -580,7 +582,7 @@ def boolq_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -614,7 +616,7 @@ def copa_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -657,7 +659,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -693,7 +695,7 @@ def toxigen_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -743,7 +745,7 @@ def sciq_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], 
few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], version=0, ) @@ -800,7 +802,7 @@ def __init__( hf_subset=hf_subset, prompt_function=madinah_qa_pfn, hf_repo="MBZUAI/MadinahQA", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=["dev"], diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 200216b00..8e0480aac 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -33,6 +33,7 @@ import random from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import math_normalizer from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.extended.ifeval.main import ifeval_metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -136,7 +137,10 @@ def prompt_bac_fr(line, task_name: str = None): few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.exact_match, + ], stop_sequence=["\n"], version=0, ) diff --git a/community_tasks/serbian_eval.py b/community_tasks/serbian_eval.py index e30ff8a6e..38e8b257e 100644 --- a/community_tasks/serbian_eval.py +++ b/community_tasks/serbian_eval.py @@ -35,6 +35,7 @@ from typing import List, Optional from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -297,7 +298,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.ARC_EASY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) arc_challenge = create_task_config( @@ -305,7 +306,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.ARC_CHALLENGE.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -317,14 +318,14 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.HELLASWAG.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) piqa = create_task_config( task_name="serbian_evals:piqa", prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.PIQA.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) winogrande = create_task_config( @@ -332,7 +333,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.WINOGRANDE.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # 
============================================ @@ -356,7 +357,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ANATOMY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_astronomy = create_task_config( @@ -364,7 +365,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ASTRONOMY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_business_ethics = create_task_config( @@ -372,7 +373,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_BUSINESS_ETHICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_clinical_knowledge = create_task_config( @@ -380,7 +381,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_CLINICAL_KNOWLEDGE.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_miscellaneous = create_task_config( @@ -388,7 +389,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MISCELLANEOUS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_electrical_engineering = create_task_config( @@ -396,7 +397,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ELECTRONIC_ENGINEERING.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -408,7 +409,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_SERBIAN_ALL.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -420,7 +421,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MARKETING.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_management = create_task_config( @@ -428,7 +429,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MANAGEMENT.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -440,7 +441,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_BIOLOGY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_chemistry = 
create_task_config( @@ -448,7 +449,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_CHEMISTRY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_computer_science = create_task_config( @@ -456,7 +457,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SCIENCE.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_mathematics = create_task_config( @@ -464,7 +465,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_MATHEMATICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_medicine = create_task_config( @@ -472,7 +473,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_MEDICINE.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_physics = create_task_config( @@ -480,7 +481,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_PHYSICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_computer_security = create_task_config( @@ -488,7 +489,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SECURITY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -500,7 +501,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MORAL_DISPUTES.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_moral_scenarios = create_task_config( @@ -508,7 +509,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MORAL_SCENARIOS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_philosophy = create_task_config( @@ -516,7 +517,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_PHILOSOPHY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_world_religions = create_task_config( @@ -524,7 +525,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_WORLD_RELIGIONS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # 
============================================ @@ -536,7 +537,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_BIOLOGY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_chemistry = create_task_config( @@ -544,7 +545,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_CHEMISTRY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_computer_science = create_task_config( @@ -552,7 +553,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_COMPUTER_SCIENCE.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_european_history = create_task_config( @@ -560,7 +561,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_EURO_HISTORY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_geography = create_task_config( @@ -568,7 +569,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_GEOGRAPHY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_mathematics = create_task_config( @@ -576,7 +577,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MATHEMATICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_microeconomics = create_task_config( @@ -584,7 +585,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MICROECONOMICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_physics = create_task_config( @@ -592,7 +593,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PHYSICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_psychology = create_task_config( @@ -600,7 +601,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PSYCHOLOGY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_statistics = create_task_config( @@ -608,7 +609,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_STATISTICS.value, - 
metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_world_history = create_task_config( @@ -616,7 +617,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_WORLD_HISTORY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -628,7 +629,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ABSTRACT_ALGEBRA.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_elementary_mathematics = create_task_config( @@ -636,7 +637,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ELEMENTARY_MATHEMATICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_formal_logic = create_task_config( @@ -644,7 +645,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_FORMAL_LOGIC.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_conceptual_physics = create_task_config( @@ -652,7 +653,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_CONCEPTUAL_PHYSICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_econometrics = create_task_config( @@ -660,7 +661,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ECONOMETRICS.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_machine_learning = create_task_config( @@ -668,7 +669,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MACHINE_LEARNING.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -680,7 +681,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_GLOBAL_FACT.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_logical_fallacies = create_task_config( @@ -688,7 +689,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_LOGICAL_FALLACIES.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_sociology = create_task_config( @@ -696,7 +697,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, 
hf_subset=HFSubsets.MMLU_SOCIOLOGY.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_human_aging = create_task_config( @@ -704,7 +705,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HUMAN_AGING.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -716,7 +717,7 @@ def create_task_config( prompt_function=boolq_serbian, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.BOOLQ.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) openbook_qa = create_task_config( @@ -724,7 +725,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.OPENBOOK.value, - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) diff --git a/community_tasks/turkic_evals.py b/community_tasks/turkic_evals.py index f7829fd59..9eae65d5b 100644 --- a/community_tasks/turkic_evals.py +++ b/community_tasks/turkic_evals.py @@ -37,14 +37,10 @@ } """ -import random -import re from functools import partial -from typing import Any, Dict, List, Optional, Union -from lighteval.metrics.llm_as_judge import JudgeLM -from lighteval.metrics.metrics import Metric, MetricCategory, Metrics -from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -127,7 +123,7 @@ def __init__( hf_subset=hf_subset, prompt_function=partial(tumlu_pfn, language=hf_subset), hf_repo="jafarisbarov/TUMLU-mini", - metrics=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split=["dev"], diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx index 4db1c935b..828a9f471 100644 --- a/docs/source/contributing-to-multilingual-evaluations.mdx +++ b/docs/source/contributing-to-multilingual-evaluations.mdx @@ -64,9 +64,9 @@ your_tasks = [ metric=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), # In this function, you choose which template to follow and for which language and formulation diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx index 643c915d2..06d3dd069 100644 --- a/docs/source/metric-list.mdx +++ b/docs/source/metric-list.mdx @@ -3,21 +3,15 @@ ## Automatic metrics for multiple-choice tasks These metrics use log-likelihood of the different possible targets. 
-- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`). -- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`). -- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored. -- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`). +- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - we recommend using length normalization. +- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection. - `mcc`: Matthews correlation coefficient (a measure of agreement between statistical distributions). -- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`). -- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`). -- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`). +- `recall_at_k`: Fraction of instances where the choice with the k-th best logprob or better was correct. +- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance. - `target_perplexity`: Perplexity of the different choices available. - `acc_golds_likelihood`: A bit different, it actually checks if the average logprob of a single target is above or below 0.5. - `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets. -All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall@2_single_token` and `mrr_single_token`). When the multichoice option compares only one token (ex: "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using these metrics in the single token version will divide the time spent by the number of choices. Single token evals also include: -- `multi_f1_numeric`: Computes the f1 score of all possible choices and averages it. - ## Automatic metrics for perplexity and language modeling These metrics use log-likelihood of prompt. - `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words of the sequence. @@ -28,17 +22,13 @@ These metrics use log-likelihood of prompt. ## Automatic metrics for generative tasks These metrics need the model to generate an output. They are therefore slower. - Base: - - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly.
- - `exact_match`: Fraction of instances where the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both). - - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences. - - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold at the exception of the border whitespaces (= after a `strip` has been applied to both). - - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). - - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed. - - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both being normalized first. - - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation. + - `exact_match`: Fraction of instances where the prediction matches the gold. Several variations can be made through parametrization: + - string normalization before comparison (on whitespace, articles, capitalization, ...) + - comparing the full string, or only subsets (prefix, suffix, ...) + - `maj_at_k`: Model majority vote. Samples k generations from the model and assumes the most frequent is the actual prediction. + - `f1_score`: Average F1 score in terms of word overlap between the model output and gold (normalization optional). - `f1_score_macro`: Corpus level macro F1 score. - `f1_score_micro`: Corpus level micro F1 score. - - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction. - Summarization: - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/). - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap. @@ -64,10 +54,7 @@ These metrics need the model to generate an output. They are therefore slower. - `edit_distance`: Average Levenshtein edit distance between model generation and reference, - `edit_similarity`: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model generation and reference. - Math: - - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed). - - `maj_at_4_math`: Majority choice evaluation, using the math normalisation for the predictions and gold. - - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed). - - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalisation for the predictions and gold. + - Both `exact_match` and `maj_at_k` can be used to evaluate mathematics tasks with math-specific normalization to remove and filter LaTeX. ## LLM-as-Judge - `llm_judge_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API.
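To make the parametrization described above concrete, here is a minimal sketch of how the new parametrized metrics are built; both calls mirror the `sample_params` usage in the task-file diffs above, and the variable names are illustrative only:

```python
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import LogProbCharNorm, math_normalizer

# Length-normalized loglikelihood accuracy (replaces the removed
# `loglikelihood_acc_norm` alias).
acc_char_norm = Metrics.loglikelihood_acc(
    sample_params={"logprob_normalization": LogProbCharNorm()}
)

# Exact match with math-aware normalization of both gold and prediction
# (replaces the removed `quasi_exact_match_math` alias).
math_exact_match = Metrics.exact_match(
    sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}
)

# Either can then be passed to a task config via `metrics=[...]`.
```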
diff --git a/docs/source/package_reference/metrics.mdx b/docs/source/package_reference/metrics.mdx index 57c656966..1b946a82e 100644 --- a/docs/source/package_reference/metrics.mdx +++ b/docs/source/package_reference/metrics.mdx @@ -24,8 +24,8 @@ [[autodoc]] metrics.metrics_corpus.CorpusLevelPerplexityMetric ### CorpusLevelTranslationMetric [[autodoc]] metrics.metrics_corpus.CorpusLevelTranslationMetric -### matthews_corrcoef -[[autodoc]] metrics.metrics_corpus.matthews_corrcoef +### MatthewsCorrCoef +[[autodoc]] metrics.metrics_corpus.MatthewsCorrCoef ## Sample Metrics ### ExactMatches @@ -67,4 +67,4 @@ ## LLM-as-a-Judge ### JudgeLM -[[autodoc]] metrics.llm_as_judge.JudgeLM +[[autodoc]] metrics.utils.llm_as_judge.JudgeLM diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 7daea73da..de2059f49 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -44,6 +44,17 @@ The syntax for the task specification might be a bit hard to grasp at first. The If the fourth value is set to 1, lighteval will check if the prompt (including the few-shot examples) is too long for the context size of the task or the model. If so, the number of few shot examples is automatically reduced. +Tasks have a function applied at the sample level and one at the corpus level. For example: +- an exact match can be computed per sample, then averaged over the corpus to give the final score +- samples can be left untouched, with Corpus BLEU then applied at the corpus level, and so on. + +If the task you are looking at has a sample-level function (`sample_level_fn`) which can be parametrized, you can pass parameters directly in the CLI. +For example: +```txt +{suite}|{task}@{parameter_name1}={value1},{parameter_name2}={value2},...|0|0 +``` + All officially supported tasks can be found at the [tasks_list](available-tasks) and in the [extended folder](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/extended).
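To instantiate the `@`-parametrization syntax documented in the quicktour hunk above: assuming a task whose sample-level function exposes a `k` parameter (the task and parameter pairing here is illustrative, not taken from the diff), a spec overriding it could look like:

```txt
lighteval|gsm8k@k=8|0|0
```

This would run the task with the sample-level parameter `k` set to 8, zero few-shot examples, and the automatic few-shot truncation check disabled.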
Moreover, community-provided tasks can be found in the diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx index dacec2795..4a6f379a1 100644 --- a/docs/source/saving-and-reading-results.mdx +++ b/docs/source/saving-and-reading-results.mdx @@ -132,14 +132,14 @@ The detail file contains the following columns: }, "results": { "lighteval|gsm8k|0": { - "qem": 0.0, - "qem_stderr": 0.0, + "em": 0.0, + "em_stderr": 0.0, "maj@8": 0.0, "maj@8_stderr": 0.0 }, "all": { - "qem": 0.0, - "qem_stderr": 0.0, + "em": 0.0, + "em_stderr": 0.0, "maj@8": 0.0, "maj@8_stderr": 0.0 } @@ -155,7 +155,7 @@ The detail file contains the following columns: "hf_subset": "main", "metric": [ { - "metric_name": "qem", + "metric_name": "em", "higher_is_better": true, "category": "3", "use_case": "5", diff --git a/examples/custom_tasks_templates/custom_yourbench_task.py b/examples/custom_tasks_templates/custom_yourbench_task.py index dbae4d368..87a0488d2 100644 --- a/examples/custom_tasks_templates/custom_yourbench_task.py +++ b/examples/custom_tasks_templates/custom_yourbench_task.py @@ -240,7 +240,7 @@ def yourbench_prompt(line, task_name: str = ""): metric_name=["accuracy"], higher_is_better={"accuracy": True}, category=SamplingMethod.GENERATIVE, - sample_level_fn=JudgeLLMYourBench().compute, + sample_level_fn=JudgeLLMYourBench(), corpus_level_fn={"accuracy": np.mean}, ) extend_enum(Metrics, "yourbench_metrics", yourbench_metrics) diff --git a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py index 3e281a66f..e0158b6db 100644 --- a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py +++ b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py @@ -25,12 +25,9 @@ from aenum import extend_enum -from lighteval.metrics.dynamic_metrics import multilingual_extractive_match_metric from lighteval.metrics.metrics import Metrics -from lighteval.metrics.utils.extractive_match_utils import IndicesExtractionConfig from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc -from lighteval.utils.language import Language logger = logging.getLogger(__name__) @@ -74,12 +71,7 @@ def yourbench_prompt(line, task_name: str = ""): ) -yourbench_metrics = multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, -) +yourbench_metrics = Metrics.gpqa_instruct_metric extend_enum(Metrics, "yourbench_metrics", yourbench_metrics) diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py index ceb0b08cd..34c871cd5 100644 --- a/examples/custom_tasks_tests.py +++ b/examples/custom_tasks_tests.py @@ -52,7 +52,7 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.gpqa_instruct_pass_at_1_1n], + metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})], stop_sequence=[], # no stop sequence, will use eos token version=0, ) diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index fef7a61d9..8430bee08 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -28,11 +28,11 @@ """ import re -from dataclasses import asdict -from typing import Dict, List, Tuple +from typing import List, Tuple import 
lighteval.tasks.default_prompts as prompt -from lighteval.metrics import Metrics +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm, helm_normalizer, math_normalizer from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -79,7 +79,7 @@ def preprocess(text): query=preprocess(line["activity_label"] + ": " + ctx), choices=[" " + preprocess(ending) for ending in line["endings"]], gold_index=int(line["label"]) if line["label"] != "" else -1, # -1 for test - # "metric": "choices_loglikelihood", + # "metrics": "choices_loglikelihood", ) @@ -89,7 +89,12 @@ def preprocess(text): prompt_function=hellaswag_prompt, hf_repo="hellaswag", hf_subset="default", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -97,7 +102,12 @@ def preprocess(text): prompt_function=prompt.winogrande, hf_repo="winogrande", hf_subset="winogrande_xl", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -105,7 +115,12 @@ def preprocess(text): prompt_function=prompt.piqa_harness, hf_repo="piqa", hf_subset="plain_text", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -114,7 +129,12 @@ def preprocess(text): hf_repo="lighteval/siqa", hf_subset="default", hf_avail_splits=["train", "validation"], - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -122,7 +142,12 @@ def preprocess(text): prompt_function=prompt.openbookqa, hf_repo="openbookqa", hf_subset="main", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -132,7 +157,12 @@ def preprocess(text): hf_subset="ARC-Easy", evaluation_splits=["test"], generation_size=1, - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -142,7 +172,12 @@ def preprocess(text): hf_subset="ARC-Challenge", evaluation_splits=["test"], generation_size=1, - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), 
LightevalTaskConfig( @@ -150,7 +185,12 @@ def preprocess(text): prompt_function=commonsense_qa_prompt, hf_repo="commonsense_qa", hf_subset="default", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], stop_sequence=["\n"], ), ] @@ -179,7 +219,9 @@ def natural_questions_prompt(line, task_name: str = None): prompt_function=prompt.triviaqa, hf_repo="trivia_qa", hf_subset="rc.nocontext", - metric=[Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}) + ], generation_size=20, stop_sequence=["\n", ".", ","], ), @@ -188,7 +230,9 @@ def natural_questions_prompt(line, task_name: str = None): prompt_function=natural_questions_prompt, hf_repo="lighteval/natural_questions_clean", hf_subset="default", - metric=[Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}) + ], generation_size=20, stop_sequence=["\n", ".", ","], ), @@ -217,7 +261,7 @@ def boolq_prompt(line, task_name: str = None): prompt_function=boolq_prompt, hf_repo="super_glue", hf_subset="boolq", - metric=[Metrics.target_perplexity], + metrics=[Metrics.target_perplexity], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -225,7 +269,9 @@ def boolq_prompt(line, task_name: str = None): prompt_function=prompt.quac, hf_repo="lighteval/quac_helm", hf_subset="deault", - metric=[Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}) + ], generation_size=20, stop_sequence=["\n", ".", ","], ), @@ -247,7 +293,9 @@ def __init__( prompt_function=prompt.math, hf_repo="DigitalLearningGmbH/MATH-lighteval", hf_subset=None, - metric=[Metrics.quasi_exact_match_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) + ], hf_avail_splits=None, evaluation_splits=["test"], few_shots_split=None, @@ -261,7 +309,7 @@ def __init__( prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, - metric=metric, + metrics=metrics, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, @@ -288,7 +336,7 @@ def __init__( hf_subset="main", hf_avail_splits=["train", "test"], evaluation_splits=["test"], - metric=[Metrics.perfect_exact_match], + metrics=[Metrics.perfect_exact_match], generation_size=10, stop_sequence=["\n"], ) @@ -343,8 +391,12 @@ def __init__( prompt_function=mmlu_prompt, hf_repo="lighteval/mmlu", hf_subset=None, - # metric=[Metrics.loglikelihood_acc_single_token], - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], hf_avail_splits=None, evaluation_splits=["test"], few_shots_split="dev", @@ -358,7 +410,7 @@ def __init__( prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, - metric=metric, + metrics=metrics, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, @@ -455,7 +507,7 @@ def __init__( prompt_function=bbh_prompt, hf_repo="lighteval/big_bench_hard", hf_subset=None, - metric=[Metrics.exact_match], + 
metrics=[Metrics.exact_match], hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split="train", @@ -469,7 +521,7 @@ def __init__( prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, - metric=metric, + metrics=metrics, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, @@ -584,8 +636,12 @@ def __init__( prompt_function=agi_eval_prompt_no_letters, hf_repo="lighteval/agi_eval_en", hf_subset=None, - # metric=[Metrics.loglikelihood_acc_single_token], - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], hf_avail_splits=["train", "validation"], evaluation_splits=["train"], few_shots_split="validation", @@ -599,7 +655,7 @@ def __init__( prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, - metric=metric, + metrics=metrics, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, @@ -620,7 +676,10 @@ def __init__( name="agi_eval:math", hf_subset="math", prompt_function=agi_eval_math_prompt, - metric=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], generation_size=40, ), CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"), @@ -639,7 +698,7 @@ def __init__( # name="human_eval", # prompt_function=prompt.human_eval", # hf_repo="lighteval/human_eval", -# metric=["human_eval_pass_at_1"], +# metrics=["human_eval_pass_at_1"], # ), diff --git a/examples/nanotron/custom_task.py b/examples/nanotron/custom_task.py index feaa849ba..55a318edd 100644 --- a/examples/nanotron/custom_task.py +++ b/examples/nanotron/custom_task.py @@ -80,7 +80,7 @@ def mmlu_anatomy(line): few_shots_split="dev", few_shots_select="sequential", generation_size=5, - metric=[Metrics.loglikelihood_acc_single_token], + metric=[Metrics.loglikelihood_acc], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -94,7 +94,7 @@ def mmlu_anatomy(line): few_shots_split="dev", few_shots_select="sequential", generation_size=5, - metric=[Metrics.loglikelihood_acc_single_token], + metric=[Metrics.loglikelihood_acc], stop_sequence=["\n"], ), ] diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index c9a336fc1..2bbe9bfdf 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -81,7 +81,13 @@ def default(self, o): return str(o) if isinstance(o, Enum): return o.name - return super().default(o) + if hasattr(o, "__str__"): + return str(o) + try: + return super().default(o) + except TypeError: + # For classes without json serialization + return type(o).__name__ class EvaluationTracker: diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index ab261485b..5d82d3c38 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -30,7 +30,7 @@ import git import xxhash -from lighteval.metrics.stderr import get_stderr_function +from lighteval.metrics.utils.stderr import get_stderr_function from lighteval.models.abstract_model import ModelConfig from lighteval.models.model_output import ModelResponse from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig @@ -360,13 
+360,13 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = # The metric is in a subset which has already been computed and saved continue + aggregation = task.aggregation()[metric_name] + try: - metric_result = task.aggregation()[metric_name](metric_values) + metric_result = aggregation(metric_values) except OverflowError: logger.warning(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.") metric_result = float("nan") - except KeyError: - continue if isinstance(metric_result, dict): # For some corpus level grouping metrics self.metric_aggregated[task_name].update(metric_result) @@ -379,7 +379,6 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = None # We skip stderr for some corpus metrics that return dicts, or if bootstrap_iters is 0 ) else: - aggregation = task.aggregation()[metric_name] stderr = get_stderr_function(aggregation=aggregation, number_experiments=bootstrap_iters) if stderr is not None and len(metric_values) > 1: try: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 2a4b6d4c2..0bfce20c3 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -21,7 +21,7 @@ # SOFTWARE. -from lighteval.metrics.metrics import Metric +from lighteval.metrics.utils.metric_utils import Metric from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc @@ -32,7 +32,7 @@ def apply_metric(responses: list[ModelResponse], docs: list[Doc], metrics: list[ if metric.batched_compute: outputs_per_metrics: list = [] - outputs_per_metrics.append(metric.compute(responses=responses, docs=docs)) + outputs_per_metrics.append(metric.compute_sample(responses=responses, docs=docs)) # We merge the outputs per metric in a list of dict for each sample # example: [{metric1_sample1, metric2_sample1}, {metric1_sample2, metric2_sample2}] @@ -47,7 +47,7 @@ def apply_metric(responses: list[ModelResponse], docs: list[Doc], metrics: list[ output = {} for metric in metrics: output.update( - metric.compute( + metric.compute_sample( model_response=model_response, doc=doc, ) diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index 745d606ec..9ced582c7 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -40,13 +40,12 @@ from lighteval.metrics.utils.extractive_match_utils import ( # noqa: F401 ExprExtractionConfig, ExtractionTarget, - IndicesExtractionConfig, LatexExtractionConfig, extract_target_from_pred, get_extraction_regexes, ) from lighteval.metrics.utils.math_comparison import compare_gold_target -from lighteval.metrics.utils.metric_utils import SampleLevelMetric +from lighteval.metrics.utils.metric_utils import SampleLevelComputation, SampleLevelMetric from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc, SamplingMethod from lighteval.utils.language import Language @@ -56,171 +55,168 @@ logger = logging.getLogger(__name__) -def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None) -> SampleLevelMetric: - """ - Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization. 
- """ - - normalization_str = f"_{normalization.name}" if normalization else "" - metric_name = f"acc{normalization_str}" - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def normalized_multi_choice_prob_metric( - normalization: LogProbNormalization | None = None, - aggregation_function: Callable[[np.ndarray], float] = np.max, -) -> SampleLevelMetric: - """ - Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized). - """ - - normalization_str = f"_{normalization.name}" if normalization else "" - metric_name = f"normalized_mc_prob{normalization_str}" - - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=NormalizedMultiChoiceProbability( - log_prob_normalization=normalization, aggregation_function=aggregation_function - ).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def probability_metric( - normalization: LogProbTokenNorm | None = None, - aggregation_function: Callable[[np.ndarray], float] = np.max, -) -> SampleLevelMetric: - """ - Creates a probability metric, which returns the probability of the gold choice given normalization. - """ - - normalization_str = f"_{normalization.name}" if normalization else "" - metric_name = f"prob{normalization_str}" - - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=Probability(normalization=normalization, aggregation_function=aggregation_function).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def multilingual_quasi_f1_score_metric( - language: Language, aggregation_function: Callable[[list[float]], float] = max -) -> SampleLevelMetric: - """ - Creates a language-aware F1 score metric, which returns the F1 score. - - Args: - language: The language of the samples. - aggregation_function: Aggregation samples to use when multiple golds are present. - - Returns: - F1 score metric. - """ - metric_name = f"f1_{language.value}" - - multilang_normalizer = get_multilingual_normalizer(language) - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=F1_score( - normalize_gold=multilang_normalizer, - normalize_pred=multilang_normalizer, - aggregation_function=aggregation_function, - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def multilingual_quasi_exact_match_metric( - language: Language, - match_type: Literal["prefix", "suffix", "full"] = "full", - aggregation_function: Callable[[list[float]], float] = max, -) -> SampleLevelMetric: - """ - Creates a language-aware exact match metric, which returns the exact match score - Args: - language: The language of the samples. - match_type: The type of match to use - - "prefix": Prefixes must match - - "suffix": Suffixes must match - - "full": Full strings must match - aggregation_function: Aggregation samples to use when multiple golds are present. - Returns: - Exact match metric. 
- """ - metric_name = f"exact_match_{language.value}_{match_type}" - multilang_normalizer = get_multilingual_normalizer(language) - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=ExactMatches( - normalize_gold=multilang_normalizer, - normalize_pred=multilang_normalizer, - aggregation_function=aggregation_function, - type_exact_match=match_type, - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def multilingual_extractive_match_metric( - language: Language = Language.ENGLISH, - gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), - pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()), - aggregation_function: Callable[[list[float]], float] = max, - fallback_mode: Literal["no_fallback", "first_match"] = "first_match", - extraction_mode: Literal["first_match", "any_match"] = "any_match", - precision: int = 6, - timeout_seconds: int = 5, -) -> SampleLevelMetric: - """Creates a language-aware extractive match metric that extracts answers from the model's output. - - Known issues: - - If the task is to simplify an expression, the metric might overestimate the accuracy. This is because if the model doesn't output any anchor for the extraction (e.g final answer is..), - it's possible that the extracted prediction will be the expression to simplify. Because we do simplifications ourselves, it can thus happen that sympy will correctly simplify the expression, - thus it will match gold, despite model not doing anything. PRs to fix this are welcome. - - - There is currently no StringExtractionConfig, so if the gold is \boxed{\text{Friday}} and model outputs Friday it will not match, because nothing will be extracted. - - Args: - language: Language - The language of the samples. - gold_extraction_target: Sequence[ExtractionTarget] - Extraction targets to use for gold answers. Defaults to extracting simple math expressions. - pred_extraction_target: Sequence[ExtractionTarget] - Extraction targets to use for predictions. Defaults to extracting simple math expressions. - aggregation_function: Callable[[list[float]], float] - Function to aggregate scores when multiple golds/predictions are present. Defaults to max. - fallback_mode: Literal["no_fallback", "first_match"] - How to perform extraction. Defaults to "first_match". - - "no_fallback": Only use first successfully parsed matches - - "first_match": Use the first successfully parsed match + first match irregardless the parsing success - extraction_mode: Literal["first_match", "any_match"] - - "first_match": Only tries to extract the first regex match if it fails no other matches are tried - - "any_match": Tries to extract any regex match - - precision: int - Number of decimal places to use when comparing numerical values. Defaults to 6. - timeout_seconds: int - Timeout for the extraction (each attempt) and comparison. Defaults to 5. - - Returns: - A sample level metric that extracts and compares mathematical expressions. - - """ +class LogLikelihoodAccMetric(SampleLevelMetric): + def __init__(self, normalization: LogProbNormalization | None = None): + """ + Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization. 
+ """ + super().__init__( + metric_name="acc" + (f"_{normalization.name}" if normalization else ""), + sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization), + category=SamplingMethod.LOGPROBS, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class NormalizedMultiChoiceProbMetric(SampleLevelMetric): + def __init__( + self, + normalization: LogProbNormalization | None = None, + aggregation_function: Callable[[np.ndarray], float] = np.max, + ): + """ + Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized). + """ + super().__init__( + metric_name="normalized_mc_prob" + (f"_{normalization.name}" if normalization else ""), + sample_level_fn=NormalizedMultiChoiceProbability( + log_prob_normalization=normalization, aggregation_function=aggregation_function + ), + category=SamplingMethod.LOGPROBS, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class ProbabilityMetric(SampleLevelMetric): + def __init__( + self, + normalization: LogProbTokenNorm | None = None, + aggregation_function: Callable[[np.ndarray], float] = np.max, + ): + """ + Creates a probability metric, which returns the probability of the gold choice given normalization. + """ + super().__init__( + metric_name="prob" + (f"_{normalization.name}" if normalization else ""), + sample_level_fn=Probability(normalization=normalization, aggregation_function=aggregation_function), + category=SamplingMethod.LOGPROBS, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class MultilingualQuasiF1ScoreMetric(SampleLevelMetric): + def __init__(self, language: Language, aggregation_function: Callable[[list[float]], float] = max): + """ + Creates a language-aware F1 score metric, which returns the F1 score. + + Args: + language: The language of the samples. + aggregation_function: Aggregation samples to use when multiple golds are present. + """ + super().__init__( + metric_name=f"f1_{language.value}", + sample_level_fn=F1_score( + normalize_gold=get_multilingual_normalizer(language), + normalize_pred=get_multilingual_normalizer(language), + aggregation_function=aggregation_function, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class MultilingualQuasiExactMatchMetric(SampleLevelMetric): + def __init__( + self, + language: Language, + match_type: Literal["prefix", "suffix", "full"] = "full", + aggregation_function: Callable[[list[float]], float] = max, + ): + """ + Creates a language-aware exact match metric, which returns the exact match score + Args: + language: The language of the samples. + match_type: The type of match to use + - "prefix": Prefixes must match + - "suffix": Suffixes must match + - "full": Full strings must match + aggregation_function: Aggregation samples to use when multiple golds are present. + Returns: + Exact match metric. 
+ """ + super().__init__( + metric_name=f"exact_match_{language.value}_{match_type}", + sample_level_fn=ExactMatches( + normalize_gold=get_multilingual_normalizer(language), + normalize_pred=get_multilingual_normalizer(language), + aggregation_function=aggregation_function, + type_exact_match=match_type, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class MultilingualExtractiveMatchMetric(SampleLevelComputation): + def __init__( + self, + language: Language = Language.ENGLISH, + gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), + pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()), + aggregation_function: Callable[[list[float]], float] = max, + fallback_mode: Literal["no_fallback", "first_match"] = "first_match", + extraction_mode: Literal["first_match", "any_match"] = "any_match", + precision: int = 6, + timeout_seconds: int = 5, + ): + """Creates a language-aware extractive match metric that extracts answers from the model's output. + + Known issues: + - If the task is to simplify an expression, the metric might overestimate the accuracy. This is because if the model doesn't output any anchor for the extraction (e.g final answer is..), + it's possible that the extracted prediction will be the expression to simplify. Because we do simplifications ourselves, it can thus happen that sympy will correctly simplify the expression, + thus it will match gold, despite model not doing anything. PRs to fix this are welcome. + + - There is currently no StringExtractionConfig, so if the gold is \boxed{\text{Friday}} and model outputs Friday it will not match, because nothing will be extracted. + + Args: + language: Language + The language of the samples. + gold_extraction_target: Sequence[ExtractionTarget] + Extraction targets to use for gold answers. Defaults to extracting simple math expressions. + pred_extraction_target: Sequence[ExtractionTarget] + Extraction targets to use for predictions. Defaults to extracting simple math expressions. + aggregation_function: Callable[[list[float]], float] + Function to aggregate scores when multiple golds/predictions are present. Defaults to max. + fallback_mode: Literal["no_fallback", "first_match"] + How to perform extraction. Defaults to "first_match". + - "no_fallback": Only use first successfully parsed matches + - "first_match": Use the first successfully parsed match + first match irregardless the parsing success + extraction_mode: Literal["first_match", "any_match"] + - "first_match": Only tries to extract the first regex match if it fails no other matches are tried + - "any_match": Tries to extract any regex match + + precision: int + Number of decimal places to use when comparing numerical values. Defaults to 6. + timeout_seconds: int + Timeout for the extraction (each attempt) and comparison. Defaults to 5. + + Returns: + A sample level metric that extracts and compares mathematical expressions. 
+ + """ + self.language = language + self.gold_extraction_target = gold_extraction_target + self.pred_extraction_target = pred_extraction_target + self.aggregation_function = aggregation_function + self.fallback_mode = fallback_mode + self.extraction_mode = extraction_mode + self.precision = precision + self.timeout_seconds = timeout_seconds @timeout(2) def add_to_specifics_with_timeout( @@ -234,19 +230,23 @@ def add_to_specifics_with_timeout( ] formatted_doc.specific["extracted_golds"] = [str(gold) for golds in extracted_golds for gold in golds] - def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float: + def compute(self, doc: Doc, model_response: ModelResponse) -> float: golds = doc.get_golds() predictions = model_response.final_text - gold_extraction_regexes = get_extraction_regexes(doc, gold_extraction_target, language) - pred_extraction_regexes = get_extraction_regexes(doc, pred_extraction_target, language) + gold_extraction_regexes = get_extraction_regexes(doc, self.gold_extraction_target, self.language) + pred_extraction_regexes = get_extraction_regexes(doc, self.pred_extraction_target, self.language) extracted_predictions = [ - extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds) + extract_target_from_pred( + pred, pred_extraction_regexes, self.fallback_mode, self.extraction_mode, self.timeout_seconds + ) for pred in predictions ] extracted_golds = [ - extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds) + extract_target_from_pred( + gold, gold_extraction_regexes, self.fallback_mode, self.extraction_mode, self.timeout_seconds + ) for gold in golds ] @@ -262,16 +262,16 @@ def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float: # We have to use timeout because the sypmy to str conversion can be very slow try: - add_to_specifics_with_timeout(doc, extracted_predictions, extracted_golds) + self.add_to_specifics_with_timeout(doc, extracted_predictions, extracted_golds) except Exception: # noqa: E722 logger.warning("Timeout when adding extracted predictions and golds to specific") - return aggregation_function( + return self.aggregation_function( [ ( 1.0 if any( - compare_gold_target(gold, pred, precision, timeout_seconds=timeout_seconds) + compare_gold_target(gold, pred, self.precision, timeout_seconds=self.timeout_seconds) for gold in extracted_golds ) else 0.0 @@ -279,11 +279,3 @@ def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float: for pred in extracted_predictions ] ) - - return SampleLevelMetric( - metric_name="extractive_match", - sample_level_fn=sample_level_fn, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) diff --git a/src/lighteval/metrics/harness_compatibility/drop.py b/src/lighteval/metrics/harness_compatibility/drop.py index f12828cbe..382d9ad08 100644 --- a/src/lighteval/metrics/harness_compatibility/drop.py +++ b/src/lighteval/metrics/harness_compatibility/drop.py @@ -27,28 +27,40 @@ import numpy as np from scipy.optimize import linear_sum_assignment +from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc -def drop_metrics(doc: Doc, model_response: ModelResponse): # noqa: C901 - """F1 score from bag of words: comes from Harness Drop. DROP offers two metrics, - a quasi exact match and a numeracy-focused F1 score. 
Quasi in the sense that it - does some normalizations before matching and numeracy-focused in the sense that - if there's number mismatch between the target and prediction F1 score is set to 0. - F1 score is computed using the intersection of target and prediction's BoW - representations with the additional spice that if the answer and/or prediction is - comprised of multiple spans, a greedy matching is done between the two sets of spans - (based on the very BoW overlap) and the average over F1 of pairs is returned. - DROP also accepts multiple answers in which case, the maximum of F1/ Exact Match - between prediction and the different answers is taken. +class DropMetrics(SampleLevelComputation): + def compute(self, doc: Doc, model_response: ModelResponse): # noqa: C901 + """F1 score from bag of words: comes from Harness Drop. DROP offers two metrics, + a quasi exact match and a numeracy-focused F1 score. Quasi in the sense that it + does some normalizations before matching, and numeracy-focused in the sense that + if there's a number mismatch between the target and prediction, the F1 score is set to 0. + The F1 score is computed using the intersection of target and prediction's BoW + representations, with the additional spice that if the answer and/or prediction is + comprised of multiple spans, a greedy matching is done between the two sets of spans + (based on that very BoW overlap) and the average over F1 of the pairs is returned. + DROP also accepts multiple answers, in which case the maximum of F1/exact match + between the prediction and the different answers is taken. - For more information, please refer to the section 5 of the DROP paper (https://aclanthology.org/N19-1246/). + For more information, please refer to section 5 of the DROP paper (https://aclanthology.org/N19-1246/). - Todo: this code is really hard to follow, simplify when possible - """ - - def _answer_to_bags(answer: List[str]) -> Tuple[List[str], List[Set[str]]]: + Todo: this code is really hard to follow, simplify when possible + """ + max_em = 0 + max_f1 = 0 + for gold_answer in doc.specific["golds_no_preprocessing"]: + exact_match, f1_score = self._get_metrics(model_response.text, gold_answer) + if isinstance(gold_answer, list): + gold_answer = gold_answer[0] + if gold_answer.strip(): + max_em = max(max_em, exact_match) + max_f1 = max(max_f1, f1_score) + return {"em": max_em, "f1": max_f1} + + def _answer_to_bags(self, answer: List[str]) -> Tuple[List[str], List[Set[str]]]: if isinstance(answer, (list, tuple)): raw_spans = answer else: @@ -56,12 +68,12 @@ def _answer_to_bags(answer: List[str]) -> Tuple[List[str], List[Set[str]]]: normalized_spans = [] token_bags = [] for raw_span in raw_spans: - normalized_span = _normalize(raw_span) + normalized_span = self._normalize(raw_span) normalized_spans.append(normalized_span) token_bags.append(set(normalized_span.split())) return normalized_spans, token_bags - def _get_metrics(predicted: List[str], gold: List[str]): + def _get_metrics(self, predicted: List[str], gold: List[str]): """ Takes a predicted answer and a gold answer (that are both either a string or a list of strings), and returns exact match and the DROP F1 metric for the prediction. If you are @@ -69,8 +81,8 @@ def _get_metrics(predicted: List[str], gold: List[str]): validation, or while training), this is the function you want to call, after using :func:`answer_json_to_strings` when reading the gold answer from the released data file.
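[Editor's note: a self-contained illustration of the span alignment the docstring above describes: per-bag F1 scores are computed, linear_sum_assignment picks the 1-1 pairing of gold and predicted spans that maximizes total F1, and the mean per-bag F1 is reported. This mirrors `_align_bags`/`_compute_f1` in spirit only; normalization is omitted:]

```python
import numpy as np
from scipy.optimize import linear_sum_assignment


def bag_f1(pred: set, gold: set) -> float:
    # Token-level F1 between two bags of words.
    inter = len(gold & pred)
    precision = inter / len(pred) if pred else 1.0
    recall = inter / len(gold) if gold else 1.0
    return (2 * precision * recall) / (precision + recall) if precision + recall else 0.0


pred_bags = [{"four", "touchdowns"}, {"brady"}]
gold_bags = [{"brady"}, {"four", "touchdowns", "scored"}]
scores = np.array([[bag_f1(p, g) for p in pred_bags] for g in gold_bags])
rows, cols = linear_sum_assignment(-scores)  # negate to maximize total F1
print(scores[rows, cols].mean())  # 0.9: brady<->brady gives 1.0, the span pair gives 0.8
```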
""" - pred_normalized_spans, pred_bags = _answer_to_bags(predicted) - gold_normalized_spans, gold_bags = _answer_to_bags(gold) + pred_normalized_spans, pred_bags = self._answer_to_bags(predicted) + gold_normalized_spans, gold_bags = self._answer_to_bags(gold) if set(pred_normalized_spans) == set(gold_normalized_spans) and len(gold_normalized_spans) == len( gold_normalized_spans @@ -79,32 +91,32 @@ def _get_metrics(predicted: List[str], gold: List[str]): else: exact_match = 0.0 - f1_per_bag = _align_bags(pred_bags, gold_bags) + f1_per_bag = self._align_bags(pred_bags, gold_bags) f1 = np.mean(f1_per_bag) f1 = round(f1, 2) return exact_match, f1 - def _is_number(text): + def _is_number(self, text): try: float(text) return True except ValueError: return False - def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]): + def _match_numbers_if_present(self, gold_bag: Set[str], predicted_bag: Set[str]): gold_numbers = set() predicted_numbers = set() for word in gold_bag: - if _is_number(word): + if self._is_number(word): gold_numbers.add(word) for word in predicted_bag: - if _is_number(word): + if self._is_number(word): predicted_numbers.add(word) if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): return True return False - def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: + def _align_bags(self, predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: """ Takes gold and predicted answer sets and first finds the optimal 1-1 alignment between them and gets maximum metric values over all the answers. @@ -112,8 +124,8 @@ def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: scores = np.zeros([len(gold), len(predicted)]) for gold_index, gold_item in enumerate(gold): for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) + if self._match_numbers_if_present(gold_item, pred_item): + scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item) row_ind, col_ind = linear_sum_assignment(-scores) max_scores = np.zeros([max(len(gold), len(predicted))]) @@ -121,7 +133,7 @@ def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: max_scores[row] = max(max_scores[row], scores[row, column]) return max_scores - def _compute_f1(predicted_bag, gold_bag): + def _compute_f1(self, predicted_bag, gold_bag): intersection = len(gold_bag.intersection(predicted_bag)) if not predicted_bag: precision = 1.0 @@ -135,40 +147,30 @@ def _compute_f1(predicted_bag, gold_bag): return 0 return (2 * precision * recall) / (precision + recall) - def _remove_articles(text): + def _remove_articles(self, text): return re.compile(r"\b(a|an|the)\b", re.UNICODE).sub(" ", text) - def _white_space_fix(text): + def _white_space_fix(self, text): return " ".join(text.split()) - def _remove_punc(text): + def _remove_punc(self, text): exclude = set(string.punctuation) - if not _is_number(text): + if not self._is_number(text): return "".join(ch for ch in text if ch not in exclude) else: return text - def _fix_number(text): - return str(float(text)) if _is_number(text) else text + def _fix_number(self, text): + return str(float(text)) if self._is_number(text) else text - def _tokenize(text): + def _tokenize(self, text): return re.split(" |-", text) - def _normalize(answer: str): + def _normalize(self, answer: str): tokens = [ - _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) for 
token in _tokenize(answer) + self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower())))) + for token in self._tokenize(answer) ] tokens = [token for token in tokens if token.strip()] normalized = " ".join(tokens).strip() return normalized - - max_em = 0 - max_f1 = 0 - for gold_answer in doc.specific["golds_no_preprocessing"]: - exact_match, f1_score = _get_metrics(model_response.text, gold_answer) - if isinstance(gold_answer, list): - gold_answer = gold_answer[0] - if gold_answer.strip(): - max_em = max(max_em, exact_match) - max_f1 = max(max_f1, f1_score) - return {"qem": max_em, "f1": max_f1} diff --git a/src/lighteval/metrics/harness_compatibility/truthful_qa.py b/src/lighteval/metrics/harness_compatibility/truthful_qa.py index 771077222..d8cbc3662 100644 --- a/src/lighteval/metrics/harness_compatibility/truthful_qa.py +++ b/src/lighteval/metrics/harness_compatibility/truthful_qa.py @@ -22,39 +22,41 @@ import numpy as np +from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list # Comes from the harness -def truthfulqa_mc_metrics(doc: Doc, model_response: ModelResponse): - def mc1(lls): +class TruthfulqaMCMetrics(SampleLevelComputation): + def compute(self, doc: Doc, model_response: ModelResponse): + gold_ixs = as_list(doc.gold_index) + choices_logprob = model_response.logprobs + + # The harness assumes that all items are gold before the last one, but that is not always the case + # For gold ix 5, 6, 8, the harness will look at the first "gap" (7) and consider that the following + # items are not gold (even though here, 8 is gold). Example at item 371 of the dataset. + # This is broken and will have to be fixed once we OSS this, by actually separating + # gold and not gold items for mc2 computations + len_mc1 = doc.specific["len_mc1"] + last_harness_gold = gold_ixs[1] - 1 # fake value to init the loop + for g in gold_ixs[1:]: # we ignore the first item, which is the gold for mc1 + if last_harness_gold == g - 1: + last_harness_gold = g + else: + break + # TODO: This completely ignores any normalization, but keeping it as is + mc2_last_gold_ix = last_harness_gold - len_mc1 + 1 + mc1_lls, mc2_lls = choices_logprob[:len_mc1], choices_logprob[len_mc1:] + return {"truthfulqa_mc1": self.mc1(mc1_lls), "truthfulqa_mc2": self.mc2(mc2_lls, mc2_last_gold_ix)} + + def mc1(self, lls): # The gold answers in `mc1_targets` are always first (index = `0`). return np.argmax(lls) == 0 - def mc2(lls, split_idx): + def mc2(self, lls, split_idx): ll_true, ll_false = lls[:split_idx], lls[split_idx:] p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) p_true = p_true / (sum(p_true) + sum(p_false)) return sum(p_true) - - gold_ixs = as_list(doc.gold_index) - choices_logprob = model_response.logprobs - - # The harness assumes that all items are gold before the last one, but that is not always the case - # For gold ix 5, 6, 8, the harness will look at the first "gap" (7) and consider that the following - # items are not gold (even though here, 8 is gold). Example at item 371 of the dataset. 
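[Editor's note: a numeric sketch of the two harness scores this class reproduces. mc1 checks whether the gold completion (by construction index 0 of `mc1_targets`) has the top loglikelihood; mc2 is the share of probability mass on the true completions. All values below are made up:]

```python
import numpy as np

mc1_lls = [-1.2, -0.4, -2.0]  # gold is index 0; argmax is index 1, so mc1 = 0.0
mc1 = float(np.argmax(mc1_lls) == 0)

mc2_lls = [-0.5, -1.0, -2.5, -3.0]  # the first split_idx entries are the "true" answers
split_idx = 2
p_true, p_false = np.exp(mc2_lls[:split_idx]), np.exp(mc2_lls[split_idx:])
mc2 = p_true.sum() / (p_true.sum() + p_false.sum())  # ~0.88
print(mc1, mc2)
```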
- # This is broken and will have to be fixed once we OSS this, by actually separating - # gold and not gold items for mc2 computations - len_mc1 = doc.specific["len_mc1"] - last_harness_gold = gold_ixs[1] - 1 # fake value to init the loop - for g in gold_ixs[1:]: # we ignore the first item, which is the gold for mc1 - if last_harness_gold == g - 1: - last_harness_gold = g - else: - break - # TODO: This completely ignores any normalization, but keeping it as is - mc2_last_gold_ix = last_harness_gold - len_mc1 + 1 - mc1_lls, mc2_lls = choices_logprob[:len_mc1], choices_logprob[len_mc1:] - return {"truthfulqa_mc1": mc1(mc1_lls), "truthfulqa_mc2": mc2(mc2_lls, mc2_last_gold_ix)} diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 903295240..a0c75c133 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -20,30 +20,29 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from typing import Callable + +from copy import deepcopy import numpy as np from aenum import Enum from lighteval.metrics.dynamic_metrics import ( - ExprExtractionConfig, - IndicesExtractionConfig, - LatexExtractionConfig, - multilingual_extractive_match_metric, + MultilingualExtractiveMatchMetric, ) -from lighteval.metrics.harness_compatibility.drop import drop_metrics -from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics +from lighteval.metrics.harness_compatibility.drop import DropMetrics +from lighteval.metrics.harness_compatibility.truthful_qa import TruthfulqaMCMetrics from lighteval.metrics.metrics_corpus import ( CorpusLevelF1Score, CorpusLevelPerplexityMetric, CorpusLevelTranslationMetric, - matthews_corrcoef, + MatthewsCorrCoef, ) from lighteval.metrics.metrics_sample import ( BLEU, BLEURT, MRR, ROUGE, + AccGoldLikelihood, AvgAtK, BertScore, ExactMatches, @@ -57,15 +56,9 @@ PassAtK, Recall, StringDistance, - acc_golds_likelihood, ) from lighteval.metrics.normalizations import ( - LogProbCharNorm, bigbench_normalizer, - gsm8k_normalizer, - harness_triviaqa_normalizer, - helm_normalizer, - math_normalizer, remove_braces, remove_braces_and_strip, ) @@ -75,58 +68,81 @@ PerplexityPreparator, TargetPerplexityPreparator, ) +from lighteval.metrics.utils.extractive_match_utils import ( + ExprExtractionConfig, + IndicesExtractionConfig, + LatexExtractionConfig, +) from lighteval.metrics.utils.metric_utils import ( CorpusLevelMetric, CorpusLevelMetricGrouping, - Metric, - MetricGrouping, SampleLevelMetric, SampleLevelMetricGrouping, SamplingMethod, ) from lighteval.utils.language import Language -from lighteval.utils.utils import as_list class Metrics(Enum): acc_golds_likelihood = SampleLevelMetric( # todo: we need a better name for this! 
metric_name="acc", - sample_level_fn=acc_golds_likelihood, + sample_level_fn=AccGoldLikelihood(), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) + avg_at_k = SampleLevelMetric( + metric_name="avg@k", + sample_level_fn=AvgAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + avg_at_k_math = SampleLevelMetric( + metric_name="avg@k", + sample_level_fn=AvgAtK( + sample_scoring_function=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + precision=6, + ), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) bert_score = SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], - sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip).compute, + sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip), category=SamplingMethod.GENERATIVE, corpus_level_fn={"BERTScore-P": np.mean, "BERTScore-R": np.mean, "BERTScore-F": np.mean}, higher_is_better={"BERTScore-P": True, "BERTScore-R": True, "BERTScore-F": True}, ) bits_per_byte = CorpusLevelMetric( metric_name="bits_per_byte", - sample_level_fn=PerplexityPreparator(units_type="bytes").prepare, + sample_level_fn=PerplexityPreparator(units_type="bytes"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("bits_per_byte").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("bits_per_byte"), higher_is_better=False, ) bleu = CorpusLevelMetric( metric_name="bleu", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("bleu").compute, + corpus_level_fn=CorpusLevelTranslationMetric("bleu"), higher_is_better=True, ) bleu_1 = SampleLevelMetric( metric_name="bleu_1", - sample_level_fn=BLEU(n_gram=1).compute, + sample_level_fn=BLEU(n_gram=1), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) bleu_4 = SampleLevelMetric( metric_name="bleu_4", - sample_level_fn=BLEU(n_gram=4).compute, + sample_level_fn=BLEU(n_gram=4), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -134,69 +150,75 @@ class Metrics(Enum): bleurt = SampleLevelMetric( metric_name="bleurt", - sample_level_fn=BLEURT().compute, + sample_level_fn=BLEURT(), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) byte_perplexity = CorpusLevelMetric( metric_name="byte_perplexity", - sample_level_fn=PerplexityPreparator(units_type="bytes").prepare, + sample_level_fn=PerplexityPreparator(units_type="bytes"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity"), higher_is_better=False, ) chrf = CorpusLevelMetric( metric_name="chrf", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("chrf").compute, + corpus_level_fn=CorpusLevelTranslationMetric("chrf"), higher_is_better=True, ) chrf_plus = CorpusLevelMetric( metric_name="chrf++", - sample_level_fn=GenerativePreparator().prepare, + 
sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("chrf++").compute, + corpus_level_fn=CorpusLevelTranslationMetric("chrf++"), higher_is_better=True, ) copyright = SampleLevelMetricGrouping( metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"], sample_level_fn=StringDistance( metric_types=["longest_common_prefix_length", "edit_distance", "edit_similarity"], strip_prediction=True - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn={"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max}, higher_is_better={"longest_common_prefix_length": True, "edit_distance": False, "edit_similarity": True}, ) drop = SampleLevelMetricGrouping( - metric_name=["qem", "f1"], - sample_level_fn=drop_metrics, + metric_name=["em", "f1"], + sample_level_fn=DropMetrics(), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"qem": max, "f1": max}, - higher_is_better={"qem": True, "f1": True}, + corpus_level_fn={"em": max, "f1": max}, + higher_is_better={"em": True, "f1": True}, ) exact_match = SampleLevelMetric( metric_name="em", - sample_level_fn=ExactMatches(strip_strings=True).compute, + sample_level_fn=ExactMatches(strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - expr_gold_metric = multilingual_extractive_match_metric( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(ExprExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, + expr_gold_metric = SampleLevelMetric( + metric_name="extractive_match", + sample_level_fn=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + fallback_mode="first_match", + precision=5, + gold_extraction_target=(ExprExtractionConfig(),), + # Match boxed first before trying other regexes + pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + aggregation_function=max, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, ) extractiveness = SampleLevelMetricGrouping( metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], sample_level_fn=Extractiveness( normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ "summarization_coverage": np.mean, @@ -209,463 +231,170 @@ class Metrics(Enum): "summarization_compression": True, }, ) - f1_score_quasi = SampleLevelMetric( - metric_name="f1_score_quasi", - sample_level_fn=F1_score(normalize_gold=helm_normalizer, normalize_pred=helm_normalizer).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) f1_score = SampleLevelMetric( metric_name="f1", - sample_level_fn=F1_score().compute, + sample_level_fn=F1_score(), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) f1_score_macro = CorpusLevelMetric( metric_name="f1", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelF1Score(average="macro").compute, + corpus_level_fn=CorpusLevelF1Score(average="macro"), higher_is_better=True, ) f1_score_micro = CorpusLevelMetric( metric_name="f1", - 
sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelF1Score(average="micro").compute, + corpus_level_fn=CorpusLevelF1Score(average="micro"), higher_is_better=True, ) faithfulness = SampleLevelMetric( metric_name="summac", sample_level_fn=Faithfulness( normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - latex_gold_metric = multilingual_extractive_match_metric( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(LatexExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, - ) - loglikelihood_acc = SampleLevelMetric( - metric_name="acc", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - loglikelihood_acc_norm = SampleLevelMetric( - metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()).compute, - category=SamplingMethod.LOGPROBS, + g_pass_at_k = SampleLevelMetricGrouping( + metric_name="g-pass@k", + sample_level_fn=GPassAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - loglikelihood_acc_norm_nospace = SampleLevelMetric( - metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm(ignore_first_space=True)).compute, - category=SamplingMethod.LOGPROBS, + g_pass_at_k_math = SampleLevelMetricGrouping( + metric_name="math-g-pass@k", + sample_level_fn=GPassAtK( + name_prefix="math", + strip_strings=True, + sample_scoring_function=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + fallback_mode="first_match", + precision=5, + gold_extraction_target=(ExprExtractionConfig(),), + # Match boxed first before trying other regexes + pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + aggregation_function=max, + ), + ), + category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - loglikelihood_acc_norm_single_token = SampleLevelMetric( - metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()).compute, - category=SamplingMethod.LOGPROBS, + g_pass_at_k_latex = SampleLevelMetricGrouping( + metric_name="latex-g-pass@k", + sample_level_fn=GPassAtK( + name_prefix="latex", + strip_strings=True, + sample_scoring_function=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + fallback_mode="first_match", + precision=5, + gold_extraction_target=(LatexExtractionConfig(),), + # Match boxed first before trying other regexes + pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + aggregation_function=max, + ), + ), + category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - loglikelihood_acc_single_token = SampleLevelMetric( + loglikelihood_acc = SampleLevelMetric( metric_name="acc", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute, + sample_level_fn=LoglikelihoodAcc(logprob_normalization=None), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) loglikelihood_f1 = 
CorpusLevelMetric( metric_name="loglikelihood_f1", - sample_level_fn=LoglikelihoodPreparator().prepare, + sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(None).compute, + corpus_level_fn=CorpusLevelF1Score(None), higher_is_better=True, ) - loglikelihood_f1_single_token = CorpusLevelMetric( - metric_name="loglikelihood_f1", - sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(None).compute, + maj_at_k = SampleLevelMetric( + metric_name="maj@k", + sample_level_fn=MajAtK(), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, higher_is_better=True, ) mcc = CorpusLevelMetric( metric_name="mcc", - sample_level_fn=LoglikelihoodPreparator().prepare, + sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, - corpus_level_fn=matthews_corrcoef, + corpus_level_fn=MatthewsCorrCoef(), higher_is_better=True, ) - mcc_single_token = CorpusLevelMetric( - metric_name="mcc", - sample_level_fn=LoglikelihoodPreparator().prepare, + mrr = SampleLevelMetric( + metric_name="mrr", + sample_level_fn=MRR(), category=SamplingMethod.LOGPROBS, - corpus_level_fn=matthews_corrcoef, - higher_is_better=True, - ) - maj_at_4_math = SampleLevelMetric( - metric_name="maj@4", - sample_level_fn=MajAtK( - k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - maj_at_5 = SampleLevelMetric( - metric_name="maj@5", - sample_level_fn=MajAtK(k=5).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - maj_at_8 = SampleLevelMetric( - metric_name="maj@8", - sample_level_fn=MajAtK(k=8).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - maj_at_8_gsm8k = SampleLevelMetric( - metric_name="maj@8", - sample_level_fn=MajAtK( - k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_avg_at_64 = SampleLevelMetric( - metric_name="math_avg@64", - sample_level_fn=AvgAtK( - k=64, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - - math_pass_at_1_1n = SampleLevelMetric( - metric_name="math_pass@1:1_samples", - sample_level_fn=PassAtK( - k=1, - n=1, - strip_strings=True, - # Extracting mathematical expressions and latex expressions - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, + multi_f1_numeric = CorpusLevelMetric( + metric_name="mf1", + sample_level_fn=LoglikelihoodPreparator(is_single_token=True), + category=SamplingMethod.LOGPROBS, + 
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3), higher_is_better=True, ) - math_pass_at_1_4n = SampleLevelMetric( - metric_name="math_pass@1:4_samples", - sample_level_fn=PassAtK( - k=1, - n=4, - strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, + pass_at_k = SampleLevelMetric( + metric_name="pass@k", + sample_level_fn=PassAtK(strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - math_pass_at_1_8n = SampleLevelMetric( - metric_name="math_pass@1:8_samples", + pass_at_k_math = SampleLevelMetric( + metric_name="pass@k", sample_level_fn=PassAtK( - k=1, - n=8, strip_strings=True, # Extracting mathematical expressions and latex expressions - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_16n = SampleLevelMetric( - metric_name="math_pass@1:16_samples", - sample_level_fn=PassAtK( - k=1, - n=16, - strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_32n = SampleLevelMetric( - metric_name="math_pass@1:32_samples", - sample_level_fn=PassAtK( - k=1, - n=32, - strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), - ).compute, + ), + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - math_pass_at_1_64n = SampleLevelMetric( - metric_name="math_pass@1:64_samples", + pass_at_k_letters = SampleLevelMetric( + metric_name="pass@k", sample_level_fn=PassAtK( - k=1, - n=64, - strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - 
higher_is_better=True, - ) - - mrr = SampleLevelMetric( - metric_name="mrr", - sample_level_fn=MRR().compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - mrr_single_token = SampleLevelMetric( - metric_name="mrr", - sample_level_fn=mrr, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - multi_f1_numeric = CorpusLevelMetric( - metric_name="mf1", - sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute, - higher_is_better=True, - ) - avg_at_64 = SampleLevelMetric( - metric_name="avg@64", - sample_level_fn=PassAtK(k=64, n=64, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - pass_at_1 = SampleLevelMetric( - metric_name="pass@1:32_samples", - sample_level_fn=PassAtK(k=1, n=32, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - pass_at_10 = SampleLevelMetric( - metric_name="pass@10:32_samples", - sample_level_fn=PassAtK(k=10, n=32, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - pass_at_100 = SampleLevelMetric( - metric_name="pass@100:32_samples", - sample_level_fn=PassAtK(k=100, n=32, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - g_pass_at_16 = SampleLevelMetricGrouping( - metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - g_pass_at_8_16 = SampleLevelMetricGrouping( - metric_name=["G-Pass@8-16:48_samples"], - sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - g_pass_at_16_expr_gold = SampleLevelMetricGrouping( - metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK( - k=16, - n=48, - strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(ExprExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - g_pass_at_16_latex_gold = SampleLevelMetricGrouping( - metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK( - k=16, - n=48, - strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - 
gold_extraction_target=(LatexExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - perfect_exact_match = SampleLevelMetric( - metric_name="perfect_em", - sample_level_fn=ExactMatches().compute, + ), + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) prediction_perplexity = SampleLevelMetric( metric_name="ppl", - sample_level_fn=None, # todo!!! + sample_level_fn=PerplexityPreparator("words"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("perplexity").compute, - higher_is_better=True, - ) - prefix_exact_match = SampleLevelMetric( - metric_name="pem", - sample_level_fn=ExactMatches(strip_strings=True, type_exact_match="prefix").compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - prefix_quasi_exact_match = SampleLevelMetric( - metric_name="pqem", - sample_level_fn=ExactMatches( - normalize_gold=helm_normalizer, - normalize_pred=helm_normalizer, - type_exact_match="prefix", - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches( - normalize_gold=helm_normalizer, - normalize_pred=helm_normalizer, - strip_strings=True, - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match_math = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches( - strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match_triviaqa = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches(strip_strings=True, normalize_pred=harness_triviaqa_normalizer).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match_gsm8k = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches( - strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - recall_at_1_single_token = SampleLevelMetric( - metric_name="acc", - sample_level_fn=Recall(at=1).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - recall_at_2_single_token = SampleLevelMetric( - metric_name="recall@2", - sample_level_fn=Recall(at=2).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, + corpus_level_fn=CorpusLevelPerplexityMetric("perplexity"), higher_is_better=True, ) - recall_at_1 = SampleLevelMetric( - metric_name="acc", - sample_level_fn=Recall(at=1), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - recall_at_2 = SampleLevelMetric( - metric_name="recall@2", - sample_level_fn=Recall(at=2), + recall_at_k = SampleLevelMetric( + metric_name="recall", + sample_level_fn=Recall(k=1), 
category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, @@ -677,35 +406,35 @@ class Metrics(Enum): bootstrap=True, normalize_gold=bigbench_normalizer, normalize_pred=bigbench_normalizer, - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn={"rouge1": np.mean, "rouge2": np.mean, "rougeL": np.mean, "rougeLsum": np.mean}, higher_is_better={"rouge1": True, "rouge2": True, "rougeL": True, "rougeLsum": True}, ) rouge1 = SampleLevelMetric( metric_name="rouge1", - sample_level_fn=ROUGE("rouge1").compute, + sample_level_fn=ROUGE("rouge1"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) rouge2 = SampleLevelMetric( metric_name="rouge2", - sample_level_fn=ROUGE("rouge2").compute, + sample_level_fn=ROUGE("rouge2"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) rougeL = SampleLevelMetric( metric_name="rougeL", - sample_level_fn=ROUGE("rougeL").compute, + sample_level_fn=ROUGE("rougeL"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) rougeLsum = SampleLevelMetric( metric_name="rougeLsum", - sample_level_fn=ROUGE("rougeLsum").compute, + sample_level_fn=ROUGE("rougeLsum"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -714,90 +443,62 @@ class Metrics(Enum): metric_name=["simpleqa_judge"], higher_is_better={"simpleqa_judge": True}, category=SamplingMethod.GENERATIVE, - sample_level_fn=JudgeLLMSimpleQA().compute, - batched_compute=True, + sample_level_fn=JudgeLLMSimpleQA(), corpus_level_fn={ "simpleqa_judge": np.mean, }, + batched_compute=True, ) target_perplexity = SampleLevelMetric( metric_name="ppl", - sample_level_fn=TargetPerplexityPreparator(units_type="words").prepare, + sample_level_fn=TargetPerplexityPreparator(units_type="words"), category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelPerplexityMetric("perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("perplexity"), higher_is_better=False, ) ter = CorpusLevelMetric( metric_name="ter", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("ter").compute, + corpus_level_fn=CorpusLevelTranslationMetric("ter"), higher_is_better=False, ) truthfulqa_mc_metrics = SampleLevelMetricGrouping( metric_name=["truthfulqa_mc1", "truthfulqa_mc2"], - sample_level_fn=truthfulqa_mc_metrics, + sample_level_fn=TruthfulqaMCMetrics(), category=SamplingMethod.LOGPROBS, corpus_level_fn={"truthfulqa_mc1": np.mean, "truthfulqa_mc2": np.mean}, higher_is_better={"truthfulqa_mc1": True, "truthfulqa_mc2": True}, ) word_perplexity = CorpusLevelMetric( metric_name="word_perplexity", - sample_level_fn=PerplexityPreparator(units_type="words").prepare, + sample_level_fn=PerplexityPreparator(units_type="words"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity"), higher_is_better=False, ) - gpqa_instruct_metric = multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, - ) - gpqa_instruct_pass_at_1_1n = SampleLevelMetric( - metric_name="gpqa_pass@1:1_samples", - sample_level_fn=PassAtK( - k=1, - n=1, - 
sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - gpqa_instruct_pass_at_1_4n = SampleLevelMetric( - metric_name="gpqa_pass@1:4_samples", - sample_level_fn=PassAtK( - k=1, - n=4, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, - ).sample_level_fn(doc, model_response), - ).compute, + gpqa_instruct_metric = SampleLevelMetric( + metric_name="extractive_match", + sample_level_fn=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - gpqa_instruct_pass_at_1_8n = SampleLevelMetric( - metric_name="gpqa_pass@1:8_samples", + gpqa_instruct_pass_at_k = SampleLevelMetric( + metric_name="gpqa_pass@k", sample_level_fn=PassAtK( - k=1, - n=8, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, - ).sample_level_fn(doc, model_response), - ).compute, + ), + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -806,34 +507,8 @@ class Metrics(Enum): def __str__(self): return self.name.replace("_at_", "@") - @staticmethod - def higher_is_better(): - res = {} - for metric in Metrics: - if isinstance(metric.value, MetricGrouping): - res.update(metric.value.higher_is_better) - else: - res[metric.value.metric_name] = metric.value.higher_is_better - return res - - @staticmethod - def corpus_level_fns(metrics: list[Metric]) -> dict[str, Callable]: - res = {} - for metric in metrics: - if isinstance(metric, MetricGrouping): - if isinstance(metric.corpus_level_fn, dict): - res.update(metric.corpus_level_fn) - else: - # Must make sure there is a caching implementation here - for m in metric.metric_name: - res[m] = metric.corpus_level_fn - else: - res[metric.metric_name] = metric.corpus_level_fn - return res - - @staticmethod - def all_metrics(): - res = [] - for metric in Metrics: - res.extend(as_list(metric.value.metric_name)) - return res + def __call__(self, sample_params): + # When parametrizing, we don't look at the Metrics enum, + # but at a specific single metric (a value) + # Be very careful to not change the default value of the enum + return deepcopy(self.value)(sample_params=sample_params) diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 030725a53..09018bf70 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ 
b/src/lighteval/metrics/metrics_corpus.py @@ -27,6 +27,7 @@ import logging import math +from abc import ABC, abstractmethod from typing import Literal import numpy as np @@ -44,22 +45,29 @@ logger = logging.getLogger(__name__) +class CorpusLevelComputation(ABC): + @abstractmethod + def compute_corpus(self): + raise NotImplementedError + + # General aggregations -def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float: - """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). +class MatthewsCorrCoef(CorpusLevelComputation): + def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float: + """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). - Args: - items (list[dict]): List of GenerativeCorpusMetricInput + Args: + items (list[dict]): List of GenerativeCorpusMetricInput - Returns: - float: Score - """ - golds = [i.golds for i in items] - preds = [i.preds for i in items] - return sklearn.metrics.matthews_corrcoef(golds, preds) + Returns: + float: Score + """ + golds = [i.golds for i in items] + preds = [i.preds for i in items] + return sklearn.metrics.matthews_corrcoef(golds, preds) -class CorpusLevelF1Score: +class CorpusLevelF1Score(CorpusLevelComputation): def __init__(self, average: str, num_classes: int = 2): """Stores the relevant parameters for the task's corpus level f1 score. @@ -74,7 +82,7 @@ def __init__(self, average: str, num_classes: int = 2): self.average = average self.num_classes = num_classes - def compute(self, items: list[LogprobCorpusMetricInput]): + def compute_corpus(self, items: list[LogprobCorpusMetricInput]): """Computes the metric score over all the corpus generated items, by using the scikit learn implementation.""" golds = [i.golds for i in items] preds = [i.preds for i in items] @@ -90,7 +98,7 @@ def compute(self, items: list[LogprobCorpusMetricInput]): return float(np.mean(f1s)) -class CorpusLevelTranslationMetric: +class CorpusLevelTranslationMetric(CorpusLevelComputation): def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""): """Stores the relevant parameters for a corpus level translation metric. @@ -112,7 +120,7 @@ def get_metric(self): else: raise ValueError(f"Unknown corpus level translation metric type : {self.metric_type}") - def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: + def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float: """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation.""" metric = self.get_metric() golds = [i.golds for i in items] @@ -127,7 +135,7 @@ def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: return float(metric.corpus_score(hypotheses=preds, references=golds).score) -class CorpusLevelPerplexityMetric: +class CorpusLevelPerplexityMetric(CorpusLevelComputation): def __init__(self, metric_type: str): """Stores the relevant parameter for a corpus level perplexity metric. 
Perplexity metrics compute more or less the same thing, which is a variation on the @@ -145,7 +153,7 @@ def __init__(self, metric_type: str): self.metric_type = metric_type - def compute(self, items: list[PerplexityCorpusMetricInput]): + def compute_corpus(self, items: list[PerplexityCorpusMetricInput]): """Computes the metric score over all the corpus generated items.""" logprobs = [i.logprobs for i in items] weights = [i.weights for i in items] diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 706a7664a..ce2005c1b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -24,8 +24,10 @@ using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category. """ +import inspect import logging import os +from abc import ABC, abstractmethod from typing import Callable, Literal, Union import nltk @@ -42,7 +44,6 @@ from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.imports.data_stats_metric import DataStatsMetric from lighteval.metrics.imports.summac import SummaCZS -from lighteval.metrics.llm_as_judge import JudgeLM from lighteval.metrics.normalizations import ( LogProbNormalization, LogProbTokenNorm, @@ -51,6 +52,7 @@ remove_braces_and_strip, ) from lighteval.metrics.utils.judge_utils import get_judge_prompt_simpleqa, process_judge_response_simpleqa +from lighteval.metrics.utils.llm_as_judge import JudgeLM from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list, safe_divide @@ -59,7 +61,13 @@ logger = logging.getLogger(__name__) -class ExactMatches: +class SampleLevelComputation(ABC): + @abstractmethod + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): + raise NotImplementedError + + +class ExactMatches(SampleLevelComputation): def __init__( self, aggregation_function: Callable[[list[float]], float] = max, @@ -147,7 +155,7 @@ def compute_one_item( return 1 if gold == pred else 0 -class F1_score: +class F1_score(SampleLevelComputation): def __init__( self, aggregation_function: Callable[[list[float]], float] = max, @@ -219,7 +227,7 @@ def compute_one_item(self, gold: str, pred: str) -> float: return ret -class LoglikelihoodAcc: +class LoglikelihoodAcc(SampleLevelComputation): def __init__(self, logprob_normalization: LogProbNormalization | None = None): """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices is actually in the gold ones. @@ -276,7 +284,7 @@ def compute( return int(best_choice in gold_ixs) -class NormalizedMultiChoiceProbability: +class NormalizedMultiChoiceProbability(SampleLevelComputation): def __init__( self, log_prob_normalization: LogProbNormalization | None = None, @@ -339,7 +347,7 @@ def compute( return gold_idx_agg_prob -class Probability: +class Probability(SampleLevelComputation): def __init__( self, normalization: LogProbTokenNorm | None = None, @@ -392,15 +400,15 @@ def compute( return self.aggregation_function(probs) -class Recall: - def __init__(self, at: int) -> None: - """Recall metric class. It checks if the top `at` best choices include one of the golds or not. +class Recall(SampleLevelComputation): + def __init__(self, k: int) -> None: + """Recall metric class. It checks if the top `k` best choices include one of the golds or not. Args: at (int): Depth level of the recall. Recall at 1 is equivalent to a logprob accuracy without normalization. 
""" - self.recall_depth = at + self.recall_depth = k def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> int: """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the @@ -421,7 +429,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> int: return int(any(ix in gold_ixs for ix in np.array(choices_logprobs).argsort()[::-1][: self.recall_depth])) -class MRR: +class MRR(SampleLevelComputation): def __init__(self, length_normalization: bool = False): """A mean reciprocal rank class. @@ -455,19 +463,20 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: return 1.0 / (min(ranked_choices) + 1) -def acc_golds_likelihood(doc, model_response, **kwargs) -> int: - """Tests if at least one of predicted gold targets' argmax of logits equals the gold. +class AccGoldLikelihood(SampleLevelComputation): + def compute(self, doc, model_response, **kwargs) -> int: + """Tests if at least one of predicted gold targets' argmax of logits equals the gold. - Args: - argmax_logits_eq_gold_list (list[int]): List of scores 1/0 indicating whether the argmax of logits equals the gold + Args: + argmax_logits_eq_gold_list (list[int]): List of scores 1/0 indicating whether the argmax of logits equals the gold - Returns: - int: 1 if at least one of the possible golds has argmax of logits == gold, 0 otherwise - """ - return int(any(model_response.argmax_logits_eq_gold)) + Returns: + int: 1 if at least one of the possible golds has argmax of logits == gold, 0 otherwise + """ + return int(any(model_response.argmax_logits_eq_gold)) -class ROUGE: +class ROUGE(SampleLevelComputation): ALLOWED_ROUGE_METHODS = ["rouge1", "rouge2", "rougeL", "rougeLsum"] def __init__( @@ -578,7 +587,7 @@ def _rouge_score_with_bootsrap(self, golds: list[str], predictions: list[str]): return {method: result[method].mid.fmeasure * 100 for method in self.methods} -class BertScore: +class BertScore(SampleLevelComputation): def __init__( self, normalize_gold: Callable | None = None, @@ -640,7 +649,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()} -class Extractiveness: +class Extractiveness(SampleLevelComputation): def __init__( self, normalize_input: callable = remove_braces, @@ -694,7 +703,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str } -class Faithfulness: +class Faithfulness(SampleLevelComputation): def __init__( self, normalize_input: Callable = remove_braces, @@ -743,7 +752,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str return self.summac.score_one(inp, prediction)["score"] -class BLEURT: +class BLEURT(SampleLevelComputation): def __init__(self): """Creates a BLEURT scorer using a light bleurt-tiny-512 model. For more complex use cases, could also be Elron/bleurt-base-128 @@ -782,7 +791,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: return scores.item() -class BLEU: +class BLEU(SampleLevelComputation): def __init__(self, n_gram: int): """BLEU scorer class. Relies on `nltk`'s sentencebleu for scoring. TODO: Will have to move this to sacrebleu. 
@@ -820,7 +829,7 @@ def _bleu_score(self, gold: list[str], pred: str):
         return sentence_bleu([word_tokenize(g) for g in gold], word_tokenize(pred), weights=weights)


-class StringDistance:
+class StringDistance(SampleLevelComputation):
     def __init__(
         self,
         metric_types: list[str] | str,
@@ -911,7 +920,7 @@ def edit_similarity(self, s1, s2):
         return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0


-class JudgeLLM:
+class JudgeLLM(SampleLevelComputation):
     available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4", "gpt-4o-2024-08-06"]

     def __init__(
@@ -1075,12 +1084,72 @@ def compute(self, model_responses: list[ModelResponse], docs: list[Doc], **kwarg
         return metrics


-class AvgAtK:
+class SamplingMetric:
+    """All sampling metrics defined below share the same normalization parameters and the same default
+    sample_scoring_function behavior. This class holds the normalization, applies it to every sample
+    passed to preprocess, and falls back to the default scoring function when none is provided.
+    """
+
     def __init__(
         self,
-        k: int,
+        normalize: Callable | str | None = None,
+        strip_strings: bool = False,
         sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None,
     ):
+        if isinstance(normalize, str):
+            import lighteval.metrics.normalizations
+
+            # inspect.getmembers returns (name, fn) pairs; build a {name: fn} dict for the lookup
+            allowed_normalizations = dict(
+                inspect.getmembers(lighteval.metrics.normalizations, inspect.isfunction)
+            )
+            if normalize in allowed_normalizations:
+                self.normalize = allowed_normalizations[normalize]
+            else:
+                raise ValueError(f"Unknown normalization function: {normalize}")
+        else:
+            self.normalize = normalize
+        self.strip_strings = strip_strings
+
+        if callable(sample_scoring_function):
+            self.score_sample = sample_scoring_function
+            self.type_exact_match = None
+        else:
+            if isinstance(sample_scoring_function, str):
+                if sample_scoring_function not in ["prefix", "suffix", "full"]:
+                    raise ValueError(
+                        f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
+                    )
+                self.type_exact_match = sample_scoring_function
+            else:
+                self.type_exact_match = "full"
+            self.score_sample = self.default_sample_scoring
+        # Alias: some subclasses call compute_score, others score_sample; keep both bound
+        self.compute_score = self.score_sample
+
+    def preprocess(self, text: str) -> str:
+        if not text:
+            return ""
+
+        if self.strip_strings:
+            text = text.strip()
+
+        if self.normalize:
+            text = self.normalize(text)
+
+        return text
+
+    def default_sample_scoring(self, doc: Doc, model_response: ModelResponse) -> int:
+        gold = doc.get_golds()[0]
+        pred = model_response.final_text[0]
+        if self.type_exact_match == "prefix":
+            return 1 if pred.startswith(gold) else 0
+        if self.type_exact_match == "suffix":
+            return 1 if pred.endswith(gold) else 0
+        return 1 if gold == pred else 0
+
+    def name_metrics(self) -> str | list[str]:
+        raise NotImplementedError
+
+
+class AvgAtK(SamplingMetric, SampleLevelComputation):
+    def __init__(self, k: int | None = None, **kwargs):
         """Sample score averages all the individual k predictions scores.

         Args:
@@ -1092,20 +1161,9 @@ def __init__(
             sample_scoring_function (callable | str, optional): Function to use to compute the score for each sample.
                 If None, uses the default scoring function which is a simple exact match.
""" + super().__init__(kwargs) self.k = k - # Managed the logic of the per prediction of sample scoring - if callable(sample_scoring_function): - self.compute_score = sample_scoring_function - else: - if isinstance(sample_scoring_function, str): - if sample_scoring_function not in ["prefix", "suffix", "full"]: - raise ValueError( - f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead." - ) - type_exact_match = sample_scoring_function - else: - type_exact_match = "full" - self.compute_score = self.default_sample_scoring(type_exact_match) + self.attribute_must_be_set = ["k"] def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): """Computes the metric over a list of golds and predictions for one single sample. @@ -1126,55 +1184,17 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): avg_score = np.mean(all_scores) return avg_score - def default_sample_scoring(self, type_exact_match: str) -> callable: - def sample_scoring_function(doc: Doc, model_response: ModelResponse) -> int: - """Default sample scoring function that checks if the prediction is equal to the gold.""" - pred = model_response.final_text[0] - gold = doc.get_golds()[0] - - if type_exact_match == "prefix": - return 1 if pred.startswith(gold) else 0 - if type_exact_match == "suffix": - return 1 if pred.endswith(gold) else 0 - return 1 if gold == pred else 0 + def num_samples(self): + return self.k - return sample_scoring_function +class MajAtK(SamplingMetric, SampleLevelComputation): + def __init__(self, k: int = None, **kwargs): + """An exact match class.""" + super().__init__(kwargs) -class MajAtK: - def __init__( - self, - k: int, - normalize_gold: Callable | None = None, - normalize_pred: Callable | None = None, - strip_strings: bool = False, - type_exact_match: str = "full", - ): - """An exact match class. - - Args: - normalize_gold (callable, optional): Function to use to normalize the reference strings. - Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. - Defaults to None if no normalization is applied. - strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. - type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). - Can be any of `prefix`, `suffix` or `full`. Defaults to "full". - `prefix` checks if the prediction starts with the gold, - `suffix` if the prediction ends with the gold, - `full` if the prediction and gold are equal - """ self.k = k - self.normalize_gold = normalize_gold - self.normalize_pred = normalize_pred - self.strip_strings = strip_strings - - if type_exact_match not in ["prefix", "suffix", "full"]: - # todo: we could add a set exact match - raise ValueError( - f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {type_exact_match} instead." - ) - self.type_exact_match = type_exact_match + self.attribute_must_be_set = ["k"] def compute(self, model_response: ModelResponse, docs: Doc, **kwargs): """Computes the metric over a list of golds and predictions for one single sample. @@ -1188,94 +1208,43 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs): Returns: float: Aggregated score over the current sample's items. 
""" + if self.k is None: + raise Exception("You did not set the value of k") golds = docs.get_golds() - predictions = model_response.final_text if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") - gold = self.get_processed_gold(golds[0]) + processed_choices = [self.preprocess(gold=g) for g in docs.get_golds()] + new_doc = Doc( + choices=processed_choices, + query=docs.query, + gold_index=docs.gold_index, + ) all_answers = [] - for pred in predictions[: self.k]: - all_answers.append(self.get_processed_pred(pred=pred)) + for pred in model_response.final_text[: self.k]: + all_answers.append(self.preprocess(pred=pred)) majority_prediction = max(all_answers, key=all_answers.count) - return self.compute_score(majority_prediction, gold) - - def get_processed_gold(self, gold: str) -> str: - if self.strip_strings: - gold = gold.strip() - - if self.normalize_gold: - gold = self.normalize_gold(gold) - - return gold - - def get_processed_pred(self, pred: str) -> str: - if not pred: - return "" - - if self.strip_strings: - pred = pred.strip() - - if self.normalize_pred: - pred = self.normalize_pred(pred) - - return pred + new_model_response = ModelResponse( + text=[majority_prediction], + ) + return self.compute_score(new_model_response, new_doc) - def compute_score(self, pred: str, gold: str) -> int: - if self.type_exact_match == "prefix": - return 1 if pred.startswith(gold) else 0 - if self.type_exact_match == "suffix": - return 1 if pred.endswith(gold) else 0 - return 1 if gold == pred else 0 + def num_samples(self): + return self.k -class PassAtK: - def __init__( - self, - k: int, - n: int | None = None, - normalize_gold: Callable | None = None, - normalize_pred: Callable | None = None, - strip_strings: bool = False, - sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None, - ): +class PassAtK(SamplingMetric, SampleLevelComputation): + def __init__(self, k: int | None = None, n: int | None = None, **kwargs): """Computing pass at k Args: k (int): Threshold for the number of successful attempts. n (int): Number of samples to generate - normalize_gold (callable, optional): Function to use to normalize the reference strings. - Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. - Defaults to None if no normalization is applied. - strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. - sample_scoring_function (callable or str, optional): Function to use to score each sample. - Either pass the full function (should take a string prediction and a string gold, and return a score between 0 and 1) - a string (any of `prefix`, `suffix` or `full`) to define the type of exact match that you want, or nothing to defaults to "full". 
-                `prefix` checks if the prediction starts with the gold,
-                `suffix` if the prediction ends with the gold,
-                `full` if the prediction and gold are equal
         """
+        super().__init__(**kwargs)
         self.k = k
         self.n = n
-        self.normalize_gold = normalize_gold
-        self.normalize_pred = normalize_pred
-        self.strip_strings = strip_strings
-
-        # Managed the logic of the per prediction of sample scoring
-        if callable(sample_scoring_function):
-            self.score_sample = sample_scoring_function
-            self.type_exact_match = None
-        else:
-            if isinstance(sample_scoring_function, str):
-                if sample_scoring_function not in ["prefix", "suffix", "full"]:
-                    raise ValueError(
-                        f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
-                    )
-                self.type_exact_match = sample_scoring_function
-            else:
-                self.type_exact_match = "full"
-            self.score_sample = self.default_sample_scoring
+        self.attribute_must_be_set = ["k"]

     def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
         """Computes the metric over a list of golds and predictions for one single item with possibly many samples.
@@ -1290,17 +1259,17 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
             float: Aggregated score over the current sample's items.
         """
         golds = doc.get_golds()
-        predictions = model_response.final_text

         if len(golds) > 1:
             raise Exception("Cannot compute pass@k with several golds")

+        predictions = model_response.final_text
         if self.n is None:
             self.n = len(predictions)
             logger.warning("n undefined in the pass@k. We assume it's the same as the sample's number of predictions.")
         elif len(predictions) < self.n:
             logger.warning(f"Number of predictions is less than {self.n} for pass@k.")

-        processed_choices = [self.get_processed_gold(gold=g) for g in doc.choices]
+        processed_choices = [self.preprocess(g) for g in doc.choices]
         new_doc = Doc(
             choices=processed_choices,
             query=doc.query,
@@ -1309,7 +1278,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:

         all_scores = []
         for pred in predictions[: self.n]:
-            cur_pred = self.get_processed_pred(pred=pred)
+            cur_pred = self.preprocess(pred)
             new_model_response = ModelResponse(
                 text=[cur_pred],
             )
@@ -1317,37 +1286,6 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:

         return self.pass_at_k(all_scores)

-    def get_processed_gold(self, gold: str) -> str:
-        if self.strip_strings:
-            gold = gold.strip()
-
-        if self.normalize_gold:
-            gold = self.normalize_gold(gold)
-
-        return gold
-
-    def get_processed_pred(self, pred: str) -> str:
-        if not pred:
-            return ""
-
-        if self.strip_strings:
-            pred = pred.strip()
-
-        if self.normalize_pred:
-            pred = self.normalize_pred(pred)
-
-        return pred
-
-    def default_sample_scoring(self, doc, model_response) -> int:
-        pred = model_response.final_text[0]
-        gold = doc.get_golds()[0]
-
-        if self.type_exact_match == "prefix":
-            return 1 if pred.startswith(gold) else 0
-        if self.type_exact_match == "suffix":
-            return 1 if pred.endswith(gold) else 0
-        return 1 if gold == pred else 0
-
     def pass_at_k(self, all_scores: list[int]) -> float:
         """Algo from https://arxiv.org/pdf/2107.03374"""
         c: int = all_scores.count(1)
@@ -1356,17 +1294,18 @@
         return 1.0 - np.prod(1.0 - self.k / np.arange(self.n - c + 1, self.n + 1))

+    def num_samples(self):
+        return self.n if self.n is not None else self.k

-class GPassAtK:
+
+class GPassAtK(SamplingMetric, SampleLevelComputation):
     def __init__(
         self,
-        k: Union[int, list[int]],
+        k: Union[int, list[int]] | None = None,
         n: int | None = None,
         thresholds: list[float] = [0.0, 0.25, 0.5, 0.75, 1.0],
-        normalize_gold: Callable | None = None,
-        normalize_pred: Callable | None = None,
-        strip_strings: bool = False,
-        sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None,
+        name_prefix: str | None = None,
+        **kwargs,
     ):
         """Computing G-Pass@k from http://arxiv.org/abs/2412.13147

@@ -1374,39 +1313,22 @@ def __init__(
             k (int, list): The number of successful attempts to be considered.
             n (int): Number of samples to generate.
             thresholds (list): Thresholds to control successful attempts in k generate.
-            normalize_gold (callable, optional): Function to use to normalize the reference strings.
-                Defaults to None if no normalization is applied.
-            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
-                Defaults to None if no normalization is applied.
-            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
-            sample_scoring_function (callable or str, optional): Function to use to score each sample.
-                Either pass the full function (should take a string prediction and a string gold, and return a score between 0 and 1)
-                a string (any of `prefix`, `suffix` or `full`) to define the type of exact match that you want, or nothing to defaults to "full".
-                `prefix` checks if the prediction starts with the gold,
-                `suffix` if the prediction ends with the gold,
-                `full` if the prediction and gold are equal
         """
-        self.k = as_list(k)
+        super().__init__(**kwargs)
+        self._k = k
         self.n = n
+        self.attribute_must_be_set = ["k"]
+
         self.thresholds = thresholds
-        self.normalize_gold = normalize_gold
-        self.normalize_pred = normalize_pred
-        self.strip_strings = strip_strings
+        # No trailing "@" here: metric names are built as f"{self.name}@{k}_{t}" below
+        self.name = (f"{name_prefix}_" if name_prefix else "") + "g-pass"

-        # Managed the logic of the per prediction of sample scoring
-        if callable(sample_scoring_function):
-            self.score_sample = sample_scoring_function
-            self.type_exact_match = None
-        else:
-            if isinstance(sample_scoring_function, str):
-                if sample_scoring_function not in ["prefix", "suffix", "full"]:
-                    raise ValueError(
-                        f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
-                    )
-                self.type_exact_match = sample_scoring_function
-            else:
-                self.type_exact_match = "full"
-            self.score_sample = self.default_sample_scoring
+    @property
+    def k(self):
+        return as_list(self._k)
+
+    @k.setter
+    def k(self, new_val):
+        self._k = as_list(new_val)

     def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:
         """Computes the metric over a list of golds and predictions for one single item with possibly many samples.
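A quick sketch of the reworked `GPassAtK` surface (illustrative; the `metric_names` property it relies on is defined in the next hunk, and the exact name strings follow from the `self.name` built above):

```python
from lighteval.metrics.metrics_sample import GPassAtK

g = GPassAtK(k=[4, 8], n=16, thresholds=[0.0, 0.5, 1.0])
print(g.k)  # [4, 8] -- the property wraps _k with as_list, so a scalar k comes back as [k]
print(g.metric_names)
# ['g-pass@4_0.0', 'g-pass@4_0.5', 'g-pass@4_1.0', 'mg-pass@4', 'g-pass@8_0.0', ...]
```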
@@ -1434,7 +1356,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:
         elif len(predictions) < self.n:
             logger.warning(f"Number of predictions is less than {self.n} for G-Pass@k.")

-        processed_choices = [self.get_processed_gold(gold=g) for g in doc.choices]
+        processed_choices = [self.preprocess(g) for g in doc.choices]
         new_doc = Doc(
             choices=processed_choices,
             query=doc.query,
@@ -1443,7 +1365,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:

         all_scores = []
         for pred in predictions[: self.n]:
-            cur_pred = self.get_processed_pred(pred=pred)
+            cur_pred = self.preprocess(pred)
             new_model_response = ModelResponse(
                 text=[cur_pred],
             )
@@ -1451,36 +1373,6 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:

         return self.g_pass_at_k(all_scores)

-    def get_processed_gold(self, gold: str) -> str:
-        if self.strip_strings:
-            gold = gold.strip()
-
-        if self.normalize_gold:
-            gold = self.normalize_gold(gold)
-
-        return gold
-
-    def get_processed_pred(self, pred: str) -> str:
-        if not pred:
-            return ""
-
-        if self.strip_strings:
-            pred = pred.strip()
-
-        if self.normalize_pred:
-            pred = self.normalize_pred(pred)
-
-        return pred
-
-    def default_sample_scoring(self, doc: Doc, model_response: ModelResponse) -> int:
-        gold = doc.get_golds()[0]
-        pred = model_response.final_text[0]
-        if self.type_exact_match == "prefix":
-            return 1 if pred.startswith(gold) else 0
-        if self.type_exact_match == "suffix":
-            return 1 if pred.endswith(gold) else 0
-        return 1 if gold == pred else 0
-
     def g_pass_at_k(self, all_scores: list[int]) -> float:
         """Computation of G-Pass@k details from http://arxiv.org/abs/2412.13147"""
         c: int = sum(all_scores)
@@ -1510,20 +1402,23 @@ def compute_mg_pass_at_k(n, c, k):
         metrics = {}
         for k in ks:
             for t in thresholds:
-                metrics[f"G-Pass@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
-            metrics[f"mG-Pass@{k}"] = compute_mg_pass_at_k(n, c, k)
+                metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
+            metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)

         return metrics

     @property
-    def all_metrics(self):
+    def metric_names(self):
         ks: int = self.k
         thresholds: list[float] = self.thresholds

         metrics = []
         for k in ks:
             for t in thresholds:
-                metrics.append(f"G-Pass@{k}_{t}")
-            metrics.append(f"mG-Pass@{k}")
+                metrics.append(f"{self.name}@{k}_{t}")
+            metrics.append(f"m{self.name}@{k}")
         return metrics
+
+    def num_samples(self):
+        return self.n if self.n is not None else self.k
diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py
index 565a5e379..97c1bd384 100644
--- a/src/lighteval/metrics/normalizations.py
+++ b/src/lighteval/metrics/normalizations.py
@@ -426,8 +426,6 @@ class LogProbPMINorm:

     name: str = "norm_pmi"

-    pass
-

 @dataclass
 class LogProbTokenNorm:
@@ -437,7 +435,6 @@ class LogProbTokenNorm:
     """

     name: str = "norm_token"
-    pass


 @dataclass
diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py
index 2b99483b7..ad9338e76 100644
--- a/src/lighteval/metrics/sample_preparator.py
+++ b/src/lighteval/metrics/sample_preparator.py
@@ -60,7 +60,11 @@ class PerplexityCorpusMetricInput(CorpusMetricInput):
     weights: list[int]


-class GenerativePreparator:
+class Preparator:
+    pass
+
+
+class GenerativePreparator(Preparator):
     @staticmethod
     def prepare(doc: Doc, model_response: ModelResponse, **kwargs):
         """Prepares an individual generative example to the format expected by metrics computed at the corpus level
(aggregated). @@ -77,7 +81,7 @@ def prepare(doc: Doc, model_response: ModelResponse, **kwargs): return GenerativeCorpusMetricInput(golds=golds, preds=predictions) -class LoglikelihoodPreparator: +class LoglikelihoodPreparator(Preparator): def __init__(self, is_single_token: bool = False): """Init. @@ -110,7 +114,7 @@ def prepare(self, doc: Doc, model_response: ModelResponse, **kwargs) -> LogprobC return LogprobCorpusMetricInput(golds=gold_ixs, preds=np.argmax(choices_logprob)) -class TargetPerplexityPreparator: +class TargetPerplexityPreparator(Preparator): def __init__(self, units_type: str) -> None: """Init. @@ -155,7 +159,7 @@ def prepare(self, doc: Doc, model_response: ModelResponse, **kwargs): return PerplexityCorpusMetricInput(logprobs=logprobs_flat, weights=self.count_units(reference_text_flat)) -class PerplexityPreparator: +class PerplexityPreparator(Preparator): def __init__(self, units_type: str) -> None: """Init. diff --git a/src/lighteval/metrics/utils/__init__.py b/src/lighteval/metrics/utils/__init__.py deleted file mode 100644 index a732db8d0..000000000 --- a/src/lighteval/metrics/utils/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
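Taken together, the enum-level `__call__` (metrics.py, earlier in this patch) and the `Metric.__call__` added to metric_utils.py below replace the many pre-baked enum members with on-the-fly parametrization. A hedged sketch of the intended flow (the resulting suffix depends on the base metric_name, which is not shown in this excerpt):

```python
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import helm_normalizer

# Calling an enum member deep-copies the underlying Metric (the enum default is
# never mutated), writes each sample_params entry onto its sample_level_fn, then
# suffixes the metric name with the parameter keys joined by "&", e.g.
# "<base_name>_with_normalize_gold&normalize_pred".
quasi_exact_match = Metrics.exact_match(
    sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}
)
```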
diff --git a/src/lighteval/metrics/judge_prompts.jsonl b/src/lighteval/metrics/utils/judge_prompts.jsonl
similarity index 100%
rename from src/lighteval/metrics/judge_prompts.jsonl
rename to src/lighteval/metrics/utils/judge_prompts.jsonl
diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py
similarity index 100%
rename from src/lighteval/metrics/llm_as_judge.py
rename to src/lighteval/metrics/utils/llm_as_judge.py
diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py
index 78d30c59a..85b1e2bc6 100644
--- a/src/lighteval/metrics/utils/metric_utils.py
+++ b/src/lighteval/metrics/utils/metric_utils.py
@@ -23,6 +23,9 @@
 from dataclasses import dataclass
 from typing import Callable

+from lighteval.metrics.metrics_corpus import CorpusLevelComputation
+from lighteval.metrics.metrics_sample import SampleLevelComputation
+from lighteval.metrics.sample_preparator import Preparator
 from lighteval.tasks.requests import SamplingMethod


@@ -31,20 +34,65 @@ class Metric:
     metric_name: str
     higher_is_better: bool
     category: SamplingMethod
-    sample_level_fn: Callable
-    corpus_level_fn: Callable
+    sample_level_fn: SampleLevelComputation | Preparator
+    corpus_level_fn: CorpusLevelComputation | Callable
     batched_compute: bool = False

     def get_doc(self):
         return self.sample_level_fn.__doc__

-    def compute(
+    def compute_sample(
         self, **kwargs
-    ) -> dict:  # result: Union[list[ModelResponse], ModelResponse], formatted_doc: Doc) -> dict:
+    ) -> dict:
+        if isinstance(self.sample_level_fn, SampleLevelComputation):
+            sample_level_fn = self.sample_level_fn.compute
+        elif isinstance(self.sample_level_fn, Preparator):
+            sample_level_fn = self.sample_level_fn.prepare
+        else:
+            raise ValueError(
+                f"Incorrect type for {self.sample_level_fn}, should be a SampleLevelComputation or Preparator"
+            )
+
+        if isinstance(self, MetricGrouping):
+            return sample_level_fn(**kwargs)
+        return {self.metric_name: sample_level_fn(**kwargs)}
+
+    def get_corpus_aggregations(self) -> dict:
+        if isinstance(self, MetricGrouping):
+            if isinstance(self.corpus_level_fn, dict):
+                corpus_level_fn = self.corpus_level_fn
+            else:
+                corpus_level_fn = dict.fromkeys(self.metric_name, self.corpus_level_fn)
+        else:
+            corpus_level_fn = {self.metric_name: self.corpus_level_fn}
+
+        for name, item in corpus_level_fn.items():
+            if isinstance(item, Callable):
+                corpus_level_fn[name] = item
+            else:
+                corpus_level_fn[name] = item.compute_corpus
+
+        return corpus_level_fn
+
+    def __call__(self, sample_params: dict | None):
+        """Allow creating new instances with modified parameters."""
+        if not sample_params:
+            # Nothing to set or rename; keep the (already copied) metric as-is
+            return self
+
+        for k, v in sample_params.items():
+            setattr(self.sample_level_fn, k, v)
+
+        # Once the parameters are updated, we need to adjust the
+        # metric name to what will be returned
+        sample_params_name = "&".join(sample_params.keys())
         if isinstance(self, MetricGrouping):
-            return self.sample_level_fn(**kwargs)  # result, formatted_doc,
-        return {self.metric_name: self.sample_level_fn(**kwargs)}  # result, formatted_doc,
+            if hasattr(self.sample_level_fn, "metric_names"):
+                # this is mostly for the gpass@k metrics
+                self.metric_name = self.sample_level_fn.metric_names
+            else:
+                self.metric_name = [f"{metric}_with_{sample_params_name}" for metric in self.metric_name]
+        else:
+            self.metric_name = f"{self.metric_name}_with_{sample_params_name}"
+        return self


 @dataclass
diff --git a/src/lighteval/metrics/stderr.py b/src/lighteval/metrics/utils/stderr.py
similarity index 100%
rename from
src/lighteval/metrics/stderr.py rename to src/lighteval/metrics/utils/stderr.py diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 61236fef3..12d1d3a15 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -21,6 +21,13 @@ # SOFTWARE. import lighteval.tasks.default_prompts as prompt from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import ( + LogProbCharNorm, + gsm8k_normalizer, + harness_triviaqa_normalizer, + helm_normalizer, + math_normalizer, +) from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.templates.qa import get_qa_prompt_function from lighteval.utils.language import Language @@ -97,7 +104,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -112,7 +122,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -127,7 +140,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -142,7 +158,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -157,7 +176,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -172,7 +194,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -187,7 +212,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -202,7 +230,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + 
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -217,7 +248,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -232,7 +266,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -247,7 +284,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -262,7 +302,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -277,7 +320,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -292,7 +338,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -307,7 +356,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -322,7 +374,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -337,7 +392,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, version=0, ) @@ -352,14 +410,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - 
metrics=[ - Metrics.math_pass_at_1_1n, - Metrics.math_pass_at_1_4n, - Metrics.math_pass_at_1_8n, - Metrics.math_pass_at_1_16n, - Metrics.math_pass_at_1_32n, - Metrics.math_pass_at_1_64n, - ], + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1})], version=2, ) aime24_avg = LightevalTaskConfig( @@ -373,9 +424,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[ - Metrics.math_avg_at_64, - ], + metrics=[Metrics.avg_at_k_math(sample_params={"k": 64})], version=2, ) aime24_gpassk = LightevalTaskConfig( @@ -389,7 +438,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_16_expr_gold], + metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], version=1, ) aime25 = LightevalTaskConfig( @@ -403,14 +452,7 @@ few_shots_split=None, few_shots_select=None, generation_size=10000, - metrics=[ - Metrics.math_pass_at_1_1n, - # Metrics.math_pass_at_1_4n, - # Metrics.math_pass_at_1_8n, - # Metrics.math_pass_at_1_16n, - # Metrics.math_pass_at_1_32n, - # Metrics.math_pass_at_1_64n, - ], + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1})], version=2, ) aime25_gpassk = LightevalTaskConfig( @@ -424,7 +466,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_16_expr_gold], + metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], version=1, ) anachronisms_bigbench = LightevalTaskConfig( @@ -558,7 +600,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -573,7 +618,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -588,7 +636,10 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -603,7 +654,10 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -768,7 +822,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -783,7 +837,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -843,7 +897,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.perfect_exact_match], + 
metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=None, version=0, ) @@ -860,9 +914,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -1148,7 +1208,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1164,7 +1227,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1180,7 +1246,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1196,7 +1265,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1212,7 +1284,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1228,7 +1303,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1244,7 +1322,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1260,7 +1341,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], 
stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1276,7 +1360,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1292,7 +1379,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1308,7 +1398,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1324,7 +1417,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1340,7 +1436,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1356,7 +1455,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1372,7 +1474,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1388,7 +1493,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1404,7 +1512,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1420,7 +1531,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, version=0, @@ -1438,10 +1552,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1459,10 +1579,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1480,10 +1606,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1501,10 +1633,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1522,10 +1660,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1543,10 +1687,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1564,10 +1714,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1585,10 +1741,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1606,10 +1768,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1627,10 +1795,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1648,10 +1822,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( 
+ sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1669,10 +1849,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1690,10 +1876,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1711,10 +1903,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1732,10 +1930,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1753,10 +1957,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1774,10 +1984,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1795,10 +2011,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1816,10 +2038,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1837,10 +2065,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1858,10 +2092,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1879,10 +2119,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( 
+ sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1900,10 +2146,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1921,10 +2173,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1942,10 +2200,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1963,10 +2227,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -1984,10 +2254,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], version=0, @@ -2005,10 +2281,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2026,10 +2308,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2047,10 +2335,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2068,10 +2362,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2089,10 +2389,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2110,10 +2416,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + 
"normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2131,10 +2443,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2152,10 +2470,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2173,10 +2497,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2194,10 +2524,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2215,10 +2551,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2236,10 +2578,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], version=0, @@ -2270,7 +2618,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -2287,9 +2638,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2307,9 +2664,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2327,9 +2690,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2347,9 +2716,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2367,9 +2742,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2387,9 +2768,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2407,9 +2794,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2427,9 +2820,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2447,9 +2846,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2467,9 +2872,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2487,9 +2898,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2507,9 +2924,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + 
"normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2527,9 +2950,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2547,9 +2976,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2567,9 +3002,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2587,9 +3028,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2607,9 +3054,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2627,9 +3080,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2647,9 +3106,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - 
Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2667,9 +3132,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2687,9 +3158,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2707,9 +3184,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2727,9 +3210,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -2747,9 +3236,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3007,9 +3502,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": 
helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3027,9 +3528,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3047,9 +3554,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3067,9 +3580,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3087,9 +3606,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3105,7 +3630,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -3122,9 +3650,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3142,9 +3676,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + 
"normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3162,9 +3702,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3182,9 +3728,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3202,9 +3754,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3222,9 +3780,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3240,7 +3804,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -3255,7 +3822,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -3272,9 +3842,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3290,7 +3866,10 @@ few_shots_split=None, few_shots_select=None, 
generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -3307,9 +3886,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3327,9 +3912,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3347,9 +3938,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3367,9 +3964,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3387,9 +3990,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3407,9 +4016,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3427,9 +4042,15 @@ generation_size=100, 
metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3447,9 +4068,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3467,9 +4094,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3487,9 +4120,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -3520,7 +4159,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3550,7 +4192,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3580,7 +4225,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3610,7 +4258,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3640,7 +4291,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3670,7 +4324,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3700,7 +4357,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3730,7 +4390,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3760,7 +4423,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3790,7 +4456,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3820,7 +4489,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3850,7 +4522,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3880,7 +4555,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3910,7 +4588,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3940,7 +4621,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -3970,7 +4654,10 @@ few_shots_split=None, few_shots_select=None, 
generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4000,7 +4687,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4030,7 +4720,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4060,7 +4753,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4090,7 +4786,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4120,7 +4819,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4150,7 +4852,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4180,7 +4885,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4210,7 +4918,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4240,7 +4951,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4270,7 +4984,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4300,7 
+5017,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4330,7 +5050,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4360,7 +5083,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4390,7 +5116,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4420,7 +5149,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4450,7 +5182,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4480,7 +5215,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4510,7 +5248,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4540,7 +5281,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4570,7 +5314,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4600,7 +5347,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": 
LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4630,7 +5380,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4660,7 +5413,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4690,7 +5446,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4720,7 +5479,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4750,7 +5512,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4780,7 +5545,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4810,7 +5578,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4840,7 +5611,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4870,7 +5644,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4900,7 +5677,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4930,7 +5710,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + 
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4960,7 +5743,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -4990,7 +5776,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5020,7 +5809,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5050,7 +5842,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5080,7 +5875,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5110,7 +5908,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5140,7 +5941,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5170,7 +5974,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5200,7 +6007,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5230,7 +6040,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5260,7 +6073,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5290,7 +6106,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5320,7 +6139,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5350,7 +6172,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5380,7 +6205,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5410,7 +6238,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5440,7 +6271,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5470,7 +6304,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5500,7 +6337,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -5607,9 +6447,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -5627,9 +6473,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -5645,7 +6497,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -5690,7 +6542,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -5705,7 +6557,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -5720,7 +6572,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -5752,9 +6604,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5774,9 +6632,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5796,9 +6660,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5818,9 +6688,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, 
Metrics.f1_score_micro, ], @@ -5840,9 +6716,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5862,9 +6744,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5884,9 +6772,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5906,9 +6800,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5928,9 +6828,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -5978,7 +6884,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], version=0, ) @@ -6010,9 +6921,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + 
sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6043,7 +6960,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=[".", ";", "!", "?"], version=0, ) @@ -6058,7 +6975,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -6327,7 +7244,7 @@ "choices": [line["answers"]["input_text"][0]], }, ), - suite=("lighteval",), + suite=["lighteval"], hf_repo="stanfordnlp/coqa", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -6336,8 +7253,14 @@ generation_size=100, version=1, metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) coqa_bb_lighteval = LightevalTaskConfig( @@ -6351,7 +7274,7 @@ few_shots_split=None, few_shots_select=None, generation_size=10, - metrics=[Metrics.perfect_exact_match, Metrics.f1_score], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False}), Metrics.f1_score], stop_sequence=["\n"], version=0, ) @@ -6368,7 +7291,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -6433,7 +7356,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -6523,7 +7446,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -6560,8 +7483,14 @@ generation_size=250, stop_sequence=["Question:", "question:", "\n"], metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), version=1, ) @@ -6651,7 +7580,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], version=0, ) @@ -6758,9 +7692,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + 
Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6778,9 +7718,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6798,9 +7744,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6818,9 +7770,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6838,9 +7796,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6858,9 +7822,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6878,9 +7848,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6898,9 +7874,15 @@ generation_size=5, metrics=[ 
Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6918,9 +7900,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6938,9 +7926,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6958,9 +7952,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6978,9 +7978,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -6998,9 +8004,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7018,9 +8030,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + 
sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7038,9 +8056,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7251,7 +8275,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -7281,7 +8305,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], version=0, ) @@ -7296,7 +8325,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.mcc_single_token], + metrics=[Metrics.loglikelihood_acc, Metrics.mcc], stop_sequence=["\n"], version=0, ) @@ -7476,11 +8505,7 @@ few_shots_split=None, few_shots_select=None, generation_size=32768, # needed for reasoning models like R1 - metrics=[ - Metrics.gpqa_instruct_pass_at_1_1n, - Metrics.gpqa_instruct_pass_at_1_4n, - Metrics.gpqa_instruct_pass_at_1_8n, - ], + metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})], stop_sequence=[], # no stop sequence, will use eos token version=1, ) @@ -7555,7 +8580,9 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=256, - metrics=[Metrics.quasi_exact_match_gsm8k], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": gsm8k_normalizer, "normalize_pred": gsm8k_normalizer}) + ], stop_sequence=[], version=0, ) @@ -7576,6 +8603,42 @@ stop_sequence=["Question:"], version=0, ) +headqa_en_lighteval = LightevalTaskConfig( + name="headqa:en", + suite=["lighteval", "headqa"], + prompt_function=prompt.headqa, + hf_repo="lighteval/headqa_harness", + hf_subset="en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], + stop_sequence=["\n"], + version=0, +) +headqa_es_lighteval = LightevalTaskConfig( + name="headqa:es", + suite=["lighteval", "headqa"], + prompt_function=prompt.headqa, + hf_repo="lighteval/headqa_harness", + hf_subset="es", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], + stop_sequence=["\n"], + version=0, +) hellaswag_leaderboard = LightevalTaskConfig( name="hellaswag", suite=["leaderboard"], @@ -7587,7 +8650,10 @@ few_shots_split=None, 
few_shots_select="random_sampling_from_train", generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -7604,9 +8670,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7637,7 +8709,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -7744,9 +8816,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -7766,9 +8844,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -7833,9 +8917,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7853,9 +8943,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7873,9 +8969,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": 
helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7893,9 +8995,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7913,9 +9021,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7933,9 +9047,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -7966,7 +9086,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -8197,8 +9317,14 @@ generation_size=250, stop_sequence=["\n", "Question:", "question:"], metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) kanji_ascii_bigbench = LightevalTaskConfig( @@ -8407,7 +9533,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -8506,9 +9632,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -8526,7 +9658,7 @@
generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8547,7 +9679,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8568,7 +9700,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8589,7 +9721,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8610,7 +9742,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8631,7 +9763,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8652,7 +9784,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8673,7 +9805,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8694,7 +9826,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8715,7 +9847,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8736,7 +9868,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8757,7 +9889,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8778,7 +9910,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, 
Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8797,7 +9929,11 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match, Metrics.f1_score], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.f1_score, + ], stop_sequence=["\n"], version=0, ) @@ -8814,7 +9950,7 @@ generation_size=430, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8835,7 +9971,7 @@ generation_size=788, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8856,7 +9992,7 @@ generation_size=338, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8877,7 +10013,7 @@ generation_size=274, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8898,7 +10034,7 @@ generation_size=274, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8919,7 +10055,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8940,7 +10076,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8961,7 +10097,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -8982,7 +10118,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9003,7 +10139,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9024,7 +10160,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9043,7 +10179,7 @@ few_shots_split=None, few_shots_select=None, 
generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -9058,7 +10194,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=None, version=0, ) @@ -9148,7 +10284,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -9165,9 +10304,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9185,9 +10330,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9205,9 +10356,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9225,9 +10382,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9245,9 +10408,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9264,8 +10433,7 @@ 
few_shots_select=None, generation_size=32768, metrics=[ - Metrics.math_pass_at_1_1n, - Metrics.math_pass_at_1_4n, + Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), ], version=2, ) @@ -9280,7 +10448,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_16_latex_gold], + metrics=[Metrics.g_pass_at_k_latex(sample_params={"k": 16, "n": 48})], version=1, ) math_algebra_lighteval = LightevalTaskConfig( @@ -9294,7 +10462,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9309,7 +10487,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9324,7 +10512,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9339,7 +10537,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9354,7 +10562,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9369,7 +10587,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9384,7 +10612,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, 
"normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=1, ) @@ -9399,7 +10637,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=0, ) @@ -9414,7 +10662,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=0, ) @@ -9429,7 +10687,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=0, ) @@ -9444,7 +10712,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=0, ) @@ -9459,7 +10737,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=0, ) @@ -9474,7 +10762,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], version=0, ) @@ -9489,7 +10787,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], 
version=0, ) @@ -9519,7 +10827,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -9534,7 +10845,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -9551,7 +10862,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -9573,7 +10884,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -9595,7 +10906,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -9618,9 +10929,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9638,7 +10955,7 @@ generation_size=512, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -9661,9 +10978,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -9709,7 +11032,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Question="], version=0, ) @@ -9724,7 +11050,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Pregunta="], version=0, ) @@ -9739,7 +11068,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, 
Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Question="], version=0, ) @@ -9754,7 +11086,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Frage="], version=0, ) @@ -9769,7 +11104,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="], version=0, ) @@ -9784,7 +11122,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u95ee\u9898="], version=0, ) @@ -9799,7 +11140,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u554f\u984c="], version=0, ) @@ -9814,7 +11158,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="], version=0, ) @@ -9829,7 +11176,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Swali="], version=0, ) @@ -9844,7 +11194,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="], version=0, ) @@ -9859,7 +11212,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="], version=0, ) @@ -9951,9 +11307,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + 
"normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10001,9 +11363,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10051,9 +11419,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10101,9 +11475,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10151,9 +11531,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10201,9 +11587,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10251,9 +11643,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10301,9 +11699,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - 
Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10351,9 +11755,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10401,9 +11811,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10451,9 +11867,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10501,9 +11923,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10551,9 +11979,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10601,9 +12035,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": 
helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10651,9 +12091,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10701,9 +12147,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10751,9 +12203,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10801,9 +12259,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10851,9 +12315,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10901,9 +12371,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -10951,9 +12427,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11001,9 +12483,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11051,9 +12539,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11101,9 +12595,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11151,9 +12651,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11201,9 +12707,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11251,9 +12763,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": 
"prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11301,9 +12819,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11351,9 +12875,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11401,9 +12931,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11451,9 +12987,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11501,9 +13043,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11551,9 +13099,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11601,9 +13155,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11651,9 +13211,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11701,9 +13267,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11751,9 +13323,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11801,9 +13379,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11851,9 +13435,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11901,9 +13491,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -11951,9 +13547,15 @@ 
generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12001,9 +13603,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12051,9 +13659,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12101,9 +13715,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12151,9 +13771,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12201,9 +13827,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12251,9 +13883,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": 
"prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12301,9 +13939,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12351,9 +13995,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12401,9 +14051,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12451,9 +14107,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12501,9 +14163,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12551,9 +14219,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12601,9 +14275,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - 
Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12651,9 +14331,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12701,9 +14387,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12751,9 +14443,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -12784,7 +14482,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -12904,7 +14602,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -12979,7 +14677,7 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.recall_at_1, Metrics.recall_at_2, Metrics.mrr], + metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr], stop_sequence=["\n"], version=0, ) @@ -12994,7 +14692,7 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.recall_at_1, Metrics.recall_at_2, Metrics.mrr], + metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr], stop_sequence=["\n"], version=0, ) @@ -13011,7 +14709,7 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -13049,8 +14747,14 @@ generation_size=250, stop_sequence=["\n", "Question:", "question:"], metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + 
sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) navigate_bigbench = LightevalTaskConfig( @@ -13109,7 +14813,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13124,7 +14831,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13139,7 +14849,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13154,7 +14867,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13169,7 +14885,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13184,7 +14903,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13199,7 +14921,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13214,7 +14939,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], version=0, ) @@ -13229,7 +14957,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -13261,9 +14989,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + 
Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -13279,7 +15013,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -13294,7 +15031,7 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -13309,7 +15046,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -13339,7 +15076,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=None, version=0, ) @@ -13354,7 +15091,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -13369,7 +15106,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -13444,7 +15181,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -13459,7 +15196,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -13476,9 +15216,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -13539,7 +15285,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -13571,9 +15320,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": 
helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -13589,7 +15344,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -13604,7 +15362,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -13619,7 +15380,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -13634,7 +15398,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleurt, Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[ + Metrics.bleurt, + Metrics.bleu, + Metrics.rouge_t5, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], version=0, ) @@ -13649,7 +15418,7 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.f1_score_quasi], + metrics=[Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer})], stop_sequence=["\n"], version=0, ) @@ -13679,7 +15448,11 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match, Metrics.f1_score], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.f1_score, + ], stop_sequence=["\n"], version=0, ) @@ -13726,9 +15499,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13748,9 +15527,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13770,9 +15555,15 @@ generation_size=30, 
metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13792,9 +15583,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13814,9 +15611,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13836,9 +15639,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13858,9 +15667,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13880,9 +15695,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13902,9 +15723,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13924,9 +15751,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -13946,9 +15779,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14011,7 +15850,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14026,7 +15865,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], version=0, ) @@ -14101,7 +15945,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14116,7 +15960,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -14131,7 +15978,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14146,7 +15993,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14191,7 +16038,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14221,7 +16068,7 @@ few_shots_split=None, few_shots_select=None, 
generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14251,7 +16098,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14266,7 +16113,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14296,7 +16143,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14313,9 +16160,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -14399,8 +16252,14 @@ stop_sequence=["\n", "Question:", "question:"], generation_size=200, metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) storycloze_2016_lighteval = LightevalTaskConfig( @@ -14474,7 +16333,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -14690,7 +16549,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], version=0, ) @@ -14737,9 +16599,15 @@ generation_size=50, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -14787,9 +16655,15 @@ generation_size=50, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": 
"prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -14807,9 +16681,15 @@ generation_size=50, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -14855,7 +16735,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15170,7 +17050,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], version=0, ) @@ -15215,7 +17098,9 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.quasi_exact_match_triviaqa], + metrics=[ + Metrics.exact_match(sample_params={"strip_strings": True, "normalize_pred": harness_triviaqa_normalizer}) + ], stop_sequence=["\n", ".", ","], version=0, ) @@ -15263,9 +17148,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15371,7 +17262,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15386,7 +17277,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15401,7 +17292,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15416,7 +17307,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15431,7 +17322,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15446,7 +17337,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -15523,9 +17414,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - 
Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15543,9 +17440,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15563,9 +17466,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15583,9 +17492,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15603,9 +17518,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15623,9 +17544,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15643,9 +17570,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": 
helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15663,9 +17596,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15683,9 +17622,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15703,9 +17648,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15723,9 +17674,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15743,9 +17700,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15763,9 +17726,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15783,9 +17752,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15803,9 +17778,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15823,9 +17804,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15843,9 +17830,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15863,9 +17856,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15883,9 +17882,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15903,9 +17908,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": 
"prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15923,9 +17934,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15943,9 +17960,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15963,9 +17986,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -15983,9 +18012,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16003,9 +18038,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16023,9 +18064,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16043,9 +18090,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16063,9 +18116,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16083,9 +18142,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16103,9 +18168,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16123,9 +18194,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16143,9 +18220,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16163,9 +18246,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16183,9 +18272,15 @@ 
generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16203,9 +18298,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16223,9 +18324,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16243,9 +18350,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16263,9 +18376,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16283,9 +18402,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16303,9 +18428,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": 
"prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16323,9 +18454,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16343,9 +18480,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16363,9 +18506,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16383,9 +18532,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16403,9 +18558,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16423,9 +18584,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16443,9 +18610,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - 
Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16463,9 +18636,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16483,9 +18662,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16503,9 +18688,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16523,9 +18714,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16543,9 +18740,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16563,9 +18766,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": 
helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16583,9 +18792,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16603,9 +18818,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16623,9 +18844,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16643,9 +18870,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16663,9 +18896,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16683,9 +18922,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16703,9 +18948,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16723,9 +18974,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16743,9 +19000,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16763,9 +19026,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16783,9 +19052,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16803,9 +19078,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16823,9 +19104,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": 
"prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16843,9 +19130,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16863,9 +19156,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16883,9 +19182,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16903,9 +19208,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16923,9 +19234,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16943,9 +19260,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16963,9 +19286,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -16983,9 +19312,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17003,9 +19338,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17023,9 +19364,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17043,9 +19390,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17063,9 +19416,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17083,9 +19442,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17103,9 +19468,15 @@ 
generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17123,9 +19494,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17143,9 +19520,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17163,9 +19546,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17183,9 +19572,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17203,9 +19598,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -17223,9 +19624,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": 
"prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], version=0, @@ -19851,7 +22258,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) @@ -19866,7 +22273,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], version=0, ) diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py index fcc47820f..1e2540984 100644 --- a/src/lighteval/tasks/extended/hle/main.py +++ b/src/lighteval/tasks/extended/hle/main.py @@ -206,8 +206,8 @@ def hle_text_only(line, task_name: str = None): metric_name=["accuracy", "confidence_half_width", "calibration_error"], higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True), category=SamplingMethod.GENERATIVE, - sample_level_fn=JudgeLLMHLE().compute, - corpus_level_fn=JudgeLLMHLE().compute_corpus, + sample_level_fn=JudgeLLMHLE(), + corpus_level_fn=JudgeLLMHLE(), ) extend_enum(Metrics, "hle_metrics", hle_metrics) diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/extended/mix_eval/main.py index 0e108f90c..2d9b7569a 100644 --- a/src/lighteval/tasks/extended/mix_eval/main.py +++ b/src/lighteval/tasks/extended/mix_eval/main.py @@ -111,7 +111,7 @@ def process_judge_response_freeform_gpt(x): process_judge_response=process_judge_response, judge_backend="vllm", short_judge_name="flow", - ).compute, + ), corpus_level_fn={ "judge_score_flow": np.mean, }, @@ -127,7 +127,7 @@ def process_judge_response_freeform_gpt(x): process_judge_response=process_judge_response_multichoice_gpt, judge_backend="openai", short_judge_name="gpt-3.5", - ).compute, + ), corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, @@ -148,7 +148,7 @@ def mean_dv_5(x): process_judge_response=process_judge_response, judge_backend="vllm", short_judge_name="flow", - ).compute, + ), corpus_level_fn={ "judge_score_flow": mean_dv_5, }, @@ -164,7 +164,7 @@ def mean_dv_5(x): process_judge_response=process_judge_response_freeform_gpt, judge_backend="openai", short_judge_name="gpt-3.5", - ).compute, + ), corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 1756fb212..e32194747 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -70,7 +70,7 @@ def flow_judge_mt_bench_prompt(question, answer, options, gold): template=flow_judge_mt_bench_prompt, process_judge_response=process_judge_response, judge_backend="vllm", - ).compute, + ), corpus_level_fn={ "judge_score_turn_1": np.mean, "judge_score_turn_2": np.mean, diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/extended/olympiade_bench/main.py index 090562a1b..d9fe0d2bc 100644 --- a/src/lighteval/tasks/extended/olympiade_bench/main.py +++ b/src/lighteval/tasks/extended/olympiade_bench/main.py @@ -21,11 +21,14 @@ # SOFTWARE. 
diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py
index fcc47820f..1e2540984 100644
--- a/src/lighteval/tasks/extended/hle/main.py
+++ b/src/lighteval/tasks/extended/hle/main.py
@@ -206,8 +206,8 @@ def hle_text_only(line, task_name: str = None):
     metric_name=["accuracy", "confidence_half_width", "calibration_error"],
     higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True),
     category=SamplingMethod.GENERATIVE,
-    sample_level_fn=JudgeLLMHLE().compute,
-    corpus_level_fn=JudgeLLMHLE().compute_corpus,
+    sample_level_fn=JudgeLLMHLE(),
+    corpus_level_fn=JudgeLLMHLE(),
 )
 
 extend_enum(Metrics, "hle_metrics", hle_metrics)
diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/extended/mix_eval/main.py
index 0e108f90c..2d9b7569a 100644
--- a/src/lighteval/tasks/extended/mix_eval/main.py
+++ b/src/lighteval/tasks/extended/mix_eval/main.py
@@ -111,7 +111,7 @@ def process_judge_response_freeform_gpt(x):
         process_judge_response=process_judge_response,
         judge_backend="vllm",
         short_judge_name="flow",
-    ).compute,
+    ),
     corpus_level_fn={
         "judge_score_flow": np.mean,
     },
@@ -127,7 +127,7 @@ def process_judge_response_freeform_gpt(x):
         process_judge_response=process_judge_response_multichoice_gpt,
         judge_backend="openai",
         short_judge_name="gpt-3.5",
-    ).compute,
+    ),
     corpus_level_fn={
         "judge_score_gpt-3.5": np.mean,
     },
@@ -148,7 +148,7 @@ def mean_dv_5(x):
         process_judge_response=process_judge_response,
         judge_backend="vllm",
         short_judge_name="flow",
-    ).compute,
+    ),
     corpus_level_fn={
         "judge_score_flow": mean_dv_5,
     },
@@ -164,7 +164,7 @@
         process_judge_response=process_judge_response_freeform_gpt,
         judge_backend="openai",
         short_judge_name="gpt-3.5",
-    ).compute,
+    ),
     corpus_level_fn={
         "judge_score_gpt-3.5": np.mean,
     },
diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py
index 1756fb212..e32194747 100644
--- a/src/lighteval/tasks/extended/mt_bench/main.py
+++ b/src/lighteval/tasks/extended/mt_bench/main.py
@@ -70,7 +70,7 @@ def flow_judge_mt_bench_prompt(question, answer, options, gold):
         template=flow_judge_mt_bench_prompt,
         process_judge_response=process_judge_response,
         judge_backend="vllm",
-    ).compute,
+    ),
     corpus_level_fn={
         "judge_score_turn_1": np.mean,
         "judge_score_turn_2": np.mean,
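Reviewer note: in the three judge-based task files above the change is uniform. The hooks now receive the metric object itself rather than a bound method, leaving it to the framework to invoke the right entry point. From the hle hunk:

# before: a bound method per hook
#   sample_level_fn=JudgeLLMHLE().compute,
#   corpus_level_fn=JudgeLLMHLE().compute_corpus,
# after: the instance is passed directly, and the framework calls
# .compute(...) or .compute_corpus(...) as appropriate
#   sample_level_fn=JudgeLLMHLE(),
#   corpus_level_fn=JudgeLLMHLE(),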
diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/extended/olympiade_bench/main.py
index 090562a1b..d9fe0d2bc 100644
--- a/src/lighteval/tasks/extended/olympiade_bench/main.py
+++ b/src/lighteval/tasks/extended/olympiade_bench/main.py
@@ -21,11 +21,14 @@
 # SOFTWARE.
 
+import numpy as np
+
 from lighteval.metrics.dynamic_metrics import (
     ExprExtractionConfig,
     LatexExtractionConfig,
-    multilingual_extractive_match_metric,
+    MultilingualExtractiveMatchMetric,
 )
+from lighteval.metrics.metrics import SampleLevelMetric, SamplingMethod
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.utils.language import Language
@@ -200,11 +203,17 @@ def olympiad_bench_prompt(line, task_name: str = None):
 
 extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]
 
-metric = multilingual_extractive_match_metric(
-    language=Language.ENGLISH,
-    gold_extraction_target=extraction_targets,
-    pred_extraction_target=extraction_targets,
-    precision=6,
+metric = SampleLevelMetric(
+    metric_name="extractive_match",
+    sample_level_fn=MultilingualExtractiveMatchMetric(
+        language=Language.ENGLISH,
+        gold_extraction_target=extraction_targets,
+        pred_extraction_target=extraction_targets,
+        precision=6,
+    ),
+    category=SamplingMethod.GENERATIVE,
+    corpus_level_fn=np.mean,
+    higher_is_better=True,
 )
 
 task_configs = []
diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
index d195bc89b..bf65ac530 100644
--- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py
+++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
@@ -38,7 +38,8 @@
 import lighteval.tasks.default_prompts as prompt
 from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics
-from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc
+from lighteval.metrics.metrics_corpus import CorpusLevelComputation
+from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation
 from lighteval.metrics.normalizations import gsm8k_normalizer
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import SamplingMethod
@@ -71,7 +72,7 @@ def neg_log_like(x):
 
 # Evaluation function
-class TinyCorpusAggregator:
+class TinyCorpusAggregator(SampleLevelComputation, CorpusLevelComputation):
     LEADEBRBOARD_SCENARIOS = ["truthfulqa", "gsm8k", "winogrande", "arc", "hellaswag"]
     BENCHS = ["lb", "mmlu"]
     METRICS = ["irt", "pirt", "gpirt"]
@@ -111,7 +112,7 @@ def __init__(
             res = LoglikelihoodAcc().compute(**args)
         return dict.fromkeys(self.METRICS, res)
 
-    def aggregate(self, y_input):
+    def compute_corpus(self, y_input):
         if len(y_input) == self.num_samples and self.estimates is not None:
             return self.estimates[self.task]
@@ -276,8 +277,8 @@ def aggregate(self, y_input):
         CorpusLevelMetricGrouping(
             metric_name=TinyCorpusAggregator.METRICS,
             higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
-            sample_level_fn=TinyCorpusAggregator(name).compute,
+            sample_level_fn=TinyCorpusAggregator(name),
             category=category,
-            corpus_level_fn=TinyCorpusAggregator(name).aggregate,
+            corpus_level_fn=TinyCorpusAggregator(name),
         ),
     )
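Reviewer note: TinyCorpusAggregator makes the new contract explicit by inheriting both roles. Below is a minimal sketch of a metric object usable in both slots, assuming only what the hunks above show (the two base-class names and the compute / compute_corpus hooks); real metrics take richer arguments than this stub:

from lighteval.metrics.metrics_corpus import CorpusLevelComputation
from lighteval.metrics.metrics_sample import SampleLevelComputation


class MeanScore(SampleLevelComputation, CorpusLevelComputation):
    def compute(self, **kwargs):
        # sample-level hook: score one prediction (stubbed to a constant here)
        return 1.0

    def compute_corpus(self, y_input):
        # corpus-level hook (the aggregate -> compute_corpus rename above)
        return sum(y_input) / len(y_input) if y_input else 0.0


scorer = MeanScore()
# one instance can back both slots, as TinyCorpusAggregator(name) does above:
# sample_level_fn=scorer, corpus_level_fn=scorer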
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index c146041a5..6f1f73f05 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -31,7 +31,9 @@
 from multiprocess import Pool
 from pytablewriter import MarkdownTableWriter
 
-from lighteval.metrics.metrics import Metric, Metrics
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.metrics_sample import SamplingMetric
+from lighteval.metrics.utils.metric_utils import Metric
 from lighteval.tasks.prompt_manager import FewShotSampler
 from lighteval.tasks.requests import (
     Doc,
@@ -191,11 +193,9 @@ def __init__(
         # We assume num_samples always contains 1 (for base generative evals)
         self.num_samples = [1]
         for metric in self.metrics:
-            metric_names = as_list(metric.metric_name)
-
-            for metric_name in metric_names:
+            if isinstance(metric.sample_level_fn, SamplingMetric):
                 # Update the number of samples to generate using the information in the metric name
-                self.num_samples.append(extract_num_samples(metric_name))
+                self.num_samples.append(metric.sample_level_fn.num_samples())
 
     def get_first_possible_fewshot_splits(self, available_splits: ListLike[str]) -> str | None:
         """
@@ -329,7 +329,10 @@ def aggregation(self):
 
         Return a dict with metric name and its aggregation function for all metrics
         """
-        return Metrics.corpus_level_fns(self.metrics)
+        aggregations = {}
+        for metric in self.metrics:
+            aggregations.update(metric.get_corpus_aggregations())
+        return aggregations
 
     @staticmethod
     def load_datasets(tasks: dict[str, "LightevalTask"], dataset_loading_processes: int = 1) -> None:
@@ -377,24 +380,3 @@ def download_dataset_worker(
 
         # It returns DatasetDict because we don't specify a split
         return dataset  # type: ignore
-
-
-def extract_num_samples(metric_name: str) -> int:
-    """Gets the number of samples to generate from the metric name.
-    Assumes that any metric with @ in it's name depends on the number of samples.
-
-    Args:
-        metric_name (str): The metric name in the task.
-
-    Returns:
-        int: The number of samples to generate.
-    """
-    if "@" in metric_name:
-        metric_name = metric_name.split("@")[-1]
-        if "_" in metric_name:
-            metric_name = metric_name.split("_")[0]
-        if ":" in metric_name:
-            return int(metric_name.split(":")[-1])
-        else:
-            return int(metric_name)
-    return 1
LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1352,8 +1352,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1388,8 +1388,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(lang, "prefix"), - multilingual_quasi_f1_score_metric(lang), + MultilingualQuasiExactMatchMetric(lang, "prefix"), + MultilingualQuasiF1ScoreMetric(lang), ], ) for lang in [ @@ -1425,8 +1425,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1679,9 +1679,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1726,9 +1726,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1790,9 +1790,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1855,9 +1855,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1941,9 +1941,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1997,9 +1997,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + 
LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2029,9 +2029,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2136,9 +2136,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2217,9 +2217,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2264,9 +2264,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2328,9 +2328,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2386,8 +2386,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2421,9 +2421,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2471,10 +2471,10 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ] - + ([loglikelihood_acc_metric(normalization=LogProbPMINorm())] if subset 
== "challenge" else []), # type: ignore + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore ), ) for subset in ["easy", "challenge"] @@ -2507,10 +2507,10 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ] - + ([loglikelihood_acc_metric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore ), ) for subset in ["easy", "challenge"] @@ -2535,8 +2535,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2572,10 +2572,10 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ] - + ([loglikelihood_acc_metric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore ), ) for subset in ["easy", "challenge"] @@ -2631,8 +2631,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2704,8 +2704,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2867,8 +2867,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2901,8 +2901,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2943,8 +2943,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2993,9 +2993,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - 
loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -3051,8 +3051,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3089,8 +3089,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3123,8 +3123,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3158,8 +3158,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3204,8 +3204,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3248,8 +3248,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3277,7 +3277,7 @@ few_shots_split="validation", generation_size=25, metrics=[ - multilingual_quasi_exact_match_metric(Language.CHINESE, "full"), + MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"), ], stop_sequence=("\n",), ) @@ -3302,7 +3302,7 @@ few_shots_split="train", generation_size=25, metrics=[ - multilingual_quasi_exact_match_metric(language, "full"), + MultilingualQuasiExactMatchMetric(language, "full"), ], stop_sequence=("\n",), ) @@ -3341,7 +3341,7 @@ few_shots_split="train", generation_size=25, metrics=[ - multilingual_quasi_exact_match_metric(language, "full"), + MultilingualQuasiExactMatchMetric(language, "full"), ], stop_sequence=("\n",), ) @@ -3413,9 +3413,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -3504,8 +3504,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + 
LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3541,8 +3541,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3576,8 +3576,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3614,8 +3614,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3650,8 +3650,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3710,8 +3710,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3755,9 +3755,9 @@ evaluation_splits=("test",), hf_avail_splits=["test"], metrics=[ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ) for language in [ @@ -3787,9 +3787,9 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=[ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ) for formulation in [ @@ -3839,12 +3839,12 @@ hf_avail_splits=["train"], stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(language, "prefix"), - multilingual_quasi_f1_score_metric(language), + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), ] if subset in ["entity", "long_answer", "short_phrase"] else [ - multilingual_quasi_exact_match_metric(language, "full"), + MultilingualQuasiExactMatchMetric(language, "full"), ], ) for subset in MKQA_TASK_TO_ID.keys() @@ -3896,8 +3896,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(lang, "prefix"), - multilingual_quasi_f1_score_metric(lang), + MultilingualQuasiExactMatchMetric(lang, "prefix"), + MultilingualQuasiF1ScoreMetric(lang), ], ) for lang in [ @@ -3931,8 +3931,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - 
multilingual_quasi_exact_match_metric(Language.FRENCH, "prefix"), - multilingual_quasi_f1_score_metric(Language.FRENCH), + MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.FRENCH), ], ) ] @@ -3956,8 +3956,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(Language.RUSSIAN, "prefix"), - multilingual_quasi_f1_score_metric(Language.RUSSIAN), + MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), ], ) ] @@ -4051,7 +4051,7 @@ hf_subset=subset, evaluation_splits=("test",), few_shots_split="validation", - metrics=[multilingual_quasi_exact_match_metric(Language.ARABIC, "full"), loglikelihood_acc_metric()], + metrics=[MultilingualQuasiExactMatchMetric(Language.ARABIC, "full"), LogLikelihoodAccMetric()], generation_size=5, stop_sequence=("\n",), ) @@ -4078,7 +4078,7 @@ few_shots_split="valid", generation_size=5, stop_sequence=["\n"], - metrics=[multilingual_quasi_exact_match_metric(Language.FRENCH, "full"), loglikelihood_acc_metric()], + metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()], ) ] @@ -4101,7 +4101,7 @@ few_shots_split="train", generation_size=5, stop_sequence=["\n"], - metrics=[multilingual_quasi_exact_match_metric(language, "full"), loglikelihood_acc_metric()], + metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()], ) for language in [ Language.HINDI, diff --git a/src/lighteval/tasks/multilingual/utils/task_utils.py b/src/lighteval/tasks/multilingual/utils/task_utils.py index d8e73dac8..d439eed16 100644 --- a/src/lighteval/tasks/multilingual/utils/task_utils.py +++ b/src/lighteval/tasks/multilingual/utils/task_utils.py @@ -21,7 +21,7 @@ # SOFTWARE. -from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.templates.utils.formulation import Formulation, MCFFormulation @@ -37,6 +37,6 @@ def get_metrics_for_formulation(formulation: Formulation, metrics: list[Metric]) match formulation: # case MCFFormulation(choice_prefix="Letters"): - return [loglikelihood_acc_metric(normalization=None)] + return [LogLikelihoodAccMetric(normalization=None)] case _: return metrics diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 27592de82..0a91c5554 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import ast import collections import copy import importlib @@ -139,22 +140,34 @@ def get_tasks_configs(self, task: str) -> list[LightevalTaskConfig]: returns a LightevalTaskConfig object based on the task name and fewshot and truncate_few_shots values. 
""" - task_info_dict = self.taskinfo_selector(task) + task_to_params = self.taskinfo_selector(task) configs = [] - for task_name, task_info in task_info_dict.items(): - # We can have multiple few_shot and truncate_few_shots values for the same task - for task_info_dict in task_info: + for task_name, task_param in task_to_params.items(): + # We can have multiple versions of the same task running (for ex, different few shots, different metric params, etc) + for subtask_param in task_param: config = self.task_registry.get(task_name) - if config is not None: - config = copy.deepcopy(config) - config.num_fewshots = task_info_dict["fewshots"] - config.truncate_fewshots = task_info_dict["truncate_fewshots"] - config.full_name = f"{task_name}|{config.num_fewshots}" - configs.append(config) - else: + if config is None: raise ValueError(f"Cannot find task {task_name} in task list or in custom task registry") + config = copy.deepcopy(config) + config.num_fewshots = subtask_param["fewshots"] + config.truncate_fewshots = subtask_param["truncate_fewshots"] + config.full_name = f"{task_name}|{config.num_fewshots}" + # If some tasks are parametrizable and in cli, we set attributes here + for metric in [m for m in config.metrics if "@" in m.metric_name]: # parametrizable metric + for attribute, value in subtask_param["metric_params"].items(): + setattr(metric.sample_level_fn, attribute, value) + required = getattr(metric.sample_level_fn, "attribute_must_be_set", []) + for attribute in required: + if getattr(metric.sample_level_fn, attribute) is None: + raise ValueError( + f"Metric {metric.metric_name} for task {task_name} " + f"was not correctly parametrized. Forgot to set '{attribute}'." + ) + + configs.append(config) + return configs @property @@ -237,7 +250,7 @@ def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]: - A sorted list of unique task names in the format "suite|task". - A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values. 
""" - few_shot_dict = collections.defaultdict(list) + task_to_params = collections.defaultdict(list) # We can provide a path to a file with a list of tasks or a string of comma-separated tasks if os.path.exists(tasks): @@ -257,8 +270,15 @@ def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]: expanded_tasks_list.extend(expanded_tasks) for task in expanded_tasks_list: + metric_params_dict = {} try: suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|")) + if "@" in task_name: + task_name, metric_params = task_name.split("@") + # We convert k:v,k2:v2 to {"k": "v", "k2": "v2"}, then to correct type + metric_params_dict = dict(item.split("=") for item in metric_params.split(",") if item) + metric_params_dict = {k: ast.literal_eval(v) for k, v in metric_params_dict.items()} + truncate_few_shots = int(truncate_few_shots) except ValueError: raise ValueError( @@ -279,9 +299,15 @@ def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]: # This adds support for task supersets (eg: mmlu -> all the mmlu tasks) for expanded_task in self.expand_task_definition(f"{suite_name}|{task_name}"): # Store few_shot info for each task name (suite|task) - few_shot_dict[expanded_task].append({"fewshots": few_shot, "truncate_fewshots": truncate_few_shots}) + task_to_params[expanded_task].append( + { + "fewshots": few_shot, + "truncate_fewshots": truncate_few_shots, + "metric_params": metric_params_dict, + } + ) - return few_shot_dict + return task_to_params @property @lru_cache diff --git a/tests/metrics/test_extractive_match.py b/tests/metrics/test_extractive_match.py index 603813ef5..d2fd71606 100644 --- a/tests/metrics/test_extractive_match.py +++ b/tests/metrics/test_extractive_match.py @@ -23,11 +23,11 @@ import pytest import sympy -from lighteval.metrics.dynamic_metrics import ( +from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric +from lighteval.metrics.utils.extractive_match_utils import ( ExprExtractionConfig, IndicesExtractionConfig, LatexExtractionConfig, - multilingual_extractive_match_metric, ) from lighteval.metrics.utils.math_comparison import sympy_expr_eq from lighteval.models.model_output import ModelResponse @@ -66,12 +66,12 @@ def compare_strings( model_response = ModelResponse(text=[pred]) doc = Doc(choices=[gold, "", "", ""], query="", gold_index=0) - return multilingual_extractive_match_metric( + return MultilingualExtractiveMatchMetric( language=language, gold_extraction_target=extraction_targets, pred_extraction_target=extraction_targets, precision=precision, - ).sample_level_fn( + ).compute( model_response=model_response, doc=doc, ) diff --git a/tests/metrics/test_metric_requests.py b/tests/metrics/test_metric_requests.py index b748f7363..7ceb94c68 100644 --- a/tests/metrics/test_metric_requests.py +++ b/tests/metrics/test_metric_requests.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import LogProbPMINorm from lighteval.metrics.utils.metric_utils import Metric @@ -69,7 +69,7 @@ def test_pmi_request(): ] ) - metric = loglikelihood_acc_metric(normalization=LogProbPMINorm()) + metric = LogLikelihoodAccMetric(normalization=LogProbPMINorm()) pmi_test_config = get_pmi_task(metrics=[metric]) task = LightevalTask(pmi_test_config) result = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"] @@ -93,7 +93,7 @@ def test_pmi_request_with_logprob_metric(): ] ) - metrics = [loglikelihood_acc_metric(normalization=LogProbPMINorm()), loglikelihood_acc_metric(normalization=None)] + metrics = [LogLikelihoodAccMetric(normalization=LogProbPMINorm()), LogLikelihoodAccMetric(normalization=None)] pmi_test_config = get_pmi_task(metrics=metrics) task = LightevalTask(pmi_test_config) result = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"] @@ -126,7 +126,7 @@ def test_pmi_request_with_generative_metric(): ], ) - metrics = [loglikelihood_acc_metric(normalization=LogProbPMINorm()), Metrics.exact_match.value] + metrics = [LogLikelihoodAccMetric(normalization=LogProbPMINorm()), Metrics.exact_match.value] pmi_test_config = get_pmi_task(metrics=metrics) task = LightevalTask(pmi_test_config) results = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"] diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json index a55fd6f82..7c8c77d79 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json +++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d38c5cdb9dd354222ccd238df2675b0999181b663322dab612655aa12f9ef372 -size 49944 +oid sha256:2fbcbcf4031d545999b8e02afffa2537f642a1239664af16160e5fcd250a4ecc +size 50626 diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json index 7bc559c14..66ab85090 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json +++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be52fd994b9978b91eb057bb72ec6110e2e49016ca0f2b296ba5bf75ba056725 -size 49883 +oid sha256:d1302090702deaf018f21f1dc5ffd2a2a2b93e19b50aa459508146f130aa9ecf +size 50565 diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 9679e6592..8f4be7807 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5dffe1e990e1e839322b74ff02f306ea468ad7602492f62f987cae1bb546b84 -size 48376580 +oid sha256:c2080305011a7ac8b0895ec1fbb26b45af4e3dced6272abf67156ebf57656f88 +size 48360080 diff --git a/tests/tasks/test_lighteval_task.py b/tests/tasks/test_lighteval_task.py index d338b8a76..df2b5ad4a 100644 --- a/tests/tasks/test_lighteval_task.py +++ b/tests/tasks/test_lighteval_task.py @@ -20,9 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
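
The same rename runs through every file in this diff: the snake_case dynamic-metric factory functions become classes with identical constructor arguments. A minimal before/after sketch, assuming only the import paths shown in the hunks above:

from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
from lighteval.metrics.normalizations import LogProbPMINorm

# before this change: metric = loglikelihood_acc_metric(normalization=LogProbPMINorm())
metric = LogLikelihoodAccMetric(normalization=LogProbPMINorm())
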
-import pytest -from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig, extract_num_samples +from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -64,18 +63,3 @@ def test_dataset_filter(): filtered_docs = task.eval_docs() assert len(filtered_docs) == 1 assert filtered_docs[0].query == "hi" - - -@pytest.mark.parametrize( - "metric_name, expected", - [ - ("maj@1", 1), - ("pass@1:32_samples", 32), - ("pass@10:64_samples", 64), - ("codegen_pass@1:16", 16), - ("other_name@2", 2), - ("other_name", 1), - ], -) -def test_extract_num_samples(metric_name, expected): - assert extract_num_samples(metric_name) == expected diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py index 0f51a88b5..caeb4e787 100644 --- a/tests/tasks/test_registry.py +++ b/tests/tasks/test_registry.py @@ -53,8 +53,8 @@ def test_custom_task_groups(): assert set(task_info.keys()) == {"custom|test_task_revision"} assert task_info["custom|test_task_revision"] == [ - {"fewshots": 0, "truncate_fewshots": False}, - {"fewshots": 1, "truncate_fewshots": False}, + {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}, + {"fewshots": 1, "truncate_fewshots": False, "metric_params": {}}, ] @@ -66,7 +66,7 @@ def test_custom_tasks(): task_info = registry.taskinfo_selector("custom|test_task_revision|0|0") assert list(task_info.keys()) == ["custom|test_task_revision"] - assert task_info["custom|test_task_revision"] == [{"fewshots": 0, "truncate_fewshots": False}] + assert task_info["custom|test_task_revision"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}] def test_superset_expansion(): @@ -78,9 +78,9 @@ task_info = registry.taskinfo_selector("lighteval|storycloze|0|0") assert list(task_info.keys()) == ["lighteval|storycloze:2016", "lighteval|storycloze:2018"] - assert task_info["lighteval|storycloze:2016"] == [{"fewshots": 0, "truncate_fewshots": False}] and task_info[ - "lighteval|storycloze:2018" - ] == [{"fewshots": 0, "truncate_fewshots": False}] + assert task_info["lighteval|storycloze:2016"] == [ + {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}} + ] and task_info["lighteval|storycloze:2018"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}] def test_superset_with_subset_task(): @@ -95,11 +95,38 @@ assert len(task_info.keys()) == 57 # Since it's defined twice assert task_info["original|mmlu:abstract_algebra"] == [ - {"fewshots": 3, "truncate_fewshots": False}, - {"fewshots": 5, "truncate_fewshots": False}, + { + "fewshots": 3, + "truncate_fewshots": False, + "metric_params": {}, + }, + {"fewshots": 5, "truncate_fewshots": False, "metric_params": {}}, ] +def test_cli_sampling_params(): + """ + Tests setting a metric's sampling parameters from the CLI. + """ + registry = Registry() + + task_info = registry.taskinfo_selector("lighteval|math_500@k=1|0|0") + + assert list(task_info.keys()) == ["lighteval|math_500"] + assert task_info["lighteval|math_500"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {"k": 1}}] + + +def test_cli_sampling_params_fail(): + """ + Tests that setting sampling parameters from the CLI fails when the args are malformed.
+ """ + registry = Registry() + + # creation of object should fail + with pytest.raises(ValueError): + registry.get_tasks_configs("lighteval|math_500@plop|0|0") + + def test_task_group_expansion_with_subset_expansion(): """ Tests that task info selector correctly handles a group with task superset is provided. diff --git a/tests/test_unit_base_metrics.py b/tests/test_unit_base_metrics.py index 65302d127..575ebf595 100644 --- a/tests/test_unit_base_metrics.py +++ b/tests/test_unit_base_metrics.py @@ -24,11 +24,11 @@ import pytest from lighteval.metrics.dynamic_metrics import ( - loglikelihood_acc_metric, - multilingual_quasi_exact_match_metric, - multilingual_quasi_f1_score_metric, - normalized_multi_choice_prob_metric, - probability_metric, + LogLikelihoodAccMetric, + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, + NormalizedMultiChoiceProbMetric, + ProbabilityMetric, ) from lighteval.metrics.metrics_sample import ExactMatches from lighteval.metrics.normalizations import LogProbCharNorm, helm_normalizer @@ -196,86 +196,88 @@ def test_prob(self): # Simple case model_response = ModelResponse(logprobs=np.log([0.7])) - prob_metric = probability_metric() - result = prob_metric.sample_level_fn(doc, model_response) - assert result == pytest.approx(0.7) + prob_metric = ProbabilityMetric() + result = prob_metric.compute_sample(doc=doc, model_response=model_response) + assert result[prob_metric.metric_name] == pytest.approx(0.7) # Aggregation function test model_response = ModelResponse(logprobs=np.log([0.7, 0.1])) - prob_min_metric = probability_metric(aggregation_function=np.min) - result = prob_min_metric.sample_level_fn(doc, model_response) - assert result == pytest.approx(0.1) + prob_min_metric = ProbabilityMetric(aggregation_function=np.min) + result = prob_min_metric.compute_sample(doc=doc, model_response=model_response) + assert result[prob_metric.metric_name] == pytest.approx(0.1) def test_mc_probability_metric(self): doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.35, 0.1, 0.05])) - mc_prob_metric = normalized_multi_choice_prob_metric() + mc_prob_metric = NormalizedMultiChoiceProbMetric() - result = mc_prob_metric.sample_level_fn( - doc, - model_response, + result = mc_prob_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == pytest.approx(0.7) + assert result[mc_prob_metric.metric_name] == pytest.approx(0.7) doc = Doc(query="Test query", choices=["AA", "BB", "CCC"], gold_index=1, task_name="test") model_response = ModelResponse(logprobs=np.log([0.1**2, 0.35**2, 0.05**3])) - prob_norm_metric = normalized_multi_choice_prob_metric(normalization=LogProbCharNorm()) - result = prob_norm_metric.sample_level_fn( - doc, - model_response, + prob_norm_metric = NormalizedMultiChoiceProbMetric(normalization=LogProbCharNorm()) + result = prob_norm_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == pytest.approx(0.7) + assert result[prob_norm_metric.metric_name] == pytest.approx(0.7) def test_acc(self): # Test without normalization doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.7, 0.2, 0.3, 0.4])) - acc_metric = loglikelihood_acc_metric() - result = acc_metric.sample_level_fn( - doc, - model_response, + acc_metric = LogLikelihoodAccMetric() + result = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - 
assert result == 1 # The highest logprob (3.0) is at index 3, which is not in gold_ixs + assert result[acc_metric.metric_name] == 1 # The highest prob (0.7) is at index 0, which is in gold_ixs # Test 0 acc doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.1, 0.2, 0.3, 0.4])) - result = acc_metric.sample_level_fn( - doc, - model_response, + result = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 0 + assert result[acc_metric.metric_name] == 0 # Test with normalization doc = Doc(query="Test query", choices=["ABCDE", "AB"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.5, 0.6])) - acc_norm_metric = loglikelihood_acc_metric(normalization=LogProbCharNorm()) - result_norm = acc_norm_metric.sample_level_fn( - doc, - model_response, + acc_norm_metric = LogLikelihoodAccMetric(normalization=LogProbCharNorm()) + result_norm = acc_norm_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result_norm == 1 # After normalization, "ABCDE" should have the highest score + assert ( + result_norm[acc_norm_metric.metric_name] == 1 + ) # After normalization, "ABCDE" should have the highest score # Test with multiple correct solutions doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 3], task_name="test") model_response = ModelResponse(logprobs=np.log([0.5, 0.6, 0.7, 0.8])) - result_multi = acc_metric.sample_level_fn( - doc, - model_response, + result_multi = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result_multi == 1 + assert result_multi[acc_metric.metric_name] == 1 # Test when the highest logprob is not in gold_ixs doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 2], task_name="test") model_response = ModelResponse(logprobs=[0.5, 0.6, 0.7, 0.8]) - result_incorrect = acc_metric.sample_level_fn( - doc, - model_response, + result_incorrect = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result_incorrect == 0 + assert result_incorrect[acc_metric.metric_name] == 0 def test_f1_dynamic_metric(self): """ @@ -286,21 +288,21 @@ model_response = ModelResponse(text=["hello, the world"]) # Normalization test - f1_metric = multilingual_quasi_f1_score_metric(language=Language.ENGLISH) - result = f1_metric.sample_level_fn( - doc, - model_response, + f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH) + result = f1_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 1 + assert result[f1_metric.metric_name] == 1 model_response = ModelResponse(text=["hello, the world how"]) - f1_metric = multilingual_quasi_f1_score_metric(language=Language.ENGLISH, aggregation_function=np.min) - result = f1_metric.sample_level_fn( - doc, - model_response, + f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH, aggregation_function=np.min) + result = f1_metric.compute_sample( + doc=doc, + model_response=model_response, ) # 2 * (precision * recall) / (precision + recall) = 2 * (1 * 2/3) / (1 + 2/3) = 0.8 - assert result == 0.8 + assert result[f1_metric.metric_name] == 0.8 def test_exact_match_dynamic_metric(self): """ @@ -310,20 +312,20 @@ model_response = ModelResponse(text=["hello, the world"]) # Normalization test - em_metric =
multilingual_quasi_exact_match_metric(language=Language.ENGLISH, match_type="full") - result = em_metric.sample_level_fn( - doc, - model_response, + em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full") + result = em_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 1 + assert result[em_metric.metric_name] == 1 model_response = ModelResponse(text=["hello, the world how"]) - em_metric = multilingual_quasi_exact_match_metric(language=Language.ENGLISH, match_type="full") - result = em_metric.sample_level_fn( - doc, - model_response, + em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full") + result = em_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 0 + assert result[em_metric.metric_name] == 0 @pytest.mark.skip(reason="Need to understand what it does.") def test_pass_at_k_estimator(self): diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py index 4cc2853ae..6d1764593 100644 --- a/tests/test_unit_harness_metrics.py +++ b/tests/test_unit_harness_metrics.py @@ -39,8 +39,6 @@ PATH_TO_HARNESS_METRICS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_metrics.json") -POSSIBLE_METRICS = Metrics.all_metrics() - def pytest_generate_tests(metafunc: pytest.Metafunc): """Initializes the main test setup. This function is automatically called by pytest and @@ -106,7 +104,7 @@ def test_model_prediction(prompt_inputs: tuple[str, str, list]): # noqa: C901 metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()} - metric_reference = {k: v for k, v in example.items() if k in POSSIBLE_METRICS} + metric_reference = {k: example[k] for k in results.keys()} error_msg += f"Prediction: {results}\n" error_msg += f"Reference: {metric_reference}\n" error_msg += f"Returned : {metric_result}"
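
Taken together, the test updates above pin down the new sample-level API: metrics are classes, `compute_sample` takes keyword arguments, and it returns a dict keyed by `metric_name` rather than a bare score. A minimal sketch of the new calling convention, reusing only the fixtures and signatures shown in these tests:

import numpy as np

from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc

doc = Doc(query="Test query", choices=["A", "B"], gold_index=0, task_name="test")
model_response = ModelResponse(logprobs=np.log([0.7, 0.3]))

acc_metric = LogLikelihoodAccMetric()
# compute_sample returns {metric_name: score} instead of a bare float,
# so callers index the result with the metric's own name
result = acc_metric.compute_sample(doc=doc, model_response=model_response)
assert result[acc_metric.metric_name] == 1
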