From ead5bdbd4ae04fe18874f4c67af32d9a6885679f Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 18 Aug 2025 15:26:46 +0000 Subject: [PATCH 01/38] defined a sampling type for metrics, works for cli, now needs to port all old evals to the new format and figure out how to provide sane defaults --- src/lighteval/metrics/metrics.py | 214 ++++++++----- src/lighteval/metrics/metrics_sample.py | 322 ++++++-------------- src/lighteval/metrics/sample_preparator.py | 12 +- src/lighteval/metrics/utils/metric_utils.py | 14 +- src/lighteval/tasks/default_tasks.py | 3 +- src/lighteval/tasks/lighteval_task.py | 28 +- src/lighteval/tasks/registry.py | 53 +++- 7 files changed, 293 insertions(+), 353 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 903295240..a3b5834e5 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -98,35 +98,35 @@ class Metrics(Enum): ) bert_score = SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], - sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip).compute, + sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip), category=SamplingMethod.GENERATIVE, corpus_level_fn={"BERTScore-P": np.mean, "BERTScore-R": np.mean, "BERTScore-F": np.mean}, higher_is_better={"BERTScore-P": True, "BERTScore-R": True, "BERTScore-F": True}, ) bits_per_byte = CorpusLevelMetric( metric_name="bits_per_byte", - sample_level_fn=PerplexityPreparator(units_type="bytes").prepare, + sample_level_fn=PerplexityPreparator(units_type="bytes"), category=SamplingMethod.PERPLEXITY, corpus_level_fn=CorpusLevelPerplexityMetric("bits_per_byte").compute, higher_is_better=False, ) bleu = CorpusLevelMetric( metric_name="bleu", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, corpus_level_fn=CorpusLevelTranslationMetric("bleu").compute, higher_is_better=True, ) bleu_1 = SampleLevelMetric( metric_name="bleu_1", - sample_level_fn=BLEU(n_gram=1).compute, + sample_level_fn=BLEU(n_gram=1), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) bleu_4 = SampleLevelMetric( metric_name="bleu_4", - sample_level_fn=BLEU(n_gram=4).compute, + sample_level_fn=BLEU(n_gram=4), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -134,28 +134,28 @@ class Metrics(Enum): bleurt = SampleLevelMetric( metric_name="bleurt", - sample_level_fn=BLEURT().compute, + sample_level_fn=BLEURT(), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) byte_perplexity = CorpusLevelMetric( metric_name="byte_perplexity", - sample_level_fn=PerplexityPreparator(units_type="bytes").prepare, + sample_level_fn=PerplexityPreparator(units_type="bytes"), category=SamplingMethod.PERPLEXITY, corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute, higher_is_better=False, ) chrf = CorpusLevelMetric( metric_name="chrf", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, corpus_level_fn=CorpusLevelTranslationMetric("chrf").compute, higher_is_better=True, ) chrf_plus = CorpusLevelMetric( metric_name="chrf++", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, 
corpus_level_fn=CorpusLevelTranslationMetric("chrf++").compute, higher_is_better=True, @@ -164,7 +164,7 @@ class Metrics(Enum): metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"], sample_level_fn=StringDistance( metric_types=["longest_common_prefix_length", "edit_distance", "edit_similarity"], strip_prediction=True - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn={"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max}, higher_is_better={"longest_common_prefix_length": True, "edit_distance": False, "edit_similarity": True}, @@ -178,7 +178,7 @@ class Metrics(Enum): ) exact_match = SampleLevelMetric( metric_name="em", - sample_level_fn=ExactMatches(strip_strings=True).compute, + sample_level_fn=ExactMatches(strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -196,7 +196,7 @@ class Metrics(Enum): metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], sample_level_fn=Extractiveness( normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ "summarization_coverage": np.mean, @@ -211,28 +211,28 @@ class Metrics(Enum): ) f1_score_quasi = SampleLevelMetric( metric_name="f1_score_quasi", - sample_level_fn=F1_score(normalize_gold=helm_normalizer, normalize_pred=helm_normalizer).compute, + sample_level_fn=F1_score(normalize_gold=helm_normalizer, normalize_pred=helm_normalizer), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) f1_score = SampleLevelMetric( metric_name="f1", - sample_level_fn=F1_score().compute, + sample_level_fn=F1_score(), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) f1_score_macro = CorpusLevelMetric( metric_name="f1", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, corpus_level_fn=CorpusLevelF1Score(average="macro").compute, higher_is_better=True, ) f1_score_micro = CorpusLevelMetric( metric_name="f1", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, corpus_level_fn=CorpusLevelF1Score(average="micro").compute, higher_is_better=True, @@ -241,7 +241,7 @@ class Metrics(Enum): metric_name="summac", sample_level_fn=Faithfulness( normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -257,86 +257,161 @@ class Metrics(Enum): ) loglikelihood_acc = SampleLevelMetric( metric_name="acc", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute, + sample_level_fn=LoglikelihoodAcc(logprob_normalization=None), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) loglikelihood_acc_norm = SampleLevelMetric( metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()).compute, + sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) loglikelihood_acc_norm_nospace = SampleLevelMetric( metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm(ignore_first_space=True)).compute, + 
sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm(ignore_first_space=True)), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) loglikelihood_acc_norm_single_token = SampleLevelMetric( metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()).compute, + sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) loglikelihood_acc_single_token = SampleLevelMetric( metric_name="acc", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute, + sample_level_fn=LoglikelihoodAcc(logprob_normalization=None), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) loglikelihood_f1 = CorpusLevelMetric( metric_name="loglikelihood_f1", - sample_level_fn=LoglikelihoodPreparator().prepare, + sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, corpus_level_fn=CorpusLevelF1Score(None).compute, higher_is_better=True, ) loglikelihood_f1_single_token = CorpusLevelMetric( metric_name="loglikelihood_f1", - sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare, + sample_level_fn=LoglikelihoodPreparator(is_single_token=True), category=SamplingMethod.LOGPROBS, corpus_level_fn=CorpusLevelF1Score(None).compute, higher_is_better=True, ) mcc = CorpusLevelMetric( metric_name="mcc", - sample_level_fn=LoglikelihoodPreparator().prepare, + sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) mcc_single_token = CorpusLevelMetric( metric_name="mcc", - sample_level_fn=LoglikelihoodPreparator().prepare, + sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) + # NEW + avg_at_k = SampleLevelMetric( + metric_name="avg@k", + sample_level_fn=AvgAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + avg_at_k_math = SampleLevelMetric( + metric_name="avg@k", + sample_level_fn=AvgAtK( + sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + language=Language.ENGLISH, + gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + precision=6, + ).sample_level_fn(doc, model_response), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + g_pass_at_k = SampleLevelMetricGrouping( + metric_name=["g-pass@k:n samples"], + sample_level_fn=GPassAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + maj_at_k = SampleLevelMetric( + metric_name="maj@k", + sample_level_fn=MajAtK(), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + pass_at_k = SampleLevelMetric( + metric_name="pass@k:n samples", + sample_level_fn=PassAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + pass_at_k_math = SampleLevelMetric( + metric_name="pass@k:n samples", + sample_level_fn=PassAtK( + strip_strings=True, + # Extracting mathematical expressions and latex expressions + sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + language=Language.ENGLISH, + 
gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + precision=6, + ).sample_level_fn(doc, model_response), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + pass_at_k_letters = SampleLevelMetric( + metric_name="pass@k:n samples", + sample_level_fn=PassAtK( + sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + language=Language.ENGLISH, + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ).sample_level_fn(doc, model_response), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + # OLD maj_at_4_math = SampleLevelMetric( metric_name="maj@4", sample_level_fn=MajAtK( k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) maj_at_5 = SampleLevelMetric( metric_name="maj@5", - sample_level_fn=MajAtK(k=5).compute, + sample_level_fn=MajAtK(k=5), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) maj_at_8 = SampleLevelMetric( metric_name="maj@8", - sample_level_fn=MajAtK(k=8).compute, + sample_level_fn=MajAtK(k=8), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -345,7 +420,7 @@ class Metrics(Enum): metric_name="maj@8", sample_level_fn=MajAtK( k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -360,7 +435,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -379,7 +454,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -396,7 +471,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -414,7 +489,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -431,7 +506,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -448,7 +523,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -465,7 +540,7 @@ class Metrics(Enum): pred_extraction_target=[ExprExtractionConfig(), 
LatexExtractionConfig()], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -473,7 +548,7 @@ class Metrics(Enum): mrr = SampleLevelMetric( metric_name="mrr", - sample_level_fn=MRR().compute, + sample_level_fn=MRR(), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, @@ -487,49 +562,42 @@ class Metrics(Enum): ) multi_f1_numeric = CorpusLevelMetric( metric_name="mf1", - sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare, + sample_level_fn=LoglikelihoodPreparator(is_single_token=True), category=SamplingMethod.LOGPROBS, corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute, higher_is_better=True, ) - avg_at_64 = SampleLevelMetric( - metric_name="avg@64", - sample_level_fn=PassAtK(k=64, n=64, strip_strings=True).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) pass_at_1 = SampleLevelMetric( metric_name="pass@1:32_samples", - sample_level_fn=PassAtK(k=1, n=32, strip_strings=True).compute, + sample_level_fn=PassAtK(k=1, n=32, strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) pass_at_10 = SampleLevelMetric( metric_name="pass@10:32_samples", - sample_level_fn=PassAtK(k=10, n=32, strip_strings=True).compute, + sample_level_fn=PassAtK(k=10, n=32, strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) pass_at_100 = SampleLevelMetric( metric_name="pass@100:32_samples", - sample_level_fn=PassAtK(k=100, n=32, strip_strings=True).compute, + sample_level_fn=PassAtK(k=100, n=32, strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) g_pass_at_16 = SampleLevelMetricGrouping( metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute, + sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), ) g_pass_at_8_16 = SampleLevelMetricGrouping( metric_name=["G-Pass@8-16:48_samples"], - sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute, + sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), @@ -549,7 +617,7 @@ class Metrics(Enum): pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), aggregation_function=max, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), @@ -569,14 +637,14 @@ class Metrics(Enum): pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), aggregation_function=max, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, 
strip_strings=True).all_metrics, True), ) perfect_exact_match = SampleLevelMetric( metric_name="perfect_em", - sample_level_fn=ExactMatches().compute, + sample_level_fn=ExactMatches(), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -590,7 +658,7 @@ class Metrics(Enum): ) prefix_exact_match = SampleLevelMetric( metric_name="pem", - sample_level_fn=ExactMatches(strip_strings=True, type_exact_match="prefix").compute, + sample_level_fn=ExactMatches(strip_strings=True, type_exact_match="prefix"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -601,7 +669,7 @@ class Metrics(Enum): normalize_gold=helm_normalizer, normalize_pred=helm_normalizer, type_exact_match="prefix", - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -612,7 +680,7 @@ class Metrics(Enum): normalize_gold=helm_normalizer, normalize_pred=helm_normalizer, strip_strings=True, - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -621,14 +689,14 @@ class Metrics(Enum): metric_name="qem", sample_level_fn=ExactMatches( strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) quasi_exact_match_triviaqa = SampleLevelMetric( metric_name="qem", - sample_level_fn=ExactMatches(strip_strings=True, normalize_pred=harness_triviaqa_normalizer).compute, + sample_level_fn=ExactMatches(strip_strings=True, normalize_pred=harness_triviaqa_normalizer), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -637,21 +705,21 @@ class Metrics(Enum): metric_name="qem", sample_level_fn=ExactMatches( strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) recall_at_1_single_token = SampleLevelMetric( metric_name="acc", - sample_level_fn=Recall(at=1).compute, + sample_level_fn=Recall(at=1), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, ) recall_at_2_single_token = SampleLevelMetric( metric_name="recall@2", - sample_level_fn=Recall(at=2).compute, + sample_level_fn=Recall(at=2), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, @@ -677,35 +745,35 @@ class Metrics(Enum): bootstrap=True, normalize_gold=bigbench_normalizer, normalize_pred=bigbench_normalizer, - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn={"rouge1": np.mean, "rouge2": np.mean, "rougeL": np.mean, "rougeLsum": np.mean}, higher_is_better={"rouge1": True, "rouge2": True, "rougeL": True, "rougeLsum": True}, ) rouge1 = SampleLevelMetric( metric_name="rouge1", - sample_level_fn=ROUGE("rouge1").compute, + sample_level_fn=ROUGE("rouge1"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) rouge2 = SampleLevelMetric( metric_name="rouge2", - sample_level_fn=ROUGE("rouge2").compute, + sample_level_fn=ROUGE("rouge2"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) rougeL = SampleLevelMetric( metric_name="rougeL", - sample_level_fn=ROUGE("rougeL").compute, + sample_level_fn=ROUGE("rougeL"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) rougeLsum = SampleLevelMetric( metric_name="rougeLsum", - sample_level_fn=ROUGE("rougeLsum").compute, + 
sample_level_fn=ROUGE("rougeLsum"), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -722,14 +790,14 @@ class Metrics(Enum): ) target_perplexity = SampleLevelMetric( metric_name="ppl", - sample_level_fn=TargetPerplexityPreparator(units_type="words").prepare, + sample_level_fn=TargetPerplexityPreparator(units_type="words"), category=SamplingMethod.LOGPROBS, corpus_level_fn=CorpusLevelPerplexityMetric("perplexity").compute, higher_is_better=False, ) ter = CorpusLevelMetric( metric_name="ter", - sample_level_fn=GenerativePreparator().prepare, + sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, corpus_level_fn=CorpusLevelTranslationMetric("ter").compute, higher_is_better=False, @@ -743,7 +811,7 @@ class Metrics(Enum): ) word_perplexity = CorpusLevelMetric( metric_name="word_perplexity", - sample_level_fn=PerplexityPreparator(units_type="words").prepare, + sample_level_fn=PerplexityPreparator(units_type="words"), category=SamplingMethod.PERPLEXITY, corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute, higher_is_better=False, @@ -765,7 +833,7 @@ class Metrics(Enum): pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -781,7 +849,7 @@ class Metrics(Enum): pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, @@ -797,7 +865,7 @@ class Metrics(Enum): pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, ).sample_level_fn(doc, model_response), - ).compute, + ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 706a7664a..615f311f6 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1075,12 +1075,54 @@ def compute(self, model_responses: list[ModelResponse], docs: list[Doc], **kwarg return metrics -class AvgAtK: +class SamplingMetric: def __init__( self, - k: int, + normalize: Callable | None = None, + strip_strings: bool = False, sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None, ): + self.normalize = normalize + self.strip_strings = strip_strings + + if callable(sample_scoring_function): + self.score_sample = sample_scoring_function + self.type_exact_match = None + else: + if isinstance(sample_scoring_function, str): + if sample_scoring_function not in ["prefix", "suffix", "full"]: + raise ValueError( + f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead." 
+ ) + self.type_exact_match = sample_scoring_function + else: + self.type_exact_match = "full" + self.compute_score = self.default_sample_scoring + + def preprocess(self, text: str) -> str: + if not text: + return "" + + if self.strip_strings: + text = text.strip() + + if self.normalize: + text = self.normalize(text) + + return text + + def default_sample_scoring(self, doc: Doc, model_response: ModelResponse) -> int: + gold = doc.get_golds()[0] + pred = model_response.final_text[0] + if self.type_exact_match == "prefix": + return 1 if pred.startswith(gold) else 0 + if self.type_exact_match == "suffix": + return 1 if pred.endswith(gold) else 0 + return 1 if gold == pred else 0 + + +class AvgAtK(SamplingMetric): + def __init__(self, k: int | None = None, **kwargs): """Sample score averages all the individual k predictions scores. Args: @@ -1092,20 +1134,9 @@ def __init__( sample_scoring_function (callable | str, optional): Function to use to compute the score for each sample. If None, uses the default scoring function which is a simple exact match. """ + super().__init__(kwargs) self.k = k - # Managed the logic of the per prediction of sample scoring - if callable(sample_scoring_function): - self.compute_score = sample_scoring_function - else: - if isinstance(sample_scoring_function, str): - if sample_scoring_function not in ["prefix", "suffix", "full"]: - raise ValueError( - f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead." - ) - type_exact_match = sample_scoring_function - else: - type_exact_match = "full" - self.compute_score = self.default_sample_scoring(type_exact_match) + self.attribute_must_be_set = ["k"] def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): """Computes the metric over a list of golds and predictions for one single sample. @@ -1126,55 +1157,17 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): avg_score = np.mean(all_scores) return avg_score - def default_sample_scoring(self, type_exact_match: str) -> callable: - def sample_scoring_function(doc: Doc, model_response: ModelResponse) -> int: - """Default sample scoring function that checks if the prediction is equal to the gold.""" - pred = model_response.final_text[0] - gold = doc.get_golds()[0] + def num_samples(self): + return self.k - if type_exact_match == "prefix": - return 1 if pred.startswith(gold) else 0 - if type_exact_match == "suffix": - return 1 if pred.endswith(gold) else 0 - return 1 if gold == pred else 0 - return sample_scoring_function +class MajAtK(SamplingMetric): + def __init__(self, k: int = None, **kwargs): + """An exact match class.""" + super().__init__(kwargs) - -class MajAtK: - def __init__( - self, - k: int, - normalize_gold: Callable | None = None, - normalize_pred: Callable | None = None, - strip_strings: bool = False, - type_exact_match: str = "full", - ): - """An exact match class. - - Args: - normalize_gold (callable, optional): Function to use to normalize the reference strings. - Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. - Defaults to None if no normalization is applied. - strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. - type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). - Can be any of `prefix`, `suffix` or `full`. Defaults to "full". 
- `prefix` checks if the prediction starts with the gold, - `suffix` if the prediction ends with the gold, - `full` if the prediction and gold are equal - """ self.k = k - self.normalize_gold = normalize_gold - self.normalize_pred = normalize_pred - self.strip_strings = strip_strings - - if type_exact_match not in ["prefix", "suffix", "full"]: - # todo: we could add a set exact match - raise ValueError( - f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {type_exact_match} instead." - ) - self.type_exact_match = type_exact_match + self.attribute_must_be_set = ["k"] def compute(self, model_response: ModelResponse, docs: Doc, **kwargs): """Computes the metric over a list of golds and predictions for one single sample. @@ -1188,94 +1181,43 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs): Returns: float: Aggregated score over the current sample's items. """ + if self.k is None: + raise Exception("You did not set the value of k") golds = docs.get_golds() - predictions = model_response.final_text if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") - gold = self.get_processed_gold(golds[0]) + processed_choices = [self.preprocess(gold=g) for g in docs.get_golds()] + new_doc = Doc( + choices=processed_choices, + query=docs.query, + gold_index=docs.gold_index, + ) all_answers = [] - for pred in predictions[: self.k]: - all_answers.append(self.get_processed_pred(pred=pred)) + for pred in model_response.final_text[: self.k]: + all_answers.append(self.preprocess(pred=pred)) majority_prediction = max(all_answers, key=all_answers.count) - return self.compute_score(majority_prediction, gold) - - def get_processed_gold(self, gold: str) -> str: - if self.strip_strings: - gold = gold.strip() - - if self.normalize_gold: - gold = self.normalize_gold(gold) - - return gold - - def get_processed_pred(self, pred: str) -> str: - if not pred: - return "" - - if self.strip_strings: - pred = pred.strip() - - if self.normalize_pred: - pred = self.normalize_pred(pred) - - return pred + new_model_response = ModelResponse( + text=[majority_prediction], + ) + return self.compute_score(new_model_response, new_doc) - def compute_score(self, pred: str, gold: str) -> int: - if self.type_exact_match == "prefix": - return 1 if pred.startswith(gold) else 0 - if self.type_exact_match == "suffix": - return 1 if pred.endswith(gold) else 0 - return 1 if gold == pred else 0 + def num_samples(self): + return self.k -class PassAtK: - def __init__( - self, - k: int, - n: int | None = None, - normalize_gold: Callable | None = None, - normalize_pred: Callable | None = None, - strip_strings: bool = False, - sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None, - ): +class PassAtK(SamplingMetric): + def __init__(self, k: int | None = None, n: int | None = None, **kwargs): """Computing pass at k Args: k (int): Threshold for the number of successful attempts. n (int): Number of samples to generate - normalize_gold (callable, optional): Function to use to normalize the reference strings. - Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. - Defaults to None if no normalization is applied. - strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. - sample_scoring_function (callable or str, optional): Function to use to score each sample. 
- Either pass the full function (should take a string prediction and a string gold, and return a score between 0 and 1) - a string (any of `prefix`, `suffix` or `full`) to define the type of exact match that you want, or nothing to defaults to "full". - `prefix` checks if the prediction starts with the gold, - `suffix` if the prediction ends with the gold, - `full` if the prediction and gold are equal """ + super().__init__(kwargs) self.k = k self.n = n - self.normalize_gold = normalize_gold - self.normalize_pred = normalize_pred - self.strip_strings = strip_strings - - # Managed the logic of the per prediction of sample scoring - if callable(sample_scoring_function): - self.score_sample = sample_scoring_function - self.type_exact_match = None - else: - if isinstance(sample_scoring_function, str): - if sample_scoring_function not in ["prefix", "suffix", "full"]: - raise ValueError( - f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead." - ) - self.type_exact_match = sample_scoring_function - else: - self.type_exact_match = "full" - self.score_sample = self.default_sample_scoring + self.attribute_must_be_set = ["k"] def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: """Computes the metric over a list of golds and predictions for one single item with possibly many samples. @@ -1290,17 +1232,17 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: float: Aggregated score over the current sample's items. """ golds = doc.get_golds() - predictions = model_response.final_text if len(golds) > 1: raise Exception("Cannot compute pass@k with several golds") + predictions = model_response.final_text if self.n is None: self.n = len(predictions) logger.warning("n undefined in the pass@k. 
We assume it's the same as the sample's number of predictions.") elif len(predictions) < self.n: logger.warning(f"Number of predictions is less than {self.n} for pass@k.") - processed_choices = [self.get_processed_gold(gold=g) for g in doc.choices] + processed_choices = [self.preprocess(gold=g) for g in doc.choices] new_doc = Doc( choices=processed_choices, query=doc.query, @@ -1309,7 +1251,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: all_scores = [] for pred in predictions[: self.n]: - cur_pred = self.get_processed_pred(pred=pred) + cur_pred = self.preprocess(pred=pred) new_model_response = ModelResponse( text=[cur_pred], ) @@ -1317,37 +1259,6 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: return self.pass_at_k(all_scores) - def get_processed_gold(self, gold: str) -> str: - if self.strip_strings: - gold = gold.strip() - - if self.normalize_gold: - gold = self.normalize_gold(gold) - - return gold - - def get_processed_pred(self, pred: str) -> str: - if not pred: - return "" - - if self.strip_strings: - pred = pred.strip() - - if self.normalize_pred: - pred = self.normalize_pred(pred) - - return pred - - def default_sample_scoring(self, doc, model_response) -> int: - pred = model_response.final_text[0] - gold = doc.get_golds()[0] - - if self.type_exact_match == "prefix": - return 1 if pred.startswith(gold) else 0 - if self.type_exact_match == "suffix": - return 1 if pred.endswith(gold) else 0 - return 1 if gold == pred else 0 - def pass_at_k(self, all_scores: list[int]) -> float: """Algo from https://arxiv.org/pdf/2107.03374""" c: int = all_scores.count(1) @@ -1356,17 +1267,17 @@ def pass_at_k(self, all_scores: list[int]) -> float: return 1.0 - np.prod(1.0 - self.k / np.arange(self.n - c + 1, self.n + 1)) + def num_samples(self): + return self.n if self.n is not None else self.k -class GPassAtK: + +class GPassAtK(SamplingMetric): def __init__( self, - k: Union[int, list[int]], + k: Union[int, list[int]] | None = None, n: int | None = None, thresholds: list[float] = [0.0, 0.25, 0.5, 0.75, 1.0], - normalize_gold: Callable | None = None, - normalize_pred: Callable | None = None, - strip_strings: bool = False, - sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None, + **kwargs, ): """Computing G-Pass@k from http://arxiv.org/abs/2412.13147 @@ -1374,39 +1285,13 @@ def __init__( k (int, list): The number of successful attempts to be considered. n (int): Number of samples to generate. thresholds (list): Thresholds to control successful attempts in k generate. - normalize_gold (callable, optional): Function to use to normalize the reference strings. - Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. - Defaults to None if no normalization is applied. - strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. - sample_scoring_function (callable or str, optional): Function to use to score each sample. - Either pass the full function (should take a string prediction and a string gold, and return a score between 0 and 1) - a string (any of `prefix`, `suffix` or `full`) to define the type of exact match that you want, or nothing to defaults to "full". 
- `prefix` checks if the prediction starts with the gold, - `suffix` if the prediction ends with the gold, - `full` if the prediction and gold are equal """ + super().__init__(kwargs) self.k = as_list(k) self.n = n - self.thresholds = thresholds - self.normalize_gold = normalize_gold - self.normalize_pred = normalize_pred - self.strip_strings = strip_strings + self.attribute_must_be_set = ["k"] - # Managed the logic of the per prediction of sample scoring - if callable(sample_scoring_function): - self.score_sample = sample_scoring_function - self.type_exact_match = None - else: - if isinstance(sample_scoring_function, str): - if sample_scoring_function not in ["prefix", "suffix", "full"]: - raise ValueError( - f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead." - ) - self.type_exact_match = sample_scoring_function - else: - self.type_exact_match = "full" - self.score_sample = self.default_sample_scoring + self.thresholds = thresholds def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: """Computes the metric over a list of golds and predictions for one single item with possibly many samples. @@ -1434,7 +1319,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: elif len(predictions) < self.n: logger.warning(f"Number of predictions is less than {self.n} for G-Pass@k.") - processed_choices = [self.get_processed_gold(gold=g) for g in doc.choices] + processed_choices = [self.preprocess(gold=g) for g in doc.choices] new_doc = Doc( choices=processed_choices, query=doc.query, @@ -1443,7 +1328,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: all_scores = [] for pred in predictions[: self.n]: - cur_pred = self.get_processed_pred(pred=pred) + cur_pred = self.preprocess(pred=pred) new_model_response = ModelResponse( text=[cur_pred], ) @@ -1451,36 +1336,6 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: return self.g_pass_at_k(all_scores) - def get_processed_gold(self, gold: str) -> str: - if self.strip_strings: - gold = gold.strip() - - if self.normalize_gold: - gold = self.normalize_gold(gold) - - return gold - - def get_processed_pred(self, pred: str) -> str: - if not pred: - return "" - - if self.strip_strings: - pred = pred.strip() - - if self.normalize_pred: - pred = self.normalize_pred(pred) - - return pred - - def default_sample_scoring(self, doc: Doc, model_response: ModelResponse) -> int: - gold = doc.get_golds()[0] - pred = model_response.final_text[0] - if self.type_exact_match == "prefix": - return 1 if pred.startswith(gold) else 0 - if self.type_exact_match == "suffix": - return 1 if pred.endswith(gold) else 0 - return 1 if gold == pred else 0 - def g_pass_at_k(self, all_scores: list[int]) -> float: """Computation of G-Pass@k details from http://arxiv.org/abs/2412.13147""" c: int = sum(all_scores) @@ -1527,3 +1382,6 @@ def all_metrics(self): metrics.append(f"mG-Pass@{k}") return metrics + + def num_samples(self): + return self.n if self.n is not None else self.k diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py index 2b99483b7..ad9338e76 100644 --- a/src/lighteval/metrics/sample_preparator.py +++ b/src/lighteval/metrics/sample_preparator.py @@ -60,7 +60,11 @@ class PerplexityCorpusMetricInput(CorpusMetricInput): weights: list[int] -class GenerativePreparator: +class Preparator: + pass + + +class GenerativePreparator(Preparator): 
@staticmethod def prepare(doc: Doc, model_response: ModelResponse, **kwargs): """Prepares an individual generative example to the format expected by metrics computed at the corpus level (aggregated). @@ -77,7 +81,7 @@ def prepare(doc: Doc, model_response: ModelResponse, **kwargs): return GenerativeCorpusMetricInput(golds=golds, preds=predictions) -class LoglikelihoodPreparator: +class LoglikelihoodPreparator(Preparator): def __init__(self, is_single_token: bool = False): """Init. @@ -110,7 +114,7 @@ def prepare(self, doc: Doc, model_response: ModelResponse, **kwargs) -> LogprobC return LogprobCorpusMetricInput(golds=gold_ixs, preds=np.argmax(choices_logprob)) -class TargetPerplexityPreparator: +class TargetPerplexityPreparator(Preparator): def __init__(self, units_type: str) -> None: """Init. @@ -155,7 +159,7 @@ def prepare(self, doc: Doc, model_response: ModelResponse, **kwargs): return PerplexityCorpusMetricInput(logprobs=logprobs_flat, weights=self.count_units(reference_text_flat)) -class PerplexityPreparator: +class PerplexityPreparator(Preparator): def __init__(self, units_type: str) -> None: """Init. diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 78d30c59a..69a251afb 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -23,6 +23,7 @@ from dataclasses import dataclass from typing import Callable +from lighteval.metrics.sample_preparator import Preparator from lighteval.tasks.requests import SamplingMethod @@ -31,7 +32,7 @@ class Metric: metric_name: str higher_is_better: bool category: SamplingMethod - sample_level_fn: Callable + sample_level_fn: Callable | Preparator | object corpus_level_fn: Callable batched_compute: bool = False @@ -42,9 +43,16 @@ def get_doc(self): def compute( self, **kwargs ) -> dict: # result: Union[list[ModelResponse], ModelResponse], formatted_doc: Doc) -> dict: + if isinstance(self.sample_level_fn, Callable): + sample_level_fn = self.sample_level_fn + elif isinstance(self.sample_level_fn, Preparator): + sample_level_fn = self.sample_level_fn.prepare + else: + sample_level_fn = self.sample_level_fn.compute + if isinstance(self, MetricGrouping): - return self.sample_level_fn(**kwargs) # result, formatted_doc, - return {self.metric_name: self.sample_level_fn(**kwargs)} # result, formatted_doc, + return sample_level_fn(**kwargs) # result, formatted_doc, + return {self.metric_name: sample_level_fn(**kwargs)} # result, formatted_doc, @dataclass diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index ac12d104b..9376a5651 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -9840,8 +9840,7 @@ few_shots_select=None, generation_size=32768, metrics=[ - Metrics.math_pass_at_1_1n, - Metrics.math_pass_at_1_4n, + Metrics.pass_at_k_math, ], version=2, ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 42ba3408e..ffc1fa198 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -32,6 +32,7 @@ from pytablewriter import MarkdownTableWriter from lighteval.metrics.metrics import Metric, Metrics +from lighteval.metrics.metrics_sample import SamplingMetric from lighteval.tasks.prompt_manager import FewShotSampler from lighteval.tasks.requests import ( Doc, @@ -196,11 +197,9 @@ def __init__( # We assume num_samples always contains 1 (for base generative evals) self.num_samples = [1] for metric in 
self.metrics: - metric_names = as_list(metric.metric_name) - - for metric_name in metric_names: + if isinstance(metric.sample_level_fn, SamplingMetric): # Update the number of samples to generate using the information in the metric name - self.num_samples.append(extract_num_samples(metric_name)) + self.num_samples.append(metric.sample_level_fn.num_samples()) def get_first_possible_fewshot_splits(self, available_splits: ListLike[str]) -> str | None: """ @@ -386,24 +385,3 @@ def download_dataset_worker( # It returns DatasetDict because we don't specify a split return dataset # type: ignore - - -def extract_num_samples(metric_name: str) -> int: - """Gets the number of samples to generate from the metric name. - Assumes that any metric with @ in it's name depends on the number of samples. - - Args: - metric_name (str): The metric name in the task. - - Returns: - int: The number of samples to generate. - """ - if "@" in metric_name: - metric_name = metric_name.split("@")[-1] - if "_" in metric_name: - metric_name = metric_name.split("_")[0] - if ":" in metric_name: - return int(metric_name.split(":")[-1]) - else: - return int(metric_name) - return 1 diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 01c43e942..a0ce627b9 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import ast import collections import copy import importlib @@ -95,22 +96,33 @@ def get_tasks_configs(self, task: str) -> list[LightevalTaskConfig]: returns a LightevalTaskConfig object based on the task name and fewshot and truncate_few_shots values. """ - task_info_dict = self.taskinfo_selector(task) + task_to_params = self.taskinfo_selector(task) configs = [] - for task_name, task_info in task_info_dict.items(): - # We can have multiple few_shot and truncate_few_shots values for the same task - for task_info_dict in task_info: + for task_name, task_param in task_to_params.items(): + # We can have multiple versions of the same task running (for ex, different few shots, different metric params, etc) + for subtask_param in task_param: config = self.task_registry.get(task_name) - if config is not None: - config = copy.deepcopy(config) - config.num_fewshots = task_info_dict["fewshots"] - config.truncate_fewshots = task_info_dict["truncate_fewshots"] - config.full_name = f"{task_name}|{config.num_fewshots}" - configs.append(config) - else: + if config is None: raise ValueError(f"Cannot find task {task_name} in task list or in custom task registry") + config = copy.deepcopy(config) + config.num_fewshots = subtask_param["fewshots"] + config.truncate_fewshots = subtask_param["truncate_fewshots"] + config.full_name = f"{task_name}|{config.num_fewshots}" + # If some tasks are parametrizable and in cli, we set attributes here + for metric in [m for m in config.metrics if "@" in m.metric_name]: # parametrizable metric + for attribute, value in subtask_param["metric_params"].items(): + setattr(metric.sample_level_fn, attribute, value) + if hasattr(metric.sample_level_fn, "attribute_must_be_set"): + for attribute in metric.sample_level_fn.attribute_must_be_set: + if getattr(metric.sample_level_fn, attribute) is None: + raise ValueError( + f"Metric {metric.metric_name} for task {task_name} was not correctly parametrized. Forgot to set {attribute}." 
+ ) + + configs.append(config) + return configs @property @@ -173,7 +185,7 @@ def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]: Returns: dict[str, list[dict]]: A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values. """ - few_shot_dict = collections.defaultdict(list) + task_to_params = collections.defaultdict(list) # We can provide a path to a file with a list of tasks or a string of comma-separated tasks if os.path.exists(tasks): @@ -193,8 +205,15 @@ def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]: expanded_tasks_list.extend(expanded_tasks) for task in expanded_tasks_list: + metric_params_dict = {} try: suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|")) + if "@" in task_name: + task_name, metric_params = task_name.split("@") + # We convert k:v,k2:v2 to {"k": "v", "k2": "v2"}, then to correct type + metric_params_dict = dict(item.split("=") for item in metric_params.split(",") if item) + metric_params_dict = {k: ast.literal_eval(v) for k, v in metric_params_dict.items()} + truncate_few_shots = int(truncate_few_shots) except ValueError: raise ValueError( @@ -215,9 +234,15 @@ def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]: # This adds support for task supersets (eg: mmlu -> all the mmlu tasks) for expanded_task in self.expand_task_definition(f"{suite_name}|{task_name}"): # Store few_shot info for each task name (suite|task) - few_shot_dict[expanded_task].append({"fewshots": few_shot, "truncate_fewshots": truncate_few_shots}) + task_to_params[expanded_task].append( + { + "fewshots": few_shot, + "truncate_fewshots": truncate_few_shots, + "metric_params": metric_params_dict, + } + ) - return few_shot_dict + return task_to_params @property @lru_cache From 8eece33041da3a497aab7cd938e4d5b01a29c266 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 18 Aug 2025 15:44:01 +0000 Subject: [PATCH 02/38] rm useless case --- tests/tasks/test_lighteval_task.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/tasks/test_lighteval_task.py b/tests/tasks/test_lighteval_task.py index d338b8a76..df2b5ad4a 100644 --- a/tests/tasks/test_lighteval_task.py +++ b/tests/tasks/test_lighteval_task.py @@ -20,9 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-import pytest
-from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig, extract_num_samples
+from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
 from lighteval.tasks.requests import Doc
@@ -64,18 +63,3 @@ def test_dataset_filter():
     filtered_docs = task.eval_docs()
     assert len(filtered_docs) == 1
     assert filtered_docs[0].query == "hi"
-
-
-@pytest.mark.parametrize(
-    "metric_name, expected",
-    [
-        ("maj@1", 1),
-        ("pass@1:32_samples", 32),
-        ("pass@10:64_samples", 64),
-        ("codegen_pass@1:16", 16),
-        ("other_name@2", 2),
-        ("other_name", 1),
-    ],
-)
-def test_extract_num_samples(metric_name, expected):
-    assert extract_num_samples(metric_name) == expected
From 8c5e5fb37c61e63e17d640e43adf88b0f44c5f16 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Mon, 18 Aug 2025 15:52:17 +0000
Subject: [PATCH 03/38] updated tests
---
 tests/tasks/test_registry.py | 43 +++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 8 deletions(-)
diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py
index 0f51a88b5..f7f05d3b7 100644
--- a/tests/tasks/test_registry.py
+++ b/tests/tasks/test_registry.py
@@ -53,8 +53,8 @@ def test_custom_task_groups():
     assert set(task_info.keys()) == {"custom|test_task_revision"}
     assert task_info["custom|test_task_revision"] == [
-        {"fewshots": 0, "truncate_fewshots": False},
-        {"fewshots": 1, "truncate_fewshots": False},
+        {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}},
+        {"fewshots": 1, "truncate_fewshots": False, "metric_params": {}},
     ]
@@ -66,7 +66,7 @@ def test_custom_tasks():
     task_info = registry.taskinfo_selector("custom|test_task_revision|0|0")
     assert list(task_info.keys()) == ["custom|test_task_revision"]
-    assert task_info["custom|test_task_revision"] == [{"fewshots": 0, "truncate_fewshots": False}]
+    assert task_info["custom|test_task_revision"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}]
 def test_superset_expansion():
@@ -78,9 +78,9 @@ def test_superset_expansion():
     task_info = registry.taskinfo_selector("lighteval|storycloze|0|0")
     assert list(task_info.keys()) == ["lighteval|storycloze:2016", "lighteval|storycloze:2018"]
-    assert task_info["lighteval|storycloze:2016"] == [{"fewshots": 0, "truncate_fewshots": False}] and task_info[
-        "lighteval|storycloze:2018"
-    ] == [{"fewshots": 0, "truncate_fewshots": False}]
+    assert task_info["lighteval|storycloze:2016"] == [
+        {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}
+    ] and task_info["lighteval|storycloze:2018"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}]
 def test_superset_with_subset_task():
@@ -95,11 +95,38 @@ def test_superset_with_subset_task():
     assert len(task_info.keys()) == 57
     # Since it's defined twice
     assert task_info["original|mmlu:abstract_algebra"] == [
-        {"fewshots": 3, "truncate_fewshots": False},
-        {"fewshots": 5, "truncate_fewshots": False},
+        {
+            "fewshots": 3,
+            "truncate_fewshots": False,
+            "metric_params": {},
+        },
+        {"fewshots": 5, "truncate_fewshots": False, "metric_params": {}},
     ]
+def test_cli_sampling_params():
+    """
+    Tests that the task info selector correctly parses metric parameters passed from the CLI.
+    """
+    registry = Registry()
+
+    task_info = registry.taskinfo_selector("lighteval|math_500@k=1|0|0")
+
+    assert list(task_info.keys()) == ["lighteval|math_500"]
+    assert task_info["lighteval|math_500"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {"k": 1}}]
+
+
+def test_cli_sampling_params_fail():
+    """
+    Tests that task config creation fails when the CLI metric parameters are malformed.
+    """
+    registry = Registry()
+
+    # creation of object should fail
+    with pytest.raises(ValueError):
+        registry.taskinfo_selector("lighteval|math_500@|0|0")
+
+
 def test_task_group_expansion_with_subset_expansion():
     """
     Tests that task info selector correctly handles a group with task superset is provided.
From ed0a02bd92f829a569933546600849e32ef70a87 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Tue, 19 Aug 2025 08:02:30 +0000
Subject: [PATCH 04/38] fix test
---
 tests/tasks/test_registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py
index f7f05d3b7..c67ce1eae 100644
--- a/tests/tasks/test_registry.py
+++ b/tests/tasks/test_registry.py
@@ -124,7 +124,7 @@ def test_cli_sampling_params_fail():
     # creation of object should fail
     with pytest.raises(ValueError):
-        registry.taskinfo_selector("lighteval|math_500@|0|0")
+        registry.get_tasks_configs("lighteval|math_500@|0|0")
From 73947068af9bb3ea3a119f166852ccf35326d59d Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Tue, 19 Aug 2025 08:22:01 +0000
Subject: [PATCH 05/38] added conversion for normalizations
---
 src/lighteval/metrics/metrics_sample.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 615f311f6..8b7cf344e 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -24,6 +24,7 @@ using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category.
 """
+import inspect
 import logging
 import os
 from typing import Callable, Literal, Union
@@ -1078,11 +1079,22 @@ class SamplingMetric:
     def __init__(
         self,
-        normalize: Callable | None = None,
+        normalize: Callable | str | None = None,
         strip_strings: bool = False,
         sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None,
     ):
-        self.normalize = normalize
+        if isinstance(normalize, str):
+            import lighteval.metrics.normalizations
+
+            allowed_normalizations = dict(
+                inspect.getmembers(lighteval.metrics.normalizations, inspect.isfunction)
+            )  # -> {name: fn}
+            if normalize in allowed_normalizations:
+                self.normalize = allowed_normalizations[normalize]
+            else:
+                raise ValueError(f"Unknown normalization function: {normalize}")
+        else:
+            self.normalize = normalize
         self.strip_strings = strip_strings
From 732c48877f1c7ca9735e431d1130018d0756b958 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Tue, 19 Aug 2025 10:10:45 +0000
Subject: [PATCH 06/38] first pass transforming Hynek's metric functions into classes like they should have been
---
 ...ntributing-to-multilingual-evaluations.mdx | 6 +-
 .../custom_yourbench_task_mcq.py | 4 +-
 src/lighteval/logging/info_loggers.py | 7 +-
 src/lighteval/metrics/__init__.py | 4 +-
 src/lighteval/metrics/dynamic_metrics.py | 365 +++++++++--------
 src/lighteval/metrics/metrics.py | 117 +++---
 src/lighteval/metrics/metrics_corpus.py | 17 +-
 src/lighteval/metrics/normalizations.py | 3 -
 src/lighteval/metrics/utils/metric_utils.py | 16 +-
 .../tasks/extended/olympiade_bench/main.py | 4 +-
 src/lighteval/tasks/lighteval_task.py | 5 +-
 src/lighteval/tasks/multilingual/tasks.py | 386 +++++++++---------
 .../tasks/multilingual/utils/task_utils.py | 4 +-
 tests/metrics/test_extractive_match.py | 6 +-
 tests/metrics/test_metric_requests.py | 8 +-
 tests/test_unit_base_metrics.py | 128 +++---
 16 files changed, 543 insertions(+), 537 deletions(-)
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
index 4db1c935b..828a9f471 100644
--- a/docs/source/contributing-to-multilingual-evaluations.mdx
+++ b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -64,9 +64,9 @@ your_tasks = [
         metric=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=None),
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=None),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
         # In this function, you choose which template to follow and for which language and formulation
diff --git a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py
index 961fa264b..2697380ad 100644
--- a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py
+++ b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py
@@ -25,7 +25,7 @@
 from aenum import extend_enum
-from lighteval.metrics.dynamic_metrics import multilingual_extractive_match_metric
+from lighteval.metrics.dynamic_metrics import DynamicMultilingualExtractiveMatch
 from lighteval.metrics.metrics import Metrics
 from lighteval.metrics.utils.extractive_match_utils import IndicesExtractionConfig
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
@@ -74,7 +74,7 @@ def yourbench_prompt(line, task_name: str = ""):
 )
-yourbench_metrics = multilingual_extractive_match_metric(
+yourbench_metrics = DynamicMultilingualExtractiveMatch(
     language=Language.ENGLISH,
     gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
     pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index ab261485b..da7a07c15 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -360,13 +360,13 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                     # The metric is in a subset which has already been computed and saved
                     continue
+                aggregation = task.aggregation()[metric_name]
+
                 try:
-                    metric_result = task.aggregation()[metric_name](metric_values)
+                    metric_result = aggregation(metric_values)
                 except OverflowError:
                     logger.warning(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.")
                     metric_result = float("nan")
-                except KeyError:
-                    continue
                 if isinstance(metric_result, dict):  # For some corpus level grouping metrics
                     self.metric_aggregated[task_name].update(metric_result)
@@ -379,7 +379,6 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                         None  # We skip stderr for some corpus metrics that return dicts, or if bootstrap_iters is 0
                     )
                 else:
-                    aggregation = task.aggregation()[metric_name]
                     stderr = get_stderr_function(aggregation=aggregation, number_experiments=bootstrap_iters)
                     if stderr is not None and len(metric_values) > 1:
                         try:
diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py
index 2a4b6d4c2..17ec0ede7 100644
--- a/src/lighteval/metrics/__init__.py
+++ b/src/lighteval/metrics/__init__.py
@@ -32,7 +32,7 @@ def apply_metric(responses: list[ModelResponse], docs: list[Doc], metrics: list[
     if metric.batched_compute:
         outputs_per_metrics: list = []
-        outputs_per_metrics.append(metric.compute(responses=responses, docs=docs))
+        outputs_per_metrics.append(metric.compute_sample(responses=responses, docs=docs))
         # We merge the outputs per metric in a list of dict for each sample
         # example: [{metric1_sample1, metric2_sample1}, {metric1_sample2, metric2_sample2}]
@@ -47,7 +47,7 @@ def apply_metric(responses: list[ModelResponse], docs: list[Doc], metrics: list[
         output = {}
         for metric in metrics:
             output.update(
-                metric.compute(
+                metric.compute_sample(
                     model_response=model_response,
                     doc=doc,
                 )
diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py
index 745d606ec..c9b8d73bc 100644
--- a/src/lighteval/metrics/dynamic_metrics.py
+++ b/src/lighteval/metrics/dynamic_metrics.py
@@ -56,171 +56,178 @@
 logger = logging.getLogger(__name__)
-def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None) -> SampleLevelMetric:
-    """
-    Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization.
- """ - - normalization_str = f"_{normalization.name}" if normalization else "" - metric_name = f"acc{normalization_str}" - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def normalized_multi_choice_prob_metric( - normalization: LogProbNormalization | None = None, - aggregation_function: Callable[[np.ndarray], float] = np.max, -) -> SampleLevelMetric: - """ - Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized). - """ - - normalization_str = f"_{normalization.name}" if normalization else "" - metric_name = f"normalized_mc_prob{normalization_str}" - - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=NormalizedMultiChoiceProbability( - log_prob_normalization=normalization, aggregation_function=aggregation_function - ).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def probability_metric( - normalization: LogProbTokenNorm | None = None, - aggregation_function: Callable[[np.ndarray], float] = np.max, -) -> SampleLevelMetric: - """ - Creates a probability metric, which returns the probability of the gold choice given normalization. - """ - - normalization_str = f"_{normalization.name}" if normalization else "" - metric_name = f"prob{normalization_str}" - - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=Probability(normalization=normalization, aggregation_function=aggregation_function).compute, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def multilingual_quasi_f1_score_metric( - language: Language, aggregation_function: Callable[[list[float]], float] = max -) -> SampleLevelMetric: - """ - Creates a language-aware F1 score metric, which returns the F1 score. - - Args: - language: The language of the samples. - aggregation_function: Aggregation samples to use when multiple golds are present. - - Returns: - F1 score metric. - """ - metric_name = f"f1_{language.value}" - - multilang_normalizer = get_multilingual_normalizer(language) - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=F1_score( - normalize_gold=multilang_normalizer, - normalize_pred=multilang_normalizer, - aggregation_function=aggregation_function, - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def multilingual_quasi_exact_match_metric( - language: Language, - match_type: Literal["prefix", "suffix", "full"] = "full", - aggregation_function: Callable[[list[float]], float] = max, -) -> SampleLevelMetric: - """ - Creates a language-aware exact match metric, which returns the exact match score - Args: - language: The language of the samples. - match_type: The type of match to use - - "prefix": Prefixes must match - - "suffix": Suffixes must match - - "full": Full strings must match - aggregation_function: Aggregation samples to use when multiple golds are present. - Returns: - Exact match metric. 
- """ - metric_name = f"exact_match_{language.value}_{match_type}" - multilang_normalizer = get_multilingual_normalizer(language) - return SampleLevelMetric( - metric_name=metric_name, - sample_level_fn=ExactMatches( - normalize_gold=multilang_normalizer, - normalize_pred=multilang_normalizer, - aggregation_function=aggregation_function, - type_exact_match=match_type, - ).compute, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - -def multilingual_extractive_match_metric( - language: Language = Language.ENGLISH, - gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), - pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()), - aggregation_function: Callable[[list[float]], float] = max, - fallback_mode: Literal["no_fallback", "first_match"] = "first_match", - extraction_mode: Literal["first_match", "any_match"] = "any_match", - precision: int = 6, - timeout_seconds: int = 5, -) -> SampleLevelMetric: - """Creates a language-aware extractive match metric that extracts answers from the model's output. - - Known issues: - - If the task is to simplify an expression, the metric might overestimate the accuracy. This is because if the model doesn't output any anchor for the extraction (e.g final answer is..), - it's possible that the extracted prediction will be the expression to simplify. Because we do simplifications ourselves, it can thus happen that sympy will correctly simplify the expression, - thus it will match gold, despite model not doing anything. PRs to fix this are welcome. - - - There is currently no StringExtractionConfig, so if the gold is \boxed{\text{Friday}} and model outputs Friday it will not match, because nothing will be extracted. - - Args: - language: Language - The language of the samples. - gold_extraction_target: Sequence[ExtractionTarget] - Extraction targets to use for gold answers. Defaults to extracting simple math expressions. - pred_extraction_target: Sequence[ExtractionTarget] - Extraction targets to use for predictions. Defaults to extracting simple math expressions. - aggregation_function: Callable[[list[float]], float] - Function to aggregate scores when multiple golds/predictions are present. Defaults to max. - fallback_mode: Literal["no_fallback", "first_match"] - How to perform extraction. Defaults to "first_match". - - "no_fallback": Only use first successfully parsed matches - - "first_match": Use the first successfully parsed match + first match irregardless the parsing success - extraction_mode: Literal["first_match", "any_match"] - - "first_match": Only tries to extract the first regex match if it fails no other matches are tried - - "any_match": Tries to extract any regex match - - precision: int - Number of decimal places to use when comparing numerical values. Defaults to 6. - timeout_seconds: int - Timeout for the extraction (each attempt) and comparison. Defaults to 5. - - Returns: - A sample level metric that extracts and compares mathematical expressions. - - """ +class LogLikelihoodAccMetric(SampleLevelMetric): + def __init__(self, normalization: LogProbNormalization | None = None): + """ + Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization. 
+ """ + super().__init__( + metric_name="acc" + (f"_{normalization.name}" if normalization else ""), + sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization), + category=SamplingMethod.LOGPROBS, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class NormalizedMultiChoiceProbMetric(SampleLevelMetric): + def __init__( + self, + normalization: LogProbNormalization | None = None, + aggregation_function: Callable[[np.ndarray], float] = np.max, + ): + """ + Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized). + """ + super().__init__( + metric_name="normalized_mc_prob" + (f"_{normalization.name}" if normalization else ""), + sample_level_fn=NormalizedMultiChoiceProbability( + log_prob_normalization=normalization, aggregation_function=aggregation_function + ), + category=SamplingMethod.LOGPROBS, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class ProbabilityMetric(SampleLevelMetric): + def __init__( + self, + normalization: LogProbTokenNorm | None = None, + aggregation_function: Callable[[np.ndarray], float] = np.max, + ): + """ + Creates a probability metric, which returns the probability of the gold choice given normalization. + """ + super().__init__( + metric_name="prob" + (f"_{normalization.name}" if normalization else ""), + sample_level_fn=Probability( + normalization=normalization, aggregation_function=aggregation_function + ).compute, + category=SamplingMethod.LOGPROBS, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class MultilingualQuasiF1ScoreMetric(SampleLevelMetric): + def __init__(self, language: Language, aggregation_function: Callable[[list[float]], float] = max): + """ + Creates a language-aware F1 score metric, which returns the F1 score. + + Args: + language: The language of the samples. + aggregation_function: Aggregation samples to use when multiple golds are present. + """ + super().__init__( + metric_name=f"f1_{language.value}", + sample_level_fn=F1_score( + normalize_gold=get_multilingual_normalizer(language), + normalize_pred=get_multilingual_normalizer(language), + aggregation_function=aggregation_function, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class MultilingualQuasiExactMatchMetric(SampleLevelMetric): + def __init__( + self, + language: Language, + match_type: Literal["prefix", "suffix", "full"] = "full", + aggregation_function: Callable[[list[float]], float] = max, + ): + """ + Creates a language-aware exact match metric, which returns the exact match score + Args: + language: The language of the samples. + match_type: The type of match to use + - "prefix": Prefixes must match + - "suffix": Suffixes must match + - "full": Full strings must match + aggregation_function: Aggregation samples to use when multiple golds are present. + Returns: + Exact match metric. 
+ """ + super().__init__( + metric_name=f"exact_match_{language.value}_{match_type}", + sample_level_fn=ExactMatches( + normalize_gold=get_multilingual_normalizer(language), + normalize_pred=get_multilingual_normalizer(language), + aggregation_function=aggregation_function, + type_exact_match=match_type, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +class DynamicMultilingualExtractiveMatch(SampleLevelMetric): + def __init__( + self, + language: Language = Language.ENGLISH, + gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),), + pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()), + aggregation_function: Callable[[list[float]], float] = max, + fallback_mode: Literal["no_fallback", "first_match"] = "first_match", + extraction_mode: Literal["first_match", "any_match"] = "any_match", + precision: int = 6, + timeout_seconds: int = 5, + ): + """Creates a language-aware extractive match metric that extracts answers from the model's output. + + Known issues: + - If the task is to simplify an expression, the metric might overestimate the accuracy. This is because if the model doesn't output any anchor for the extraction (e.g final answer is..), + it's possible that the extracted prediction will be the expression to simplify. Because we do simplifications ourselves, it can thus happen that sympy will correctly simplify the expression, + thus it will match gold, despite model not doing anything. PRs to fix this are welcome. + + - There is currently no StringExtractionConfig, so if the gold is \boxed{\text{Friday}} and model outputs Friday it will not match, because nothing will be extracted. + + Args: + language: Language + The language of the samples. + gold_extraction_target: Sequence[ExtractionTarget] + Extraction targets to use for gold answers. Defaults to extracting simple math expressions. + pred_extraction_target: Sequence[ExtractionTarget] + Extraction targets to use for predictions. Defaults to extracting simple math expressions. + aggregation_function: Callable[[list[float]], float] + Function to aggregate scores when multiple golds/predictions are present. Defaults to max. + fallback_mode: Literal["no_fallback", "first_match"] + How to perform extraction. Defaults to "first_match". + - "no_fallback": Only use first successfully parsed matches + - "first_match": Use the first successfully parsed match + first match irregardless the parsing success + extraction_mode: Literal["first_match", "any_match"] + - "first_match": Only tries to extract the first regex match if it fails no other matches are tried + - "any_match": Tries to extract any regex match + + precision: int + Number of decimal places to use when comparing numerical values. Defaults to 6. + timeout_seconds: int + Timeout for the extraction (each attempt) and comparison. Defaults to 5. + + Returns: + A sample level metric that extracts and compares mathematical expressions. 
+ + """ + super().__init__( + metric_name="extractive_match", + sample_level_fn=self.compute, + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + self.language = language + self.gold_extraction_target = gold_extraction_target + self.pred_extraction_target = pred_extraction_target + self.aggregation_function = aggregation_function + self.fallback_mode = fallback_mode + self.extraction_mode = extraction_mode + self.precision = precision + self.timeout_seconds = timeout_seconds @timeout(2) def add_to_specifics_with_timeout( @@ -234,19 +241,23 @@ def add_to_specifics_with_timeout( ] formatted_doc.specific["extracted_golds"] = [str(gold) for golds in extracted_golds for gold in golds] - def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float: + def compute(self, doc: Doc, model_response: ModelResponse) -> float: golds = doc.get_golds() predictions = model_response.final_text - gold_extraction_regexes = get_extraction_regexes(doc, gold_extraction_target, language) - pred_extraction_regexes = get_extraction_regexes(doc, pred_extraction_target, language) + gold_extraction_regexes = get_extraction_regexes(doc, self.gold_extraction_target, self.language) + pred_extraction_regexes = get_extraction_regexes(doc, self.pred_extraction_target, self.language) extracted_predictions = [ - extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds) + extract_target_from_pred( + pred, pred_extraction_regexes, self.fallback_mode, self.extraction_mode, self.timeout_seconds + ) for pred in predictions ] extracted_golds = [ - extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds) + extract_target_from_pred( + gold, gold_extraction_regexes, self.fallback_mode, self.extraction_mode, self.timeout_seconds + ) for gold in golds ] @@ -262,16 +273,16 @@ def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float: # We have to use timeout because the sypmy to str conversion can be very slow try: - add_to_specifics_with_timeout(doc, extracted_predictions, extracted_golds) + self.add_to_specifics_with_timeout(doc, extracted_predictions, extracted_golds) except Exception: # noqa: E722 logger.warning("Timeout when adding extracted predictions and golds to specific") - return aggregation_function( + return self.aggregation_function( [ ( 1.0 if any( - compare_gold_target(gold, pred, precision, timeout_seconds=timeout_seconds) + compare_gold_target(gold, pred, self.precision, timeout_seconds=self.timeout_seconds) for gold in extracted_golds ) else 0.0 @@ -279,11 +290,3 @@ def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float: for pred in extracted_predictions ] ) - - return SampleLevelMetric( - metric_name="extractive_match", - sample_level_fn=sample_level_fn, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index a3b5834e5..5f3ae5a19 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -20,16 +20,15 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from typing import Callable import numpy as np from aenum import Enum from lighteval.metrics.dynamic_metrics import ( + DynamicMultilingualExtractiveMatch, ExprExtractionConfig, IndicesExtractionConfig, LatexExtractionConfig, - multilingual_extractive_match_metric, ) from lighteval.metrics.harness_compatibility.drop import drop_metrics from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics @@ -78,7 +77,6 @@ from lighteval.metrics.utils.metric_utils import ( CorpusLevelMetric, CorpusLevelMetricGrouping, - Metric, MetricGrouping, SampleLevelMetric, SampleLevelMetricGrouping, @@ -107,14 +105,14 @@ class Metrics(Enum): metric_name="bits_per_byte", sample_level_fn=PerplexityPreparator(units_type="bytes"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("bits_per_byte").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("bits_per_byte"), higher_is_better=False, ) bleu = CorpusLevelMetric( metric_name="bleu", sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("bleu").compute, + corpus_level_fn=CorpusLevelTranslationMetric("bleu"), higher_is_better=True, ) bleu_1 = SampleLevelMetric( @@ -143,21 +141,21 @@ class Metrics(Enum): metric_name="byte_perplexity", sample_level_fn=PerplexityPreparator(units_type="bytes"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity"), higher_is_better=False, ) chrf = CorpusLevelMetric( metric_name="chrf", sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("chrf").compute, + corpus_level_fn=CorpusLevelTranslationMetric("chrf"), higher_is_better=True, ) chrf_plus = CorpusLevelMetric( metric_name="chrf++", sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("chrf++").compute, + corpus_level_fn=CorpusLevelTranslationMetric("chrf++"), higher_is_better=True, ) copyright = SampleLevelMetricGrouping( @@ -183,7 +181,7 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - expr_gold_metric = multilingual_extractive_match_metric( + expr_gold_metric = DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, fallback_mode="first_match", precision=5, @@ -227,14 +225,14 @@ class Metrics(Enum): metric_name="f1", sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelF1Score(average="macro").compute, + corpus_level_fn=CorpusLevelF1Score(average="macro"), higher_is_better=True, ) f1_score_micro = CorpusLevelMetric( metric_name="f1", sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelF1Score(average="micro").compute, + corpus_level_fn=CorpusLevelF1Score(average="micro"), higher_is_better=True, ) faithfulness = SampleLevelMetric( @@ -246,7 +244,7 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - latex_gold_metric = multilingual_extractive_match_metric( + latex_gold_metric = DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, fallback_mode="first_match", precision=5, @@ -294,14 +292,14 @@ class Metrics(Enum): metric_name="loglikelihood_f1", sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(None).compute, + corpus_level_fn=CorpusLevelF1Score(None), 
higher_is_better=True, ) loglikelihood_f1_single_token = CorpusLevelMetric( metric_name="loglikelihood_f1", sample_level_fn=LoglikelihoodPreparator(is_single_token=True), category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(None).compute, + corpus_level_fn=CorpusLevelF1Score(None), higher_is_better=True, ) mcc = CorpusLevelMetric( @@ -329,12 +327,12 @@ class Metrics(Enum): avg_at_k_math = SampleLevelMetric( metric_name="avg@k", sample_level_fn=AvgAtK( - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -367,12 +365,12 @@ class Metrics(Enum): sample_level_fn=PassAtK( strip_strings=True, # Extracting mathematical expressions and latex expressions - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -381,12 +379,12 @@ class Metrics(Enum): pass_at_k_letters = SampleLevelMetric( metric_name="pass@k:n samples", sample_level_fn=PassAtK( - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -429,12 +427,12 @@ class Metrics(Enum): metric_name="math_avg@64", sample_level_fn=AvgAtK( k=64, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -448,12 +446,12 @@ class Metrics(Enum): n=1, strip_strings=True, # Extracting mathematical expressions and latex expressions - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -465,12 +463,12 @@ class Metrics(Enum): k=1, n=4, strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, 
gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -483,12 +481,12 @@ class Metrics(Enum): n=8, strip_strings=True, # Extracting mathematical expressions and latex expressions - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -500,12 +498,12 @@ class Metrics(Enum): k=1, n=16, strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -517,12 +515,12 @@ class Metrics(Enum): k=1, n=32, strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -534,12 +532,12 @@ class Metrics(Enum): k=1, n=64, strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -564,7 +562,7 @@ class Metrics(Enum): metric_name="mf1", sample_level_fn=LoglikelihoodPreparator(is_single_token=True), category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute, + corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3), higher_is_better=True, ) pass_at_1 = SampleLevelMetric( @@ -608,7 +606,7 @@ class Metrics(Enum): k=16, n=48, strip_strings=True, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, fallback_mode="first_match", precision=5, @@ -616,7 +614,7 @@ class Metrics(Enum): # Match boxed first before trying other regexes pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), aggregation_function=max, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), @@ -628,7 +626,7 @@ class Metrics(Enum): k=16, n=48, strip_strings=True, - 
sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, fallback_mode="first_match", precision=5, @@ -636,7 +634,7 @@ class Metrics(Enum): # Match boxed first before trying other regexes pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), aggregation_function=max, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), @@ -653,7 +651,7 @@ class Metrics(Enum): metric_name="ppl", sample_level_fn=None, # todo!!! category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("perplexity"), higher_is_better=True, ) prefix_exact_match = SampleLevelMetric( @@ -782,24 +780,24 @@ class Metrics(Enum): metric_name=["simpleqa_judge"], higher_is_better={"simpleqa_judge": True}, category=SamplingMethod.GENERATIVE, - sample_level_fn=JudgeLLMSimpleQA().compute, - batched_compute=True, + sample_level_fn=JudgeLLMSimpleQA(), corpus_level_fn={ "simpleqa_judge": np.mean, }, + batched_compute=True, ) target_perplexity = SampleLevelMetric( metric_name="ppl", sample_level_fn=TargetPerplexityPreparator(units_type="words"), category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelPerplexityMetric("perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("perplexity"), higher_is_better=False, ) ter = CorpusLevelMetric( metric_name="ter", sample_level_fn=GenerativePreparator(), category=SamplingMethod.GENERATIVE, - corpus_level_fn=CorpusLevelTranslationMetric("ter").compute, + corpus_level_fn=CorpusLevelTranslationMetric("ter"), higher_is_better=False, ) truthfulqa_mc_metrics = SampleLevelMetricGrouping( @@ -813,10 +811,10 @@ class Metrics(Enum): metric_name="word_perplexity", sample_level_fn=PerplexityPreparator(units_type="words"), category=SamplingMethod.PERPLEXITY, - corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute, + corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity"), higher_is_better=False, ) - gpqa_instruct_metric = multilingual_extractive_match_metric( + gpqa_instruct_metric = DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], @@ -827,12 +825,12 @@ class Metrics(Enum): sample_level_fn=PassAtK( k=1, n=1, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -843,12 +841,12 @@ class Metrics(Enum): sample_level_fn=PassAtK( k=1, n=4, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], 
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -859,12 +857,12 @@ class Metrics(Enum): sample_level_fn=PassAtK( k=1, n=8, - sample_scoring_function=lambda doc, model_response: multilingual_extractive_match_metric( + sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, - ).sample_level_fn(doc, model_response), + ), ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -884,21 +882,6 @@ def higher_is_better(): res[metric.value.metric_name] = metric.value.higher_is_better return res - @staticmethod - def corpus_level_fns(metrics: list[Metric]) -> dict[str, Callable]: - res = {} - for metric in metrics: - if isinstance(metric, MetricGrouping): - if isinstance(metric.corpus_level_fn, dict): - res.update(metric.corpus_level_fn) - else: - # Must make sure there is a caching implementation here - for m in metric.metric_name: - res[m] = metric.corpus_level_fn - else: - res[metric.metric_name] = metric.corpus_level_fn - return res - @staticmethod def all_metrics(): res = [] diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 030725a53..0ac99f764 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -44,6 +44,11 @@ logger = logging.getLogger(__name__) +class CorpusLevelComputation: + def compute_corpus(self): + raise NotImplementedError + + # General aggregations def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float: """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). @@ -59,7 +64,7 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float: return sklearn.metrics.matthews_corrcoef(golds, preds) -class CorpusLevelF1Score: +class CorpusLevelF1Score(CorpusLevelComputation): def __init__(self, average: str, num_classes: int = 2): """Stores the relevant parameters for the task's corpus level f1 score. @@ -74,7 +79,7 @@ def __init__(self, average: str, num_classes: int = 2): self.average = average self.num_classes = num_classes - def compute(self, items: list[LogprobCorpusMetricInput]): + def compute_corpus(self, items: list[LogprobCorpusMetricInput]): """Computes the metric score over all the corpus generated items, by using the scikit learn implementation.""" golds = [i.golds for i in items] preds = [i.preds for i in items] @@ -90,7 +95,7 @@ def compute(self, items: list[LogprobCorpusMetricInput]): return float(np.mean(f1s)) -class CorpusLevelTranslationMetric: +class CorpusLevelTranslationMetric(CorpusLevelComputation): def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""): """Stores the relevant parameters for a corpus level translation metric. 
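The hunks above, together with the metric_utils.py change further below, establish the new corpus-level contract: aggregators become objects subclassing CorpusLevelComputation, and get_corpus_aggregations keeps plain callables as-is while taking compute_corpus from such objects. A minimal, self-contained sketch of that dispatch follows; MeanScore and scores are hypothetical stand-ins, not lighteval code.

# Sketch only: mirrors the CorpusLevelComputation / get_corpus_aggregations
# pattern introduced in this patch, with hypothetical stand-ins.
from statistics import fmean
from typing import Callable


class CorpusLevelComputation:
    def compute_corpus(self, items):
        raise NotImplementedError


class MeanScore(CorpusLevelComputation):
    # Hypothetical corpus-level metric: averages per-sample scores.
    def compute_corpus(self, items):
        return fmean(items)


def resolve_aggregation(agg) -> Callable:
    # Same branching as Metric.get_corpus_aggregations: keep plain callables,
    # otherwise use the object's bound compute_corpus method.
    return agg if isinstance(agg, Callable) else agg.compute_corpus


scores = [1.0, 0.0, 1.0]
print(resolve_aggregation(fmean)(scores))        # 0.666...
print(resolve_aggregation(MeanScore())(scores))  # 0.666...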
@@ -112,7 +117,7 @@ def get_metric(self): else: raise ValueError(f"Unknown corpus level translation metric type : {self.metric_type}") - def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: + def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float: """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation.""" metric = self.get_metric() golds = [i.golds for i in items] @@ -127,7 +132,7 @@ def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: return float(metric.corpus_score(hypotheses=preds, references=golds).score) -class CorpusLevelPerplexityMetric: +class CorpusLevelPerplexityMetric(CorpusLevelComputation): def __init__(self, metric_type: str): """Stores the relevant parameter for a corpus level perplexity metric. Perplexity metrics compute more or less the same thing, which is a variation on the @@ -145,7 +150,7 @@ def __init__(self, metric_type: str): self.metric_type = metric_type - def compute(self, items: list[PerplexityCorpusMetricInput]): + def compute_corpus(self, items: list[PerplexityCorpusMetricInput]): """Computes the metric score over all the corpus generated items.""" logprobs = [i.logprobs for i in items] weights = [i.weights for i in items] diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py index 565a5e379..97c1bd384 100644 --- a/src/lighteval/metrics/normalizations.py +++ b/src/lighteval/metrics/normalizations.py @@ -426,8 +426,6 @@ class LogProbPMINorm: name: str = "norm_pmi" - pass - @dataclass class LogProbTokenNorm: @@ -437,7 +435,6 @@ class LogProbTokenNorm: """ name: str = "norm_token" - pass @dataclass diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 69a251afb..5b2f37f25 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -40,7 +40,7 @@ class Metric: def get_doc(self): return self.sample_level_fn.__doc__ - def compute( + def compute_sample( self, **kwargs ) -> dict: # result: Union[list[ModelResponse], ModelResponse], formatted_doc: Doc) -> dict: if isinstance(self.sample_level_fn, Callable): @@ -54,6 +54,20 @@ def compute( return sample_level_fn(**kwargs) # result, formatted_doc, return {self.metric_name: sample_level_fn(**kwargs)} # result, formatted_doc, + def get_corpus_aggregations(self) -> dict: + if isinstance(self, MetricGrouping): + corpus_level_fn = self.corpus_level_fn + else: + corpus_level_fn = {self.metric_name: self.corpus_level_fn} + + for name, item in corpus_level_fn.items(): + if isinstance(item, Callable): + corpus_level_fn[name] = item + else: + corpus_level_fn[name] = item.compute_corpus + + return corpus_level_fn + @dataclass class MetricGrouping(Metric): diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/extended/olympiade_bench/main.py index 090562a1b..369911969 100644 --- a/src/lighteval/tasks/extended/olympiade_bench/main.py +++ b/src/lighteval/tasks/extended/olympiade_bench/main.py @@ -22,9 +22,9 @@ from lighteval.metrics.dynamic_metrics import ( + DynamicMultilingualExtractiveMatch, ExprExtractionConfig, LatexExtractionConfig, - multilingual_extractive_match_metric, ) from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -200,7 +200,7 @@ def olympiad_bench_prompt(line, task_name: str = None): extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()] -metric = 
multilingual_extractive_match_metric( +metric = DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=extraction_targets, pred_extraction_target=extraction_targets, diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index ffc1fa198..4276511cf 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -333,7 +333,10 @@ def aggregation(self): Return a dict with metric name and its aggregation function for all metrics """ - return Metrics.corpus_level_fns(self.metrics) + aggregations = {} + for metric in self.metrics: + aggregations.update(metric.get_corpus_aggregations()) + return aggregations @staticmethod def load_datasets(tasks: dict[str, "LightevalTask"], dataset_loading_processes: int = 1) -> None: diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index d9d0fad0e..3db60b18b 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -27,9 +27,9 @@ from langcodes import standardize_tag from lighteval.metrics.dynamic_metrics import ( - loglikelihood_acc_metric, - multilingual_quasi_exact_match_metric, - multilingual_quasi_f1_score_metric, + LogLikelihoodAccMetric, + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, ) from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm @@ -83,9 +83,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), prompt_function=get_nli_prompt_function( @@ -138,9 +138,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), prompt_function=get_nli_prompt_function( @@ -218,9 +218,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -265,9 +265,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -321,9 +321,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + 
LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -366,9 +366,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -402,9 +402,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -437,9 +437,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -486,8 +486,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -536,8 +536,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), trust_dataset=True, @@ -588,8 +588,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -632,8 +632,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), trust_dataset=True, @@ -706,8 +706,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -740,8 +740,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -769,8 +769,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), + 
LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), ], ), ) @@ -797,8 +797,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -842,8 +842,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(language, "prefix"), - multilingual_quasi_f1_score_metric(language), + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), ), ) for language in [ @@ -886,8 +886,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(Language.GERMAN, "prefix"), - multilingual_quasi_f1_score_metric(Language.GERMAN), + MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.GERMAN), ), ) ] @@ -915,8 +915,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(Language.ITALIAN, "prefix"), - multilingual_quasi_f1_score_metric(Language.ITALIAN), + MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.ITALIAN), ), ) ] @@ -942,8 +942,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(Language.THAI, "prefix"), - multilingual_quasi_f1_score_metric(Language.THAI), + MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.THAI), ), ) ] @@ -967,8 +967,8 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=( - multilingual_quasi_exact_match_metric(Language.RUSSIAN, "prefix"), - multilingual_quasi_f1_score_metric(Language.RUSSIAN), + MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), ), generation_size=400, stop_sequence=("\n",), @@ -997,8 +997,8 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=( - multilingual_quasi_exact_match_metric(Language.PORTUGUESE, "prefix"), - multilingual_quasi_f1_score_metric(Language.PORTUGUESE), + MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE), ), generation_size=400, stop_sequence=("\n",), @@ -1026,8 +1026,8 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=( - multilingual_quasi_exact_match_metric(Language.SPANISH, "prefix"), - multilingual_quasi_f1_score_metric(Language.SPANISH), + MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.SPANISH), ), generation_size=400, stop_sequence=("\n",), @@ -1054,8 +1054,8 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=( - multilingual_quasi_exact_match_metric(Language.ARABIC, "prefix"), - multilingual_quasi_f1_score_metric(Language.ARABIC), + MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.ARABIC), ), generation_size=400, stop_sequence=("\n",), @@ -1081,8 +1081,8 @@ evaluation_splits=("test",), few_shots_split="validation", metrics=( - multilingual_quasi_exact_match_metric(Language.SWAHILI, "prefix"), - multilingual_quasi_f1_score_metric(Language.SWAHILI), + MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"), + 
MultilingualQuasiF1ScoreMetric(Language.SWAHILI), ), generation_size=400, stop_sequence=("\n",), @@ -1108,8 +1108,8 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=( - multilingual_quasi_exact_match_metric(Language.CHINESE, "prefix"), - multilingual_quasi_f1_score_metric(Language.CHINESE), + MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.CHINESE), ), generation_size=400, stop_sequence=("\n",), @@ -1136,8 +1136,8 @@ few_shots_split="train", generation_size=400, metrics=( - multilingual_quasi_exact_match_metric(Language.CHINESE, "prefix"), - multilingual_quasi_f1_score_metric(Language.CHINESE), + MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.CHINESE), ), stop_sequence=("\n",), ) @@ -1168,8 +1168,8 @@ hf_avail_splits=("test",), generation_size=400, metrics=( - multilingual_quasi_exact_match_metric(language, "prefix"), - multilingual_quasi_f1_score_metric(language), + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), ), stop_sequence=("\n",), ) @@ -1209,8 +1209,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(Language.FRENCH, "prefix"), - multilingual_quasi_f1_score_metric(Language.FRENCH), + MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.FRENCH), ), ) ] @@ -1235,8 +1235,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(Language.TURKISH, "prefix"), - multilingual_quasi_f1_score_metric(Language.TURKISH), + MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.TURKISH), ), ) ] @@ -1264,8 +1264,8 @@ generation_size=400, stop_sequence=("\n",), metrics=( - multilingual_quasi_exact_match_metric(language, "prefix"), - multilingual_quasi_f1_score_metric(language), + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), ), ) for language in [ @@ -1307,8 +1307,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1341,8 +1341,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1366,8 +1366,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1403,8 +1403,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(lang, "prefix"), - multilingual_quasi_f1_score_metric(lang), + MultilingualQuasiExactMatchMetric(lang, "prefix"), + MultilingualQuasiF1ScoreMetric(lang), ], ) for lang in [ @@ -1440,8 +1440,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - 
loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -1694,9 +1694,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1742,9 +1742,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1806,9 +1806,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1871,9 +1871,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -1957,9 +1957,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2013,9 +2013,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2045,9 +2045,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2152,9 +2152,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + 
LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2233,9 +2233,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2280,9 +2280,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2345,9 +2345,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2404,8 +2404,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2439,9 +2439,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -2489,10 +2489,10 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ] - + ([loglikelihood_acc_metric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore ), ) for subset in ["easy", "challenge"] @@ -2525,10 +2525,10 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ] - + ([loglikelihood_acc_metric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore ), ) for subset in ["easy", "challenge"] @@ -2553,8 +2553,8 @@ metrics=get_metrics_for_formulation( formulation, [ - 
loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), trust_dataset=True, @@ -2591,10 +2591,10 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ] - + ([loglikelihood_acc_metric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore ), ) for subset in ["easy", "challenge"] @@ -2651,8 +2651,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2724,8 +2724,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2887,8 +2887,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2921,8 +2921,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -2963,8 +2963,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3013,9 +3013,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -3072,8 +3072,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3111,8 +3111,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 
], ), ) @@ -3145,8 +3145,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3180,8 +3180,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3226,8 +3226,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), trust_dataset=True, @@ -3271,8 +3271,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3300,7 +3300,7 @@ few_shots_split="validation", generation_size=25, metrics=[ - multilingual_quasi_exact_match_metric(Language.CHINESE, "full"), + MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"), ], stop_sequence=("\n",), ) @@ -3325,7 +3325,7 @@ few_shots_split="train", generation_size=25, metrics=[ - multilingual_quasi_exact_match_metric(language, "full"), + MultilingualQuasiExactMatchMetric(language, "full"), ], stop_sequence=("\n",), ) @@ -3364,7 +3364,7 @@ few_shots_split="train", generation_size=25, metrics=[ - multilingual_quasi_exact_match_metric(language, "full"), + MultilingualQuasiExactMatchMetric(language, "full"), ], stop_sequence=("\n",), ) @@ -3436,9 +3436,9 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - loglikelihood_acc_metric(normalization=LogProbPMINorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), ], ), ) @@ -3527,8 +3527,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3564,8 +3564,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3599,8 +3599,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3637,8 +3637,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - 
loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3673,8 +3673,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3733,8 +3733,8 @@ metrics=get_metrics_for_formulation( formulation, [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), ) @@ -3778,9 +3778,9 @@ evaluation_splits=("test",), hf_avail_splits=["test"], metrics=[ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ) for language in [ @@ -3810,9 +3810,9 @@ evaluation_splits=("validation",), few_shots_split="train", metrics=[ - loglikelihood_acc_metric(normalization=None), - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ) for formulation in [ @@ -3863,12 +3863,12 @@ hf_avail_splits=["train"], stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(language, "prefix"), - multilingual_quasi_f1_score_metric(language), + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), ] if subset in ["entity", "long_answer", "short_phrase"] else [ - multilingual_quasi_exact_match_metric(language, "full"), + MultilingualQuasiExactMatchMetric(language, "full"), ], ) for subset in MKQA_TASK_TO_ID.keys() @@ -3920,8 +3920,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(lang, "prefix"), - multilingual_quasi_f1_score_metric(lang), + MultilingualQuasiExactMatchMetric(lang, "prefix"), + MultilingualQuasiF1ScoreMetric(lang), ], ) for lang in [ @@ -3955,8 +3955,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(Language.FRENCH, "prefix"), - multilingual_quasi_f1_score_metric(Language.FRENCH), + MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.FRENCH), ], ) ] @@ -3980,8 +3980,8 @@ generation_size=400, stop_sequence=("\n",), metrics=[ - multilingual_quasi_exact_match_metric(Language.RUSSIAN, "prefix"), - multilingual_quasi_f1_score_metric(Language.RUSSIAN), + MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), ], ) ] @@ -4075,7 +4075,7 @@ hf_subset=subset, evaluation_splits=("test",), few_shots_split="validation", - metrics=[multilingual_quasi_exact_match_metric(Language.ARABIC, "full"), loglikelihood_acc_metric()], + metrics=[MultilingualQuasiExactMatchMetric(Language.ARABIC, "full"), LogLikelihoodAccMetric()], generation_size=5, stop_sequence=("\n",), ) @@ -4102,7 +4102,7 @@ 
few_shots_split="valid", generation_size=5, stop_sequence=["\n"], - metrics=[multilingual_quasi_exact_match_metric(Language.FRENCH, "full"), loglikelihood_acc_metric()], + metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()], ) ] @@ -4125,7 +4125,7 @@ few_shots_split="train", generation_size=5, stop_sequence=["\n"], - metrics=[multilingual_quasi_exact_match_metric(language, "full"), loglikelihood_acc_metric()], + metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()], ) for language in [ Language.HINDI, diff --git a/src/lighteval/tasks/multilingual/utils/task_utils.py b/src/lighteval/tasks/multilingual/utils/task_utils.py index d8e73dac8..d439eed16 100644 --- a/src/lighteval/tasks/multilingual/utils/task_utils.py +++ b/src/lighteval/tasks/multilingual/utils/task_utils.py @@ -21,7 +21,7 @@ # SOFTWARE. -from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.templates.utils.formulation import Formulation, MCFFormulation @@ -37,6 +37,6 @@ def get_metrics_for_formulation(formulation: Formulation, metrics: list[Metric]) match formulation: # case MCFFormulation(choice_prefix="Letters"): - return [loglikelihood_acc_metric(normalization=None)] + return [LogLikelihoodAccMetric(normalization=None)] case _: return metrics diff --git a/tests/metrics/test_extractive_match.py b/tests/metrics/test_extractive_match.py index 603813ef5..b7fc65ce9 100644 --- a/tests/metrics/test_extractive_match.py +++ b/tests/metrics/test_extractive_match.py @@ -24,10 +24,10 @@ import sympy from lighteval.metrics.dynamic_metrics import ( + DynamicMultilingualExtractiveMatch, ExprExtractionConfig, IndicesExtractionConfig, LatexExtractionConfig, - multilingual_extractive_match_metric, ) from lighteval.metrics.utils.math_comparison import sympy_expr_eq from lighteval.models.model_output import ModelResponse @@ -66,12 +66,12 @@ def compare_strings( model_response = ModelResponse(text=[pred]) doc = Doc(choices=[gold, "", "", ""], query="", gold_index=0) - return multilingual_extractive_match_metric( + return DynamicMultilingualExtractiveMatch( language=language, gold_extraction_target=extraction_targets, pred_extraction_target=extraction_targets, precision=precision, - ).sample_level_fn( + ).compute( model_response=model_response, doc=doc, ) diff --git a/tests/metrics/test_metric_requests.py b/tests/metrics/test_metric_requests.py index b748f7363..7ceb94c68 100644 --- a/tests/metrics/test_metric_requests.py +++ b/tests/metrics/test_metric_requests.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import LogProbPMINorm from lighteval.metrics.utils.metric_utils import Metric @@ -69,7 +69,7 @@ def test_pmi_request(): ] ) - metric = loglikelihood_acc_metric(normalization=LogProbPMINorm()) + metric = LogLikelihoodAccMetric(normalization=LogProbPMINorm()) pmi_test_config = get_pmi_task(metrics=[metric]) task = LightevalTask(pmi_test_config) result = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"] @@ -93,7 +93,7 @@ def test_pmi_request_with_logprob_metric(): ] ) - metrics = [loglikelihood_acc_metric(normalization=LogProbPMINorm()), loglikelihood_acc_metric(normalization=None)] + metrics = [LogLikelihoodAccMetric(normalization=LogProbPMINorm()), LogLikelihoodAccMetric(normalization=None)] pmi_test_config = get_pmi_task(metrics=metrics) task = LightevalTask(pmi_test_config) result = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"] @@ -126,7 +126,7 @@ def test_pmi_request_with_generative_metric(): ], ) - metrics = [loglikelihood_acc_metric(normalization=LogProbPMINorm()), Metrics.exact_match.value] + metrics = [LogLikelihoodAccMetric(normalization=LogProbPMINorm()), Metrics.exact_match.value] pmi_test_config = get_pmi_task(metrics=metrics) task = LightevalTask(pmi_test_config) results = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"] diff --git a/tests/test_unit_base_metrics.py b/tests/test_unit_base_metrics.py index 65302d127..575ebf595 100644 --- a/tests/test_unit_base_metrics.py +++ b/tests/test_unit_base_metrics.py @@ -24,11 +24,11 @@ import pytest from lighteval.metrics.dynamic_metrics import ( - loglikelihood_acc_metric, - multilingual_quasi_exact_match_metric, - multilingual_quasi_f1_score_metric, - normalized_multi_choice_prob_metric, - probability_metric, + LogLikelihoodAccMetric, + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, + NormalizedMultiChoiceProbMetric, + ProbabilityMetric, ) from lighteval.metrics.metrics_sample import ExactMatches from lighteval.metrics.normalizations import LogProbCharNorm, helm_normalizer @@ -196,86 +196,88 @@ def test_prob(self): # Simple case model_response = ModelResponse(logprobs=np.log([0.7])) - prob_metric = probability_metric() - result = prob_metric.sample_level_fn(doc, model_response) - assert result == pytest.approx(0.7) + prob_metric = ProbabilityMetric() + result = prob_metric.compute_sample(doc=doc, model_response=model_response) + assert result[prob_metric.metric_name] == pytest.approx(0.7) # Aggregation function test model_response = ModelResponse(logprobs=np.log([0.7, 0.1])) - prob_min_metric = probability_metric(aggregation_function=np.min) - result = prob_min_metric.sample_level_fn(doc, model_response) - assert result == pytest.approx(0.1) + prob_min_metric = ProbabilityMetric(aggregation_function=np.min) + result = prob_min_metric.compute_sample(doc=doc, model_response=model_response) + assert result[prob_metric.metric_name] == pytest.approx(0.1) def test_mc_probability_metric(self): doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.35, 0.1, 0.05])) - mc_prob_metric = normalized_multi_choice_prob_metric() + mc_prob_metric = NormalizedMultiChoiceProbMetric() - result = 
mc_prob_metric.sample_level_fn( - doc, - model_response, + result = mc_prob_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == pytest.approx(0.7) + assert result[mc_prob_metric.metric_name] == pytest.approx(0.7) doc = Doc(query="Test query", choices=["AA", "BB", "CCC"], gold_index=1, task_name="test") model_response = ModelResponse(logprobs=np.log([0.1**2, 0.35**2, 0.05**3])) - prob_norm_metric = normalized_multi_choice_prob_metric(normalization=LogProbCharNorm()) - result = prob_norm_metric.sample_level_fn( - doc, - model_response, + prob_norm_metric = NormalizedMultiChoiceProbMetric(normalization=LogProbCharNorm()) + result = prob_norm_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == pytest.approx(0.7) + assert result[prob_norm_metric.metric_name] == pytest.approx(0.7) def test_acc(self): # Test without normalization doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.7, 0.2, 0.3, 0.4])) - acc_metric = loglikelihood_acc_metric() - result = acc_metric.sample_level_fn( - doc, - model_response, + acc_metric = LogLikelihoodAccMetric() + result = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 1 # The highest logprob (3.0) is at index 3, which is not in gold_ixs + assert result[acc_metric.metric_name] == 1 # The highest logprob (0.7) is at index 0, which is in gold_ixs # Test 0 acc doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.1, 0.2, 0.3, 0.4])) - result = acc_metric.sample_level_fn( - doc, - model_response, + result = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 0 + assert result[acc_metric.metric_name] == 0 # Test with normalization doc = Doc(query="Test query", choices=["ABCDE", "AB"], gold_index=0, task_name="test") model_response = ModelResponse(logprobs=np.log([0.5, 0.6])) - acc_norm_metric = loglikelihood_acc_metric(normalization=LogProbCharNorm()) - result_norm = acc_norm_metric.sample_level_fn( - doc, - model_response, + acc_norm_metric = LogLikelihoodAccMetric(normalization=LogProbCharNorm()) + result_norm = acc_norm_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result_norm == 1 # After normalization, "ABCDE" should have the highest score + assert ( + result_norm[acc_norm_metric.metric_name] == 1 + ) # After normalization, "ABCDE" should have the highest score # Test with multiple correct solutions doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 3], task_name="test") model_response = ModelResponse(logprobs=np.log([0.5, 0.6, 0.7, 0.8])) - result_multi = acc_metric.sample_level_fn( - doc, - model_response, + result_multi = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result_multi == 1 + assert result_multi[acc_metric.metric_name] == 1 # Test when the highest logprob is not in gold_ixs doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 2], task_name="test") model_response = ModelResponse(logprobs=[0.5, 0.6, 0.7, 0.8]) - result_incorrect = acc_metric.sample_level_fn( - doc, - model_response, + result_incorrect = acc_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result_incorrect == 0 + assert result_incorrect[acc_metric.metric_name] == 0 def test_f1_dynamic_metric(self): """ @@ -286,21
+288,21 @@ def test_f1_dynamic_metric(self): model_response = ModelResponse(text=["hello, the world"]) # Normalization test - f1_metric = multilingual_quasi_f1_score_metric(language=Language.ENGLISH) - result = f1_metric.sample_level_fn( - doc, - model_response, + f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH) + result = f1_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 1 + assert result[f1_metric.metric_name] == 1 model_response = ModelResponse(text=["hello, the world how"]) - f1_metric = multilingual_quasi_f1_score_metric(language=Language.ENGLISH, aggregation_function=np.min) - result = f1_metric.sample_level_fn( - doc, - model_response, + f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH, aggregation_function=np.min) + result = f1_metric.compute_sample( + doc=doc, + model_response=model_response, ) # 2 * (precision * recall) / (precision + recall) = 2 * (1 * 2/3) / (1 + 2/3) = 0.8 - assert result == 0.8 + assert result[f1_metric.metric_name] == 0.8 def test_exact_match_dynamic_metric(self): """ @@ -310,20 +312,20 @@ def test_exact_match_dynamic_metric(self): model_response = ModelResponse(text=["hello, the world"]) # Normalization test - em_metric = multilingual_quasi_exact_match_metric(language=Language.ENGLISH, match_type="full") - result = em_metric.sample_level_fn( - doc, - model_response, + em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full") + result = em_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 1 + assert result[em_metric.metric_name] == 1 model_response = ModelResponse(text=["hello, the world how"]) - em_metric = multilingual_quasi_exact_match_metric(language=Language.ENGLISH, match_type="full") - result = em_metric.sample_level_fn( - doc, - model_response, + em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full") + result = em_metric.compute_sample( + doc=doc, + model_response=model_response, ) - assert result == 0 + assert result[em_metric.metric_name] == 0 @pytest.mark.skip(reason="Need to understand what it does.") def test_pass_at_k_estimator(self): From c0654c76832b0718c89839043bfc3d8af9e379db Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 10:40:30 +0000 Subject: [PATCH 07/38] imports --- community_tasks/arabic_evals.py | 3 ++- community_tasks/turkic_evals.py | 4 ++-- src/lighteval/metrics/__init__.py | 2 +- src/lighteval/metrics/utils/metric_utils.py | 13 +++++++++++-- src/lighteval/tasks/default_tasks.py | 2 +- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 1d036fa83..cbfce2cc0 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -32,7 +32,8 @@ from typing import Any, Dict, List, Optional, Union from lighteval.metrics.llm_as_judge import JudgeLM -from lighteval.metrics.metrics import Metric, Metrics +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc, SamplingMethod diff --git a/community_tasks/turkic_evals.py b/community_tasks/turkic_evals.py index 1a9caa380..82aadeafc 100644 --- a/community_tasks/turkic_evals.py +++ b/community_tasks/turkic_evals.py @@ -43,8 +43,8 @@ from typing import Any, Dict, List, 
Optional, Union from lighteval.metrics.llm_as_judge import JudgeLM -from lighteval.metrics.metrics import Metric, MetricCategory, Metrics -from lighteval.metrics.utils.metric_utils import MetricUseCase +from lighteval.metrics.metrics import MetricCategory, Metrics +from lighteval.metrics.utils.metric_utils import Metric, MetricUseCase from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 17ec0ede7..0bfce20c3 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -21,7 +21,7 @@ # SOFTWARE. -from lighteval.metrics.metrics import Metric +from lighteval.metrics.utils.metric_utils import Metric from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 5b2f37f25..d6adac109 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import Callable from lighteval.metrics.sample_preparator import Preparator @@ -33,7 +33,7 @@ class Metric: higher_is_better: bool category: SamplingMethod sample_level_fn: Callable | Preparator | object - corpus_level_fn: Callable + corpus_level_fn: Callable | object batched_compute: bool = False @@ -68,6 +68,15 @@ def get_corpus_aggregations(self) -> dict: return corpus_level_fn + def __call__(self, sample_params: dict | None): # , corpus_params: dict | None): + """Allow creating new instances with modified parameters""" + if sample_params: + self.sample_level_fn = replace(self.sample_level_fn, sample_params) + # Corpus params are unused for now, as the registry only expects sample level params + # if corpus_params: + # self.corpus_level_fn = replace(self.corpus_level_fn, corpus_params) + return self + @dataclass class MetricGrouping(Metric): diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 9376a5651..7ff3c0729 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -9840,7 +9840,7 @@ few_shots_select=None, generation_size=32768, metrics=[ - Metrics.pass_at_k_math, + Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), ], version=2, ) From a6e271a41beb5d96175b532767e3296a96a0aa6f Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 12:26:44 +0000 Subject: [PATCH 08/38] removed single token evals since we no longer have the feature, added a name update with parameter change --- docs/source/metric-list.mdx | 20 +- examples/custom_tasks_tests.py | 2 +- src/lighteval/metrics/metrics.py | 468 ++++---------------- src/lighteval/metrics/metrics_sample.py | 15 +- src/lighteval/metrics/utils/metric_utils.py | 26 +- src/lighteval/tasks/default_tasks.py | 387 ++++++++++------ src/lighteval/tasks/lighteval_task.py | 3 +- 7 files changed, 382 insertions(+), 539 deletions(-) diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx index 643c915d2..c1315fdb1 100644 --- a/docs/source/metric-list.mdx +++ b/docs/source/metric-list.mdx @@ -3,21 +3,18 @@ ## Automatic metrics for multiple-choice tasks These metrics use log-likelihood of the 
different possible targets. -- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`). -- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`). +- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct. +- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct. - `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored. -- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`). +- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection. - `mcc`: Matthew's correlation coefficient (a measure of agreement between statistical distributions). -- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`). -- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`). -- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`). +- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct. +- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct. +- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance. - `target_perplexity`: Perplexity of the different choices available. - `acc_golds_likelihood`: A bit different, it actually checks if the average logprob of a single target is above or below 0.5. - `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets. -All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall@2_single_token` and `mrr_single_token`).
When the multichoice option compares only one token (ex: "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using these metrics in the single token version will divide the time spent by the number of choices. Single token evals also include: -- `multi_f1_numeric`: Computes the f1 score of all possible choices and averages it. - ## Automatic metrics for perplexity and language modeling These metrics use log-likelihood of prompt. - `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words of the sequence. @@ -38,7 +35,7 @@ These metrics need the model to generate an output. They are therefore slower. - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation. - `f1_score_macro`: Corpus level macro F1 score. - `f1_score_macro`: Corpus level micro F1 score. - - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction. + - `maj_at_k`: Model majority vote. Samples k generations from the model and assumes the most frequent is the actual prediction. - Summarization: - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/). - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap. @@ -65,9 +62,8 @@ These metrics need the model to generate an output. They are therefore slower. - `edit_similarity`: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model generation and reference. - Math: - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed). - - `maj_at_4_math`: Majority choice evaluation, using the math normalisation for the predictions and gold. - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed). - - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalisation for the predictions and gold. + - `maj_at_k`: Majority choice evaluation can be applied with a math specific normalizer (gsm8k, math, etc) ## LLM-as-Judge - `llm_judge_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. 
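The documentation hunk above describes sampling metrics (`maj_at_k`, `pass_at_k`, `avg_at_k`, ...) that, after this series, are parameterized at task-definition time rather than shipped as fixed variants (`maj_at_5`, `maj_at_8`, ...). As a rough sketch of how this is meant to be used, not part of the patch itself, the snippet below parameterizes `maj@k` and scores a single sample; the query, responses, and `k` value are illustrative, and the keyword-argument call style follows the `Metrics.__call__(**kwargs)` definition added in this series, while other hunks pass a dict positionally (e.g. `Metrics.pass_at_k_math({"k": 1})`), so the final calling convention may still change.

```python
# Sketch only: exercising a parameterized sampling metric the way the updated
# unit tests in this series do. All concrete values here are illustrative.
from lighteval.metrics.metrics import Metrics
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc

# Parametrize maj@k: k is set on the underlying MajAtK sample function and the
# metric name is updated to reflect the chosen parameters.
maj_at_8 = Metrics.maj_at_k(k=8)

doc = Doc(query="What is 2 + 2?", choices=["4"], gold_index=0, task_name="demo")
model_response = ModelResponse(text=["4", "4", "5", "4", "4", "4", "3", "4"])

# compute_sample returns a {metric_name: value} dict, as in the updated tests.
scores = maj_at_8.compute_sample(doc=doc, model_response=model_response)
print(scores)
```

In task configs the same parameterized metrics are listed directly in `metrics=[...]`, as the `default_tasks.py` and `custom_tasks_tests.py` hunks later in this series show.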
diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py index 46b2f18ab..e30669fb4 100644 --- a/examples/custom_tasks_tests.py +++ b/examples/custom_tasks_tests.py @@ -53,7 +53,7 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.gpqa_instruct_pass_at_1_1n], + metrics=[Metrics.gpqa_instruct_pass_at_k({"k": 1})], stop_sequence=[], # no stop sequence, will use eos token trust_dataset=True, version=0, diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 5f3ae5a19..d14a73201 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -77,7 +77,6 @@ from lighteval.metrics.utils.metric_utils import ( CorpusLevelMetric, CorpusLevelMetricGrouping, - MetricGrouping, SampleLevelMetric, SampleLevelMetricGrouping, SamplingMethod, @@ -94,6 +93,27 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) + avg_at_k = SampleLevelMetric( + metric_name="avg@k", + sample_level_fn=AvgAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + avg_at_k_math = SampleLevelMetric( + metric_name="avg@k", + sample_level_fn=AvgAtK( + sample_scoring_function=DynamicMultilingualExtractiveMatch( + language=Language.ENGLISH, + gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + precision=6, + ), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) bert_score = SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip), @@ -244,14 +264,50 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - latex_gold_metric = DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(LatexExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, + g_pass_at_k = SampleLevelMetricGrouping( + metric_name="g-pass@k", + sample_level_fn=GPassAtK(strip_strings=True), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + g_pass_at_k_math = SampleLevelMetricGrouping( + metric_name="math-g-pass@k", + sample_level_fn=GPassAtK( + name_prefix="math", + strip_strings=True, + sample_scoring_function=DynamicMultilingualExtractiveMatch( + language=Language.ENGLISH, + fallback_mode="first_match", + precision=5, + gold_extraction_target=(ExprExtractionConfig(),), + # Match boxed first before trying other regexes + pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + aggregation_function=max, + ), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + g_pass_at_k_latex = SampleLevelMetricGrouping( + metric_name="latex-g-pass@k", + sample_level_fn=GPassAtK( + name_prefix="latex", + strip_strings=True, + sample_scoring_function=DynamicMultilingualExtractiveMatch( + language=Language.ENGLISH, + fallback_mode="first_match", + precision=5, + gold_extraction_target=(LatexExtractionConfig(),), + # Match boxed first before trying other regexes + pred_extraction_target=(ExprExtractionConfig(), 
LatexExtractionConfig(boxed_match_priority=0)), + aggregation_function=max, + ), + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, ) loglikelihood_acc = SampleLevelMetric( metric_name="acc", @@ -274,20 +330,6 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - loglikelihood_acc_norm_single_token = SampleLevelMetric( - metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - loglikelihood_acc_single_token = SampleLevelMetric( - metric_name="acc", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=None), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) loglikelihood_f1 = CorpusLevelMetric( metric_name="loglikelihood_f1", sample_level_fn=LoglikelihoodPreparator(), @@ -295,11 +337,11 @@ class Metrics(Enum): corpus_level_fn=CorpusLevelF1Score(None), higher_is_better=True, ) - loglikelihood_f1_single_token = CorpusLevelMetric( - metric_name="loglikelihood_f1", - sample_level_fn=LoglikelihoodPreparator(is_single_token=True), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(None), + maj_at_k = SampleLevelMetric( + metric_name="maj@k", + sample_level_fn=MajAtK(), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, higher_is_better=True, ) mcc = CorpusLevelMetric( @@ -309,48 +351,18 @@ class Metrics(Enum): corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) - mcc_single_token = CorpusLevelMetric( - metric_name="mcc", - sample_level_fn=LoglikelihoodPreparator(), + mrr = SampleLevelMetric( + metric_name="mrr", + sample_level_fn=MRR(), category=SamplingMethod.LOGPROBS, - corpus_level_fn=matthews_corrcoef, - higher_is_better=True, - ) - # NEW - avg_at_k = SampleLevelMetric( - metric_name="avg@k", - sample_level_fn=AvgAtK(strip_strings=True), - category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) - avg_at_k_math = SampleLevelMetric( - metric_name="avg@k", - sample_level_fn=AvgAtK( - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - g_pass_at_k = SampleLevelMetricGrouping( - metric_name=["g-pass@k:n samples"], - sample_level_fn=GPassAtK(strip_strings=True), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - maj_at_k = SampleLevelMetric( - metric_name="maj@k", - sample_level_fn=MajAtK(), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, + multi_f1_numeric = CorpusLevelMetric( + metric_name="mf1", + sample_level_fn=LoglikelihoodPreparator(is_single_token=True), + category=SamplingMethod.LOGPROBS, + corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3), higher_is_better=True, ) pass_at_k = SampleLevelMetric( @@ -390,266 +402,9 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - # OLD - maj_at_4_math = SampleLevelMetric( - metric_name="maj@4", - sample_level_fn=MajAtK( - k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - maj_at_5 = 
SampleLevelMetric( - metric_name="maj@5", - sample_level_fn=MajAtK(k=5), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - maj_at_8 = SampleLevelMetric( - metric_name="maj@8", - sample_level_fn=MajAtK(k=8), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - maj_at_8_gsm8k = SampleLevelMetric( - metric_name="maj@8", - sample_level_fn=MajAtK( - k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_avg_at_64 = SampleLevelMetric( - metric_name="math_avg@64", - sample_level_fn=AvgAtK( - k=64, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - math_pass_at_1_1n = SampleLevelMetric( - metric_name="math_pass@1:1_samples", - sample_level_fn=PassAtK( - k=1, - n=1, - strip_strings=True, - # Extracting mathematical expressions and latex expressions - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_4n = SampleLevelMetric( - metric_name="math_pass@1:4_samples", - sample_level_fn=PassAtK( - k=1, - n=4, - strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_8n = SampleLevelMetric( - metric_name="math_pass@1:8_samples", - sample_level_fn=PassAtK( - k=1, - n=8, - strip_strings=True, - # Extracting mathematical expressions and latex expressions - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_16n = SampleLevelMetric( - metric_name="math_pass@1:16_samples", - sample_level_fn=PassAtK( - k=1, - n=16, - strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_32n = SampleLevelMetric( - metric_name="math_pass@1:32_samples", - sample_level_fn=PassAtK( - k=1, - n=32, - strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - 
pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - math_pass_at_1_64n = SampleLevelMetric( - metric_name="math_pass@1:64_samples", - sample_level_fn=PassAtK( - k=1, - n=64, - strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - - mrr = SampleLevelMetric( - metric_name="mrr", - sample_level_fn=MRR(), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - mrr_single_token = SampleLevelMetric( - metric_name="mrr", - sample_level_fn=mrr, - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - multi_f1_numeric = CorpusLevelMetric( - metric_name="mf1", - sample_level_fn=LoglikelihoodPreparator(is_single_token=True), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3), - higher_is_better=True, - ) - pass_at_1 = SampleLevelMetric( - metric_name="pass@1:32_samples", - sample_level_fn=PassAtK(k=1, n=32, strip_strings=True), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - pass_at_10 = SampleLevelMetric( - metric_name="pass@10:32_samples", - sample_level_fn=PassAtK(k=10, n=32, strip_strings=True), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - pass_at_100 = SampleLevelMetric( - metric_name="pass@100:32_samples", - sample_level_fn=PassAtK(k=100, n=32, strip_strings=True), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - g_pass_at_16 = SampleLevelMetricGrouping( - metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - g_pass_at_8_16 = SampleLevelMetricGrouping( - metric_name=["G-Pass@8-16:48_samples"], - sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - g_pass_at_16_expr_gold = SampleLevelMetricGrouping( - metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK( - k=16, - n=48, - strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(ExprExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - g_pass_at_16_latex_gold = SampleLevelMetricGrouping( 
- metric_name=["G-Pass@16:48_samples"], - sample_level_fn=GPassAtK( - k=16, - n=48, - strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(LatexExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), - higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), - ) - perfect_exact_match = SampleLevelMetric( - metric_name="perfect_em", - sample_level_fn=ExactMatches(), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) prediction_perplexity = SampleLevelMetric( metric_name="ppl", - sample_level_fn=None, # todo!!! + sample_level_fn=PerplexityPreparator("words"), category=SamplingMethod.PERPLEXITY, corpus_level_fn=CorpusLevelPerplexityMetric("perplexity"), higher_is_better=True, @@ -708,20 +463,6 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - recall_at_1_single_token = SampleLevelMetric( - metric_name="acc", - sample_level_fn=Recall(at=1), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - recall_at_2_single_token = SampleLevelMetric( - metric_name="recall@2", - sample_level_fn=Recall(at=2), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) recall_at_1 = SampleLevelMetric( metric_name="acc", sample_level_fn=Recall(at=1), @@ -820,43 +561,9 @@ class Metrics(Enum): pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], precision=6, ) - gpqa_instruct_pass_at_1_1n = SampleLevelMetric( - metric_name="gpqa_pass@1:1_samples", + gpqa_instruct_pass_at_k = SampleLevelMetric( + metric_name="gpqa_pass@k", sample_level_fn=PassAtK( - k=1, - n=1, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - gpqa_instruct_pass_at_1_4n = SampleLevelMetric( - metric_name="gpqa_pass@1:4_samples", - sample_level_fn=PassAtK( - k=1, - n=4, - sample_scoring_function=DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, - ), - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - gpqa_instruct_pass_at_1_8n = SampleLevelMetric( - metric_name="gpqa_pass@1:8_samples", - sample_level_fn=PassAtK( - k=1, - n=8, sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], @@ -872,15 +579,10 @@ class Metrics(Enum): def __str__(self): return self.name.replace("_at_", "@") - @staticmethod - def higher_is_better(): - res = {} - for metric in Metrics: - if isinstance(metric.value, MetricGrouping): - 
res.update(metric.value.higher_is_better) - else: - res[metric.value.metric_name] = metric.value.higher_is_better - return res + def __call__(self, **kwargs): + # When parametrizing, we don't look at the Metrics enum, + # but at a specific single metric (a value) + return self.value(kwargs) @staticmethod def all_metrics(): diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 8b7cf344e..91d23ae91 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1132,6 +1132,9 @@ def default_sample_scoring(self, doc: Doc, model_response: ModelResponse) -> int return 1 if pred.endswith(gold) else 0 return 1 if gold == pred else 0 + def name_metrics(self) -> str | list[str]: + raise NotImplementedError + class AvgAtK(SamplingMetric): def __init__(self, k: int | None = None, **kwargs): @@ -1289,6 +1292,7 @@ def __init__( k: Union[int, list[int]] | None = None, n: int | None = None, thresholds: list[float] = [0.0, 0.25, 0.5, 0.75, 1.0], + name_prefix: str = None, **kwargs, ): """Computing G-Pass@k from http://arxiv.org/abs/2412.13147 @@ -1304,6 +1308,7 @@ def __init__( self.attribute_must_be_set = ["k"] self.thresholds = thresholds + self.name = (f"{name_prefix}_" if name_prefix else "") + "g-pass@" def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: """Computes the metric over a list of golds and predictions for one single item with possibly many samples. @@ -1377,21 +1382,21 @@ def compute_mg_pass_at_k(n, c, k): metrics = {} for k in ks: for t in thresholds: - metrics[f"G-Pass@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t) - metrics[f"mG-Pass@{k}"] = compute_mg_pass_at_k(n, c, k) + metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t) + metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k) return metrics @property - def all_metrics(self): + def metric_names(self): ks: int = self.k thresholds: list[float] = self.thresholds metrics = [] for k in ks: for t in thresholds: - metrics.append(f"G-Pass@{k}_{t}") - metrics.append(f"mG-Pass@{k}") + metrics.append(f"{self.name}@{k}_{t}") + metrics.append(f"m{self.name}@{k}") return metrics diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index d6adac109..75af6cd5c 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from dataclasses import dataclass, replace +from dataclasses import dataclass from typing import Callable from lighteval.metrics.sample_preparator import Preparator @@ -56,7 +56,10 @@ def compute_sample( def get_corpus_aggregations(self) -> dict: if isinstance(self, MetricGrouping): - corpus_level_fn = self.corpus_level_fn + if isinstance(self.corpus_level_fn, dict): + corpus_level_fn = self.corpus_level_fn + else: + corpus_level_fn = dict.fromkeys(self.metric_name, self.corpus_level_fn) else: corpus_level_fn = {self.metric_name: self.corpus_level_fn} @@ -68,13 +71,22 @@ def get_corpus_aggregations(self) -> dict: return corpus_level_fn - def __call__(self, sample_params: dict | None): # , corpus_params: dict | None): + def __call__(self, sample_params: dict | None): """Allow creating new instances with modified parameters""" if sample_params: - self.sample_level_fn = replace(self.sample_level_fn, sample_params) - # Corpus params are unused for now, as the registry only expects sample level params - # if corpus_params: - # self.corpus_level_fn = replace(self.corpus_level_fn, corpus_params) + for k, v in sample_params.items(): + setattr(self.sample_level_fn, k, v) + + # Once the parameters are updated, we need to adjust the + # metric name to what will be returned + if isinstance(self, MetricGrouping): + if hasattr(self.sample_level_fn, "metric_names"): + # this is mostly for the gpass@k metrics + self.metric_name = self.sample_level_fn.metric_names() + else: + self.metric_name = [metric + str(sample_params) for metric in self.metric_name] + else: + self.metric_name = self.metric_name + str(sample_params) return self diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 7ff3c0729..89c1cab3e 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -21,6 +21,9 @@ # SOFTWARE. 
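# A minimal sketch of what the reworked Metric.__call__ above does, assuming the
# ExactMatches sampler keeps a `strip_strings` attribute as in metrics_sample.py
# (the parametrized entries in default_tasks.py below reach it through the Metrics
# enum's own __call__ shown earlier):
#
#   em = Metrics.exact_match.value               # SampleLevelMetric wrapping ExactMatches
#   em({"strip_strings": False})                 # setattr(em.sample_level_fn, "strip_strings", False)
#   em.metric_name                               # -> "em{'strip_strings': False}" after the rename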
import lighteval.tasks.default_prompts as prompt from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import ( + math_normalizer, +) from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.templates.qa import get_qa_prompt_function from lighteval.utils.language import Language @@ -373,14 +376,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[ - Metrics.math_pass_at_1_1n, - Metrics.math_pass_at_1_4n, - Metrics.math_pass_at_1_8n, - Metrics.math_pass_at_1_16n, - Metrics.math_pass_at_1_32n, - Metrics.math_pass_at_1_64n, - ], + metrics=[Metrics.pass_at_k_math({"k": 1})], version=2, ) aime24_avg = LightevalTaskConfig( @@ -394,9 +390,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[ - Metrics.math_avg_at_64, - ], + metrics=[Metrics.avg_at_k_math({"k": 64})], version=2, ) aime24_gpassk = LightevalTaskConfig( @@ -410,7 +404,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_16_expr_gold], + metrics=[Metrics.g_pass_at_k_math({"k": 16, "n": 48})], version=1, ) aime25 = LightevalTaskConfig( @@ -424,14 +418,7 @@ few_shots_split=None, few_shots_select=None, generation_size=10000, - metrics=[ - Metrics.math_pass_at_1_1n, - # Metrics.math_pass_at_1_4n, - # Metrics.math_pass_at_1_8n, - # Metrics.math_pass_at_1_16n, - # Metrics.math_pass_at_1_32n, - # Metrics.math_pass_at_1_64n, - ], + metrics=[Metrics.pass_at_k_math({"k": 1, "n": 1})], version=2, ) aime25_gpassk = LightevalTaskConfig( @@ -445,7 +432,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_16_expr_gold], + metrics=[Metrics.g_pass_at_k_math({"k": 16, "n": 48})], version=1, ) anachronisms_bigbench = LightevalTaskConfig( @@ -811,7 +798,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -827,7 +814,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -891,7 +878,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=None, trust_dataset=True, version=0, @@ -1527,7 +1514,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1549,7 +1536,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1571,7 +1558,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1593,7 +1580,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], 
trust_dataset=True, @@ -1615,7 +1602,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1637,7 +1624,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1659,7 +1646,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1681,7 +1668,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1703,7 +1690,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1725,7 +1712,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1747,7 +1734,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1769,7 +1756,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1791,7 +1778,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1813,7 +1800,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1835,7 +1822,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1857,7 +1844,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1879,7 +1866,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1901,7 +1888,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1923,7 +1910,7 @@ Metrics.quasi_exact_match, 
Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1945,7 +1932,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1967,7 +1954,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1989,7 +1976,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2011,7 +1998,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2033,7 +2020,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2055,7 +2042,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2077,7 +2064,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2099,7 +2086,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2121,7 +2108,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2143,7 +2130,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2165,7 +2152,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2187,7 +2174,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2209,7 +2196,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2231,7 +2218,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), 
], stop_sequence=["\n"], trust_dataset=True, @@ -2253,7 +2240,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2275,7 +2262,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2297,7 +2284,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2319,7 +2306,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2341,7 +2328,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2363,7 +2350,7 @@ Metrics.quasi_exact_match, Metrics.prefix_exact_match, Metrics.prefix_quasi_exact_match, - Metrics.perfect_exact_match, + Metrics.exact_match({"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -5980,7 +5967,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6028,7 +6015,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6044,7 +6031,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6060,7 +6047,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6331,7 +6318,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6400,7 +6387,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match({"strip_strings": False})], stop_sequence=[".", ";", "!", "?"], trust_dataset=True, version=0, @@ -6416,7 +6403,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6727,7 +6714,7 @@ few_shots_split=None, few_shots_select=None, 
generation_size=10, - metrics=[Metrics.perfect_exact_match, Metrics.f1_score], + metrics=[Metrics.exact_match({"strip_strings": False}), Metrics.f1_score], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6814,7 +6801,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6910,7 +6897,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7044,7 +7031,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7679,7 +7666,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7711,7 +7698,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7920,9 +7907,9 @@ few_shots_select=None, generation_size=32768, # needed for reasoning models like R1 metrics=[ - Metrics.gpqa_instruct_pass_at_1_1n, - Metrics.gpqa_instruct_pass_at_1_4n, - Metrics.gpqa_instruct_pass_at_1_8n, + Metrics.gpqa_instruct_pass_at_k({"k": 1}), + Metrics.gpqa_instruct_pass_at_k({"k": 1, "n": 4}), + Metrics.gpqa_instruct_pass_at_k({"k": 1, "n": 8}), ], stop_sequence=[], # no stop sequence, will use eos token trust_dataset=True, @@ -8122,7 +8109,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -8470,7 +8457,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -8939,7 +8926,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9606,7 +9593,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9622,7 +9609,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=None, trust_dataset=True, version=0, @@ 
-9855,7 +9842,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_16_latex_gold], + metrics=[Metrics.g_pass_at_k_latex({"k": 16, "n": 48})], version=1, ) math_algebra_lighteval = LightevalTaskConfig( @@ -9869,7 +9856,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9885,7 +9882,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9901,7 +9908,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9917,7 +9934,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9933,7 +9960,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9949,7 +9986,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9965,7 +10012,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=1, @@ -9981,7 +10038,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], 
stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9997,7 +10064,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10013,7 +10090,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10029,7 +10116,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10045,7 +10142,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10061,7 +10168,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10077,7 +10194,17 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.quasi_exact_match_math, Metrics.maj_at_4_math], + metrics=[ + Metrics.quasi_exact_match_math, + Metrics.maj_at_k( + sample_params={ + "k": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10125,7 +10252,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -13570,7 +13697,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -13698,7 +13825,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14043,7 +14170,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14112,7 +14239,7 @@ 
few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14128,7 +14255,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14160,7 +14287,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=None, trust_dataset=True, version=0, @@ -14176,7 +14303,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14192,7 +14319,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14272,7 +14399,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14474,7 +14601,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleurt, Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleurt, Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14871,7 +14998,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14887,7 +15014,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.perfect_exact_match], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14967,7 +15094,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14999,7 +15126,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15015,7 +15142,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15063,7 +15190,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, 
version=0, @@ -15095,7 +15222,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15127,7 +15254,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15143,7 +15270,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.perfect_exact_match], + metrics=[Metrics.bleu, Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15175,7 +15302,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15363,7 +15490,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15767,7 +15894,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16669,7 +16796,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16685,7 +16812,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16701,7 +16828,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16717,7 +16844,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16733,7 +16860,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16749,7 +16876,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -21419,7 +21546,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -21435,7 +21562,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.perfect_exact_match], + metrics=[Metrics.exact_match({"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 4276511cf..bc3b040b7 100644 --- 
a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -31,8 +31,9 @@ from multiprocess import Pool from pytablewriter import MarkdownTableWriter -from lighteval.metrics.metrics import Metric, Metrics +from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import SamplingMetric +from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.prompt_manager import FewShotSampler from lighteval.tasks.requests import ( Doc, From 511d0e69b8e71c3fe181a7b1d061376c16598cde Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 13:01:02 +0000 Subject: [PATCH 09/38] keep on making metrics more adjustable --- community_tasks/aimo_evals.py | 3 +- community_tasks/arabic_evals.py | 39 +- community_tasks/french_evals.py | 6 +- community_tasks/serbian_eval.py | 97 +- community_tasks/turkic_evals.py | 11 +- docs/source/metric-list.mdx | 11 +- examples/nanotron/custom_evaluation_tasks.py | 92 +- src/lighteval/metrics/metrics.py | 99 +- src/lighteval/metrics/metrics_sample.py | 6 +- src/lighteval/tasks/default_tasks.py | 4621 +++++++++++++----- 10 files changed, 3605 insertions(+), 1380 deletions(-) diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 885ffd8da..f0a01c16c 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -26,6 +26,7 @@ """ from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import math_normalizer from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -49,7 +50,7 @@ def aimo_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="train", few_shots_select="sequential", - metric=[Metrics.quasi_exact_match_math], + metric=[Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})], generation_size=2048, stop_sequence=None, ) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index cbfce2cc0..68cb2a0f0 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -33,6 +33,7 @@ from lighteval.metrics.llm_as_judge import JudgeLM from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -104,7 +105,7 @@ def __init__( hf_subset=hf_subset, prompt_function=arabic_mmlu_pfn, hf_repo="MBZUAI/ArabicMMLU", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=["dev"], @@ -166,7 +167,7 @@ def __init__( hf_subset=hf_subset, prompt_function=arabic_mmlu_ht_pfn, hf_repo="MBZUAI/human_translated_arabic_mmlu", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=None, @@ -231,7 +232,7 @@ def __init__( hf_subset=hf_subset, prompt_function=arabic_mmlu_mt_pfn, hf_repo="OALL/Arabic_MMLU", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split="dev", 
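# Rough equivalences behind the substitutions in this patch, shown once for reference
# (illustrative only; the parameters are the ones these community suites pass in):
#
#   Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})
#       # takes over from the former loglikelihood_acc_norm alias
#   Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
#       # takes over from the former quasi_exact_match_math alias
#
# Each task now states its normalization explicitly instead of pointing at a pre-baked variant.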
@@ -287,7 +288,7 @@ def __init__( hf_subset=hf_subset, prompt_function=acva_pfn, hf_repo="OALL/ACVA", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -344,7 +345,7 @@ def __init__( hf_subset=hf_subset, prompt_function=aratrust_pfn, hf_repo="asas-ai/AraTrust-categorized", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split=None, @@ -393,7 +394,7 @@ def arabic_exams_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -444,7 +445,7 @@ def __init__( hf_subset=hf_subset, prompt_function=alghafa_pfn, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -471,7 +472,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -488,7 +489,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -505,7 +506,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -522,7 +523,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -539,7 +540,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -556,7 +557,7 @@ def __init__( evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -594,7 +595,7 @@ def boolq_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -629,7 +630,7 @@ 
def copa_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -673,7 +674,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -710,7 +711,7 @@ def toxigen_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -761,7 +762,7 @@ def sciq_arabic_pfn(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], trust_dataset=True, version=0, ) @@ -819,7 +820,7 @@ def __init__( hf_subset=hf_subset, prompt_function=madinah_qa_pfn, hf_repo="MBZUAI/MadinahQA", - metrics=[Metrics.loglikelihood_acc_norm], + metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=["dev"], diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 75185113d..42875dab3 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -33,6 +33,7 @@ import random from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import math_normalizer from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.extended.ifeval.main import ifeval_metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -137,7 +138,10 @@ def prompt_bac_fr(line, task_name: str = None): few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), + Metrics.exact_match, + ], stop_sequence=["\n"], trust_dataset=True, version=0, diff --git a/community_tasks/serbian_eval.py b/community_tasks/serbian_eval.py index 9cfe85e31..1d24ebe83 100644 --- a/community_tasks/serbian_eval.py +++ b/community_tasks/serbian_eval.py @@ -35,6 +35,7 @@ from typing import List, Optional from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -300,7 +301,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.ARC_EASY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) arc_challenge = create_task_config( @@ -308,7 +309,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.ARC_CHALLENGE.value, - 
metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -320,14 +321,14 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.HELLASWAG.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) piqa = create_task_config( task_name="serbian_evals:piqa", prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.PIQA.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) winogrande = create_task_config( @@ -335,7 +336,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.WINOGRANDE.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -359,7 +360,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ANATOMY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_astronomy = create_task_config( @@ -367,7 +368,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ASTRONOMY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_business_ethics = create_task_config( @@ -375,7 +376,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_BUSINESS_ETHICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_clinical_knowledge = create_task_config( @@ -383,7 +384,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_CLINICAL_KNOWLEDGE.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_miscellaneous = create_task_config( @@ -391,7 +392,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MISCELLANEOUS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_electrical_engineering = create_task_config( @@ -399,7 +400,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ELECTRONIC_ENGINEERING.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -411,7 +412,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_SERBIAN_ALL.value, - metric=[Metrics.loglikelihood_acc_norm], + 
metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -423,7 +424,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MARKETING.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_management = create_task_config( @@ -431,7 +432,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MANAGEMENT.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -443,7 +444,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_BIOLOGY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_chemistry = create_task_config( @@ -451,7 +452,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_CHEMISTRY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_computer_science = create_task_config( @@ -459,7 +460,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SCIENCE.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_mathematics = create_task_config( @@ -467,7 +468,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_MATHEMATICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_medicine = create_task_config( @@ -475,7 +476,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_MEDICINE.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_college_physics = create_task_config( @@ -483,7 +484,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_PHYSICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_computer_security = create_task_config( @@ -491,7 +492,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SECURITY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -503,7 +504,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MORAL_DISPUTES.value, - 
metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_moral_scenarios = create_task_config( @@ -511,7 +512,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MORAL_SCENARIOS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_philosophy = create_task_config( @@ -519,7 +520,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_PHILOSOPHY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_world_religions = create_task_config( @@ -527,7 +528,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_WORLD_RELIGIONS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -539,7 +540,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_BIOLOGY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_chemistry = create_task_config( @@ -547,7 +548,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_CHEMISTRY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_computer_science = create_task_config( @@ -555,7 +556,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_COMPUTER_SCIENCE.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_european_history = create_task_config( @@ -563,7 +564,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_EURO_HISTORY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_geography = create_task_config( @@ -571,7 +572,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_GEOGRAPHY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_mathematics = create_task_config( @@ -579,7 +580,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MATHEMATICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_microeconomics = create_task_config( @@ -587,7 +588,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, 
hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MICROECONOMICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_physics = create_task_config( @@ -595,7 +596,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PHYSICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_psychology = create_task_config( @@ -603,7 +604,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PSYCHOLOGY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_statistics = create_task_config( @@ -611,7 +612,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_STATISTICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_high_school_world_history = create_task_config( @@ -619,7 +620,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_WORLD_HISTORY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -631,7 +632,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ABSTRACT_ALGEBRA.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_elementary_mathematics = create_task_config( @@ -639,7 +640,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ELEMENTARY_MATHEMATICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_formal_logic = create_task_config( @@ -647,7 +648,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_FORMAL_LOGIC.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_conceptual_physics = create_task_config( @@ -655,7 +656,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_CONCEPTUAL_PHYSICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_econometrics = create_task_config( @@ -663,7 +664,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_ECONOMETRICS.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_machine_learning = create_task_config( @@ -671,7 +672,7 @@ def 
create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_MACHINE_LEARNING.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -683,7 +684,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_GLOBAL_FACT.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_logical_fallacies = create_task_config( @@ -691,7 +692,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_LOGICAL_FALLACIES.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_sociology = create_task_config( @@ -699,7 +700,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_SOCIOLOGY.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) mmlu_human_aging = create_task_config( @@ -707,7 +708,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.MMLU_HUMAN_AGING.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) # ============================================ @@ -719,7 +720,7 @@ def create_task_config( prompt_function=boolq_serbian, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.BOOLQ.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) openbook_qa = create_task_config( @@ -727,7 +728,7 @@ def create_task_config( prompt_function=serbian_eval_prompt, hf_repo=HFSubsets.HF_BASE_REPO.value, hf_subset=HFSubsets.OPENBOOK.value, - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], ) diff --git a/community_tasks/turkic_evals.py b/community_tasks/turkic_evals.py index 82aadeafc..255773504 100644 --- a/community_tasks/turkic_evals.py +++ b/community_tasks/turkic_evals.py @@ -37,15 +37,10 @@ } """ -import random -import re from functools import partial -from typing import Any, Dict, List, Optional, Union -from lighteval.metrics.llm_as_judge import JudgeLM -from lighteval.metrics.metrics import MetricCategory, Metrics -from lighteval.metrics.utils.metric_utils import Metric, MetricUseCase -from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -128,7 +123,7 @@ def __init__( hf_subset=hf_subset, prompt_function=partial(tumlu_pfn, language=hf_subset), hf_repo="jafarisbarov/TUMLU-mini", - metric=[Metrics.loglikelihood_acc_norm], + metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split=["dev"], diff --git a/docs/source/metric-list.mdx 
b/docs/source/metric-list.mdx index c1315fdb1..e089fb8cd 100644 --- a/docs/source/metric-list.mdx +++ b/docs/source/metric-list.mdx @@ -3,14 +3,13 @@ ## Automatic metrics for multiple-choice tasks These metrics use log-likelihood of the different possible targets. -- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token -- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token +- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct +- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored. -- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token +- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - `mcc`: Matthew's correlation coefficient (a measure of agreement between statistical distributions). -- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice -- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice -- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token +- `recall_at_k`: Fraction of instances where the choice with the k-th best logprob or better was correct +- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - `target_perplexity`: Perplexity of the different choices available. - `acc_golds_likelihood`: A bit different, it actually checks if the average logprob of a single target is above or below 0.5. - `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets.
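Note: with `loglikelihood_acc_norm` and `loglikelihood_acc_norm_nospace` no longer being separate enum members, the normalized variants documented above are obtained by parametrizing `loglikelihood_acc` at task-definition time. A minimal sketch of that usage, mirroring the OpenBookQA entry touched in the nanotron hunk below (the imports are exactly the ones added in that hunk; nothing beyond what the patch itself uses is assumed):

import lighteval.tasks.default_prompts as prompt
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import LogProbCharNorm
from lighteval.tasks.lighteval_task import LightevalTaskConfig

# Plain accuracy plus the character-normalized variant: the normalization is
# now requested by passing sample_params to the base metric instead of using
# a dedicated enum member.
openbookqa_example = LightevalTaskConfig(
    name="openbookqa",
    prompt_function=prompt.openbookqa,
    hf_repo="openbookqa",
    hf_subset="main",
    metric=[
        Metrics.loglikelihood_acc,
        Metrics.loglikelihood_acc(
            sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}
        ),
    ],
    trust_dataset=True,
    stop_sequence=["\n"],
)
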
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index 49010098c..aa41c2d91 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -33,6 +33,7 @@ import lighteval.tasks.default_prompts as prompt from lighteval.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm, helm_normalizer, math_normalizer from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -89,7 +90,12 @@ def preprocess(text): prompt_function=hellaswag_prompt, hf_repo="hellaswag", hf_subset="default", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -98,7 +104,12 @@ def preprocess(text): prompt_function=prompt.winogrande, hf_repo="winogrande", hf_subset="winogrande_xl", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -107,7 +118,12 @@ def preprocess(text): prompt_function=prompt.piqa_harness, hf_repo="piqa", hf_subset="plain_text", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -117,7 +133,12 @@ def preprocess(text): hf_repo="lighteval/siqa", hf_subset="default", hf_avail_splits=["train", "validation"], - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -126,7 +147,12 @@ def preprocess(text): prompt_function=prompt.openbookqa, hf_repo="openbookqa", hf_subset="main", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -137,7 +163,12 @@ def preprocess(text): hf_subset="ARC-Easy", evaluation_splits=["test"], generation_size=1, - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -148,7 +179,12 @@ def preprocess(text): hf_subset="ARC-Challenge", evaluation_splits=["test"], generation_size=1, - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -157,7 +193,12 @@ def preprocess(text): prompt_function=commonsense_qa_prompt, 
hf_repo="commonsense_qa", hf_subset="default", - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], trust_dataset=True, stop_sequence=["\n"], ), @@ -187,7 +228,9 @@ def natural_questions_prompt(line, task_name: str = None): prompt_function=prompt.triviaqa, hf_repo="trivia_qa", hf_subset="rc.nocontext", - metric=[Metrics.quasi_exact_match], + metric=[ + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}) + ], generation_size=20, trust_dataset=True, stop_sequence=["\n", ".", ","], @@ -197,7 +240,9 @@ def natural_questions_prompt(line, task_name: str = None): prompt_function=natural_questions_prompt, hf_repo="lighteval/natural_questions_clean", hf_subset="default", - metric=[Metrics.quasi_exact_match], + metric=[ + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}) + ], generation_size=20, trust_dataset=True, stop_sequence=["\n", ".", ","], @@ -236,7 +281,9 @@ def boolq_prompt(line, task_name: str = None): prompt_function=prompt.quac, hf_repo="lighteval/quac_helm", hf_subset="deault", - metric=[Metrics.quasi_exact_match], + metric=[ + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}) + ], generation_size=20, trust_dataset=True, stop_sequence=["\n", ".", ","], @@ -259,7 +306,9 @@ def __init__( prompt_function=prompt.math, hf_repo="DigitalLearningGmbH/MATH-lighteval", hf_subset=None, - metric=[Metrics.quasi_exact_match_math], + metric=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) + ], hf_avail_splits=None, evaluation_splits=["test"], few_shots_split=None, @@ -358,7 +407,12 @@ def __init__( hf_repo="lighteval/mmlu", hf_subset=None, # metric=[Metrics.loglikelihood_acc_single_token], - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], hf_avail_splits=None, evaluation_splits=["test"], few_shots_split="dev", @@ -603,7 +657,12 @@ def __init__( hf_repo="lighteval/agi_eval_en", hf_subset=None, # metric=[Metrics.loglikelihood_acc_single_token], - metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metric=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc( + sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)} + ), + ], hf_avail_splits=["train", "validation"], evaluation_splits=["train"], few_shots_split="validation", @@ -640,7 +699,10 @@ def __init__( name="agi_eval:math", hf_subset="math", prompt_function=agi_eval_math_prompt, - metric=[Metrics.exact_match, Metrics.quasi_exact_match], + metric=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], generation_size=40, ), CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"), diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index d14a73201..e2470d5b3 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -59,12 +59,7 @@ acc_golds_likelihood, ) from lighteval.metrics.normalizations import ( - LogProbCharNorm, bigbench_normalizer, - 
gsm8k_normalizer, - harness_triviaqa_normalizer, - helm_normalizer, - math_normalizer, remove_braces, remove_braces_and_strip, ) @@ -227,13 +222,6 @@ class Metrics(Enum): "summarization_compression": True, }, ) - f1_score_quasi = SampleLevelMetric( - metric_name="f1_score_quasi", - sample_level_fn=F1_score(normalize_gold=helm_normalizer, normalize_pred=helm_normalizer), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) f1_score = SampleLevelMetric( metric_name="f1", sample_level_fn=F1_score(), @@ -316,20 +304,6 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - loglikelihood_acc_norm = SampleLevelMetric( - metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - loglikelihood_acc_norm_nospace = SampleLevelMetric( - metric_name="acc_norm", - sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm(ignore_first_space=True)), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) loglikelihood_f1 = CorpusLevelMetric( metric_name="loglikelihood_f1", sample_level_fn=LoglikelihoodPreparator(), @@ -366,14 +340,14 @@ class Metrics(Enum): higher_is_better=True, ) pass_at_k = SampleLevelMetric( - metric_name="pass@k:n samples", + metric_name="pass@k", sample_level_fn=PassAtK(strip_strings=True), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, ) pass_at_k_math = SampleLevelMetric( - metric_name="pass@k:n samples", + metric_name="pass@k", sample_level_fn=PassAtK( strip_strings=True, # Extracting mathematical expressions and latex expressions @@ -389,7 +363,7 @@ class Metrics(Enum): higher_is_better=True, ) pass_at_k_letters = SampleLevelMetric( - metric_name="pass@k:n samples", + metric_name="pass@k", sample_level_fn=PassAtK( sample_scoring_function=DynamicMultilingualExtractiveMatch( language=Language.ENGLISH, @@ -409,70 +383,9 @@ class Metrics(Enum): corpus_level_fn=CorpusLevelPerplexityMetric("perplexity"), higher_is_better=True, ) - prefix_exact_match = SampleLevelMetric( - metric_name="pem", - sample_level_fn=ExactMatches(strip_strings=True, type_exact_match="prefix"), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - prefix_quasi_exact_match = SampleLevelMetric( - metric_name="pqem", - sample_level_fn=ExactMatches( - normalize_gold=helm_normalizer, - normalize_pred=helm_normalizer, - type_exact_match="prefix", - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches( - normalize_gold=helm_normalizer, - normalize_pred=helm_normalizer, - strip_strings=True, - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match_math = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches( - strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match_triviaqa = SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches(strip_strings=True, normalize_pred=harness_triviaqa_normalizer), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - quasi_exact_match_gsm8k = 
SampleLevelMetric( - metric_name="qem", - sample_level_fn=ExactMatches( - strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ), - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - recall_at_1 = SampleLevelMetric( - metric_name="acc", - sample_level_fn=Recall(at=1), - category=SamplingMethod.LOGPROBS, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - recall_at_2 = SampleLevelMetric( - metric_name="recall@2", - sample_level_fn=Recall(at=2), + recall_at_k = SampleLevelMetric( + metric_name="recall", + sample_level_fn=Recall(k=1), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 91d23ae91..1026a21a1 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -394,14 +394,14 @@ def compute( class Recall: - def __init__(self, at: int) -> None: - """Recall metric class. It checks if the top `at` best choices include one of the golds or not. + def __init__(self, k: int) -> None: + """Recall metric class. It checks if the top `k` best choices include one of the golds or not. Args: - at (int): Depth level of the recall. Recall at 1 is equivalent to a logprob accuracy without normalization. + k (int): Depth level of the recall. Recall at k=1 is equivalent to a logprob accuracy without normalization. """ - self.recall_depth = at + self.recall_depth = k def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> int: """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 89c1cab3e..6f848b48a 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -22,6 +22,10 @@ import lighteval.tasks.default_prompts as prompt from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import ( + LogProbCharNorm, + gsm8k_normalizer, + harness_triviaqa_normalizer, + helm_normalizer, math_normalizer, ) from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -104,7 +108,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -120,7 +127,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -136,7 +146,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -152,7 +165,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, +
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -168,7 +184,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -184,7 +203,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -200,7 +222,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -216,7 +241,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -232,7 +260,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -248,7 +279,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -264,7 +298,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -280,7 +317,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -296,7 +336,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": 
LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -312,7 +355,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -328,7 +374,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -344,7 +393,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -360,7 +412,10 @@ few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=None, trust_dataset=True, version=0, @@ -376,7 +431,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.pass_at_k_math({"k": 1})], + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1})], version=2, ) aime24_avg = LightevalTaskConfig( @@ -390,7 +445,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.avg_at_k_math({"k": 64})], + metrics=[Metrics.avg_at_k_math(sample_params={"k": 64})], version=2, ) aime24_gpassk = LightevalTaskConfig( @@ -404,7 +459,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_k_math({"k": 16, "n": 48})], + metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], version=1, ) aime25 = LightevalTaskConfig( @@ -418,7 +473,7 @@ few_shots_split=None, few_shots_select=None, generation_size=10000, - metrics=[Metrics.pass_at_k_math({"k": 1, "n": 1})], + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1})], version=2, ) aime25_gpassk = LightevalTaskConfig( @@ -432,7 +487,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_k_math({"k": 16, "n": 48})], + metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], version=1, ) anachronisms_bigbench = LightevalTaskConfig( @@ -574,7 +629,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -590,7 +648,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + 
], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -606,7 +667,10 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -622,7 +686,10 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -798,7 +865,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -814,7 +881,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -878,7 +945,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=None, trust_dataset=True, version=0, @@ -896,9 +963,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -1203,7 +1276,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1220,7 +1296,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1237,7 +1316,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1254,7 +1336,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1271,7 +1356,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1288,7 +1376,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1305,7 +1396,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1322,7 +1416,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1339,7 +1436,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1356,7 +1456,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1373,7 +1476,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1390,7 +1496,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1407,7 +1516,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + 
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1424,7 +1536,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1441,7 +1556,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1458,7 +1576,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1475,7 +1596,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1492,7 +1616,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["", "Q=", "\n\n"], must_remove_duplicate_docs=True, trust_dataset=True, @@ -1511,10 +1638,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1533,10 +1666,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1555,10 +1694,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - 
Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1577,10 +1722,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1599,10 +1750,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1621,10 +1778,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1643,10 +1806,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1665,10 +1834,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, 
"normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1687,10 +1862,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1709,10 +1890,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1731,10 +1918,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1753,10 +1946,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1775,10 +1974,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + 
"normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1797,10 +2002,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1819,10 +2030,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1841,10 +2058,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1863,10 +2086,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1885,10 +2114,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], 
trust_dataset=True, @@ -1907,10 +2142,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1929,10 +2170,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1951,10 +2198,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1973,10 +2226,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -1995,10 +2254,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2017,10 +2282,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, 
- Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2039,10 +2310,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2061,10 +2338,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2083,10 +2366,16 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["", "Q=", "\n\n"], trust_dataset=True, @@ -2105,10 +2394,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2127,10 +2422,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + 
Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2149,10 +2450,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2171,10 +2478,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2193,10 +2506,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2215,10 +2534,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2237,10 +2562,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + 
Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2259,10 +2590,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2281,10 +2618,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2303,10 +2646,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2325,10 +2674,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2347,10 +2702,16 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - Metrics.exact_match({"strip_strings": False}), + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.exact_match(sample_params={"strip_strings": False}), ], stop_sequence=["\n"], trust_dataset=True, @@ -2383,7 +2744,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, 
Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -2401,9 +2765,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2422,9 +2792,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2443,9 +2819,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2464,9 +2846,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2485,9 +2873,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2506,10 +2900,16 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, - ], + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -2527,9 
+2927,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2548,9 +2954,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2569,9 +2981,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2590,9 +3008,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2611,9 +3035,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2632,9 +3062,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2653,9 +3089,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2674,9 +3116,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2695,9 +3143,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2716,9 +3170,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2737,9 +3197,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2758,9 +3224,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2779,9 +3251,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], 
stop_sequence=["\n"], trust_dataset=True, @@ -2800,9 +3278,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2821,9 +3305,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2842,9 +3332,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2863,9 +3359,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -2884,9 +3386,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3161,9 +3669,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3182,9 +3696,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3203,9 +3723,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3224,9 +3750,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3245,9 +3777,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3264,7 +3802,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3282,9 +3823,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3303,9 +3850,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3324,9 +3877,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3345,9 +3904,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3366,9 +3931,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3387,9 +3958,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3406,7 +3983,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3422,7 +4002,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3440,9 +4023,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3459,7 +4048,10 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, 
+ Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3477,9 +4069,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3498,9 +4096,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3519,9 +4123,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3540,9 +4150,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3561,9 +4177,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3582,9 +4204,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3603,9 +4231,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3624,9 +4258,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3645,9 +4285,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3666,9 +4312,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -3701,7 +4353,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3733,7 +4388,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3765,7 +4423,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3797,7 +4458,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3829,7 +4493,10 @@ few_shots_split=None, 
few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3861,7 +4528,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3893,7 +4563,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3925,7 +4598,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3957,7 +4633,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -3989,7 +4668,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4021,7 +4703,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4053,7 +4738,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4085,7 +4773,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4117,7 +4808,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4149,7 +4843,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, 
Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4181,7 +4878,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4213,7 +4913,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4245,7 +4948,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4277,7 +4983,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4309,7 +5018,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4341,7 +5053,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4373,7 +5088,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4405,7 +5123,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4437,7 +5158,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4469,7 +5193,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + 
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4501,7 +5228,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4533,7 +5263,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4565,7 +5298,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4597,7 +5333,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4629,7 +5368,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4661,7 +5403,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4693,7 +5438,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4725,7 +5473,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4757,7 +5508,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4789,7 +5543,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], 
stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4821,7 +5578,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4853,7 +5613,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4885,7 +5648,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4917,7 +5683,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4949,7 +5718,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -4981,7 +5753,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5013,7 +5788,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5045,7 +5823,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5077,7 +5858,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5109,7 +5893,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5141,7 +5928,10 @@ 
few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5173,7 +5963,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5205,7 +5998,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5237,7 +6033,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5269,7 +6068,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5301,7 +6103,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5333,7 +6138,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5365,7 +6173,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5397,7 +6208,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5429,7 +6243,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5461,7 +6278,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - 
metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5493,7 +6313,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5525,7 +6348,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5557,7 +6383,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5589,7 +6418,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5621,7 +6453,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5653,7 +6488,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5685,7 +6523,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5717,7 +6558,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5749,7 +6593,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5781,7 +6628,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + 
Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5813,7 +6663,10 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -5927,9 +6780,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -5948,9 +6807,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -5967,7 +6832,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6015,7 +6880,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6031,7 +6896,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6047,7 +6912,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6081,9 +6946,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6104,9 +6975,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, 
"normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6127,9 +7004,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6150,9 +7033,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6173,9 +7062,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6196,9 +7091,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6219,9 +7120,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6242,9 +7149,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + 
"type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6265,9 +7178,15 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -6318,7 +7237,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6352,9 +7276,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -6387,7 +7317,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=[".", ";", "!", "?"], trust_dataset=True, version=0, @@ -6403,7 +7333,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6690,7 +7620,7 @@ "choices": [line["answers"]["input_text"][0]], }, ), - suite=("lighteval",), + suite=["lighteval"], hf_repo="stanfordnlp/coqa", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -6699,8 +7629,14 @@ generation_size=100, version=1, metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) coqa_bb_lighteval = LightevalTaskConfig( @@ -6714,7 +7650,7 @@ few_shots_split=None, few_shots_select=None, generation_size=10, - metrics=[Metrics.exact_match({"strip_strings": False}), Metrics.f1_score], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False}), Metrics.f1_score], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6732,7 +7668,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -6801,7 +7737,7 @@ 
few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6897,7 +7833,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -6935,8 +7871,14 @@ generation_size=250, stop_sequence=["Question:", "question:", "\n"], metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), version=1, ) @@ -7031,7 +7973,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7145,9 +8092,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7166,9 +8119,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7187,9 +8146,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7208,9 +8173,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, 
@@ -7229,9 +8200,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7250,9 +8227,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7271,9 +8254,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7292,9 +8281,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7313,9 +8308,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7334,9 +8335,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7355,9 +8362,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7376,9 +8389,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7397,9 +8416,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7418,9 +8443,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7439,9 +8470,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -7666,7 +8703,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7698,7 +8735,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -7906,11 +8948,7 @@ few_shots_split=None, few_shots_select=None, generation_size=32768, # needed for reasoning models like R1 - metrics=[ - Metrics.gpqa_instruct_pass_at_k({"k": 1}), - Metrics.gpqa_instruct_pass_at_k({"k": 1, "n": 4}), - Metrics.gpqa_instruct_pass_at_k({"k": 1, "n": 8}), - ], + metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})], stop_sequence=[], # no stop sequence, 
will use eos token trust_dataset=True, version=1, @@ -7990,7 +9028,9 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=256, - metrics=[Metrics.quasi_exact_match_gsm8k], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": gsm8k_normalizer, "normalize_pred": gsm8k_normalizer}) + ], stop_sequence=[], trust_dataset=True, version=0, @@ -8024,7 +9064,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -8040,7 +9083,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -8056,7 +9102,10 @@ few_shots_split=None, few_shots_select="random_sampling_from_train", generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -8074,9 +9123,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8109,7 +9164,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -8223,9 +9278,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -8246,9 +9307,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -8317,9 +9384,15 @@ generation_size=-1, metrics=[ 
Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8338,9 +9411,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8359,9 +9438,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8380,9 +9465,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8401,9 +9492,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8422,9 +9519,15 @@ generation_size=-1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -8457,7 +9560,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], 
trust_dataset=True, version=0, @@ -8703,8 +9806,14 @@ generation_size=250, stop_sequence=["\n", "Question:", "question:"], metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) kanji_ascii_bigbench = LightevalTaskConfig( @@ -8926,7 +10035,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9030,9 +10139,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -9051,7 +10166,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9073,7 +10188,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9095,7 +10210,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9117,7 +10232,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9139,7 +10254,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9161,7 +10276,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9183,7 +10298,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9205,7 +10320,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), 
Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9227,7 +10342,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9249,7 +10364,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9271,7 +10386,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9293,7 +10408,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9315,7 +10430,7 @@ generation_size=20, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9335,7 +10450,11 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match, Metrics.f1_score], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.f1_score, + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9353,7 +10472,7 @@ generation_size=430, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9375,7 +10494,7 @@ generation_size=788, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9397,7 +10516,7 @@ generation_size=338, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9419,7 +10538,7 @@ generation_size=274, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9441,7 +10560,7 @@ generation_size=274, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9463,7 +10582,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9485,7 +10604,7 @@ 
generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9507,7 +10626,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9529,7 +10648,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9551,7 +10670,7 @@ generation_size=10, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9573,7 +10692,7 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.f1_score_macro, Metrics.f1_score_micro, @@ -9593,7 +10712,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9609,7 +10728,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=None, trust_dataset=True, version=0, @@ -9705,7 +10824,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -9723,9 +10845,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -9744,9 +10872,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -9765,9 +10899,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - 
Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -9786,9 +10926,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -9807,9 +10953,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -9842,7 +10994,7 @@ few_shots_split=None, few_shots_select=None, generation_size=8192, - metrics=[Metrics.g_pass_at_k_latex({"k": 16, "n": 48})], + metrics=[Metrics.g_pass_at_k_latex(sample_params={"k": 16, "n": 48})], version=1, ) math_algebra_lighteval = LightevalTaskConfig( @@ -9857,7 +11009,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -9883,7 +11035,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -9909,7 +11061,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -9935,7 +11087,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -9961,7 +11113,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -9987,7 +11139,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10013,7 +11165,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + 
Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10039,7 +11191,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10065,7 +11217,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10091,7 +11243,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10117,7 +11269,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10143,7 +11295,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10169,7 +11321,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10195,7 +11347,7 @@ few_shots_select=None, generation_size=2048, metrics=[ - Metrics.quasi_exact_match_math, + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), Metrics.maj_at_k( sample_params={ "k": 4, @@ -10236,7 +11388,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10252,7 +11407,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -10270,7 +11425,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -10293,7 +11448,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -10316,7 +11471,7 @@ generation_size=128, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -10340,9 +11495,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - 
Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10361,7 +11522,7 @@ generation_size=512, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -10385,9 +11546,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10436,7 +11603,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Question="], trust_dataset=True, version=0, @@ -10452,7 +11622,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Pregunta="], trust_dataset=True, version=0, @@ -10468,7 +11641,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Question="], trust_dataset=True, version=0, @@ -10484,7 +11660,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Frage="], trust_dataset=True, version=0, @@ -10500,7 +11679,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="], trust_dataset=True, version=0, @@ -10516,7 +11698,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u95ee\u9898="], trust_dataset=True, version=0, @@ -10532,7 +11717,10 @@ few_shots_split=None, 
few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u554f\u984c="], trust_dataset=True, version=0, @@ -10548,7 +11736,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="], trust_dataset=True, version=0, @@ -10564,7 +11755,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "Swali="], trust_dataset=True, version=0, @@ -10580,7 +11774,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="], trust_dataset=True, version=0, @@ -10596,7 +11793,10 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="], trust_dataset=True, version=0, @@ -10694,9 +11894,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10747,9 +11953,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10800,9 +12012,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10853,9 
+12071,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10906,9 +12130,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -10959,9 +12189,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11012,9 +12248,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11065,9 +12307,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11118,9 +12366,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11171,9 +12425,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11224,9 +12484,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11277,9 +12543,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11330,9 +12602,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11383,9 +12661,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11436,9 +12720,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11489,9 +12779,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], 
stop_sequence=["\n"], trust_dataset=True, @@ -11542,9 +12838,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11595,9 +12897,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11648,9 +12956,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11701,9 +13015,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11754,9 +13074,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11807,9 +13133,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11860,9 +13192,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11913,9 +13251,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -11966,9 +13310,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12019,9 +13369,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12072,9 +13428,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12125,9 +13487,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12178,9 +13546,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + 
"normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12231,9 +13605,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12284,9 +13664,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12337,9 +13723,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12390,9 +13782,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12443,9 +13841,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12496,9 +13900,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12549,9 +13959,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - 
Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12602,9 +14018,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12655,9 +14077,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12708,9 +14136,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12761,9 +14195,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12814,9 +14254,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12867,9 +14313,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + 
"normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12920,9 +14372,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -12973,9 +14431,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13026,9 +14490,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13079,9 +14549,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13132,9 +14608,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13185,9 +14667,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13238,9 +14726,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13291,9 +14785,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13344,9 +14844,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13397,9 +14903,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13450,9 +14962,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13503,9 +15021,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13556,9 +15080,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": 
"prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13609,9 +15139,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13662,9 +15198,15 @@ generation_size=5, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -13697,7 +15239,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -13825,7 +15367,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -13905,7 +15447,7 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.recall_at_1, Metrics.recall_at_2, Metrics.mrr], + metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -13921,7 +15463,7 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.recall_at_1, Metrics.recall_at_2, Metrics.mrr], + metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -13939,7 +15481,7 @@ generation_size=100, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), Metrics.f1_score, Metrics.rougeL, Metrics.bleu_1, @@ -13979,8 +15521,14 @@ generation_size=250, stop_sequence=["\n", "Question:", "question:"], metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) navigate_bigbench = LightevalTaskConfig( @@ -14042,7 +15590,10 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ 
-14058,7 +15609,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14074,7 +15628,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14090,7 +15647,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14106,7 +15666,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14122,7 +15685,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14138,7 +15704,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14154,7 +15723,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match, Metrics.quasi_exact_match],
+    metrics=[
+        Metrics.exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14170,7 +15742,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=1,
-    metrics=[Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14204,9 +15776,15 @@
     generation_size=1,
     metrics=[
         Metrics.exact_match,
-        Metrics.quasi_exact_match,
-        Metrics.prefix_exact_match,
-        Metrics.prefix_quasi_exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+        Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
+        Metrics.exact_match(
+            sample_params={
+                "normalize_gold": helm_normalizer,
+                "normalize_pred": helm_normalizer,
+                "type_exact_match": "prefix",
+            }
+        ),
     ],
     stop_sequence=["\n"],
     trust_dataset=True,
@@ -14223,7 +15801,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=-1,
-    metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
+    metrics=[
+        Metrics.loglikelihood_acc,
+        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14239,7 +15820,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=20,
-    metrics=[Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14255,7 +15836,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=1,
-    metrics=[Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14287,7 +15868,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=100,
-    metrics=[Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=None,
     trust_dataset=True,
     version=0,
@@ -14303,7 +15884,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=1,
-    metrics=[Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14319,7 +15900,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=1,
-    metrics=[Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14399,7 +15980,7 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=1,
-    metrics=[Metrics.bleu, Metrics.exact_match({"strip_strings": False})],
+    metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14415,7 +15996,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=-1,
-    metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
+    metrics=[
+        Metrics.loglikelihood_acc,
+        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14433,9 +16017,15 @@
     generation_size=1,
     metrics=[
         Metrics.exact_match,
-        Metrics.quasi_exact_match,
-        Metrics.prefix_exact_match,
-        Metrics.prefix_quasi_exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+        Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
+        Metrics.exact_match(
+            sample_params={
+                "normalize_gold": helm_normalizer,
+                "normalize_pred": helm_normalizer,
+                "type_exact_match": "prefix",
+            }
+        ),
     ],
     stop_sequence=["\n"],
     trust_dataset=True,
@@ -14500,7 +16090,10 @@
     few_shots_split=None,
     few_shots_select=None,
     generation_size=-1,
-    metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
+    metrics=[
+        Metrics.loglikelihood_acc,
+        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
+    ],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -14534,9 +16127,15 @@
     generation_size=1,
     metrics=[
         Metrics.exact_match,
-        Metrics.quasi_exact_match,
-        Metrics.prefix_exact_match,
-        Metrics.prefix_quasi_exact_match,
+        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
+        Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
+
Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -14553,7 +16152,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14569,7 +16171,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14585,8 +16190,11 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], - stop_sequence=["\n"], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], + stop_sequence=["\n"], trust_dataset=True, version=0, ) @@ -14601,7 +16209,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleurt, Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[ + Metrics.bleurt, + Metrics.bleu, + Metrics.rouge_t5, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14617,7 +16230,7 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.f1_score_quasi], + metrics=[Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14649,7 +16262,11 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match, Metrics.quasi_exact_match, Metrics.f1_score], + metrics=[ + Metrics.exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.f1_score, + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -14699,9 +16316,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14722,9 +16345,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ 
-14745,9 +16374,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14768,9 +16403,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14791,9 +16432,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14814,9 +16461,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14837,9 +16490,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14860,9 +16519,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14883,9 +16548,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14906,9 +16577,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14929,9 +16606,15 @@ generation_size=30, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), Metrics.f1_score_macro, Metrics.f1_score_micro, ], @@ -14998,7 +16681,7 @@ few_shots_split=None, few_shots_select=None, generation_size=100, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15014,7 +16697,12 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.loglikelihood_acc, Metrics.exact_match({"strip_strings": False})], + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15094,7 +16782,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15110,7 +16798,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15126,7 +16817,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15142,7 +16833,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15190,7 +16881,7 @@ 
few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15222,7 +16913,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15254,7 +16945,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15270,7 +16961,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.bleu, Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15302,7 +16993,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15320,9 +17011,15 @@ generation_size=1, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -15411,8 +17108,14 @@ stop_sequence=["\n", "Question:", "question:"], generation_size=200, metrics=( - Metrics.prefix_quasi_exact_match, - Metrics.f1_score_quasi, + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), + Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), ), ) storycloze_2016_lighteval = LightevalTaskConfig( @@ -15490,7 +17193,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15719,7 +17422,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -15769,9 +17475,15 @@ generation_size=50, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + 
), ], stop_sequence=["\n"], trust_dataset=True, @@ -15822,9 +17534,15 @@ generation_size=50, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -15843,9 +17561,15 @@ generation_size=50, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -15894,7 +17618,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16582,7 +18306,10 @@ few_shots_split=None, few_shots_select=None, generation_size=-1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm], + metrics=[ + Metrics.loglikelihood_acc, + Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), + ], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16630,7 +18357,7 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.quasi_exact_match_triviaqa], + metrics=[Metrics.exact_match(sample_params={"normalize_pred": harness_triviaqa_normalizer})], stop_sequence=["\n", ".", ","], trust_dataset=True, version=0, @@ -16681,9 +18408,15 @@ metrics=[ Metrics.loglikelihood_acc, Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -16796,7 +18529,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16812,7 +18545,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16828,7 +18561,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16844,7 +18577,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - 
metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16860,7 +18593,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16876,7 +18609,7 @@ few_shots_split=None, few_shots_select=None, generation_size=5, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -16958,9 +18691,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -16979,9 +18718,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17000,9 +18745,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17021,9 +18772,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17042,9 +18799,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17063,9 +18826,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17084,9 +18853,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17105,9 +18880,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17126,9 +18907,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17147,9 +18934,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17168,9 +18961,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17189,9 +18988,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": 
"prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17210,9 +19015,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17231,9 +19042,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17252,9 +19069,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17273,9 +19096,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17294,9 +19123,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17315,9 +19150,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17336,9 +19177,15 @@ generation_size=8, 
metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17357,9 +19204,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17378,9 +19231,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17399,9 +19258,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17420,9 +19285,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17441,9 +19312,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17462,9 +19339,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + 
Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17483,9 +19366,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17504,9 +19393,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17525,9 +19420,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17546,9 +19447,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17567,9 +19474,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17588,9 +19501,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], 
trust_dataset=True, @@ -17609,9 +19528,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17630,9 +19555,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17651,9 +19582,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17672,9 +19609,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17693,9 +19636,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17714,9 +19663,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17735,9 +19690,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": 
helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17756,9 +19717,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17777,9 +19744,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17798,9 +19771,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17819,9 +19798,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17840,9 +19825,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17861,9 +19852,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": 
"prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17882,9 +19879,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17903,9 +19906,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17924,9 +19933,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17945,9 +19960,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17966,9 +19987,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -17987,9 +20014,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18008,9 +20041,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18029,9 +20068,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18050,9 +20095,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18071,9 +20122,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18092,9 +20149,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18113,9 +20176,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18134,9 +20203,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + 
"normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18155,9 +20230,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18176,9 +20257,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18197,9 +20284,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18218,9 +20311,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18239,9 +20338,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18260,9 +20365,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18281,9 +20392,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - 
Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18302,9 +20419,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18323,9 +20446,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18344,9 +20473,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18365,9 +20500,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18386,9 +20527,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18407,9 +20554,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + 
"normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18428,9 +20581,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18449,9 +20608,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18470,9 +20635,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18491,9 +20662,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18512,9 +20689,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18533,9 +20716,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18554,9 +20743,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - 
Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18575,9 +20770,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18596,9 +20797,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18617,9 +20824,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18638,9 +20851,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18659,9 +20878,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18680,9 +20905,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": 
"prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18701,9 +20932,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18722,9 +20959,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -18743,9 +20986,15 @@ generation_size=8, metrics=[ Metrics.exact_match, - Metrics.quasi_exact_match, - Metrics.prefix_exact_match, - Metrics.prefix_quasi_exact_match, + Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), + Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), + Metrics.exact_match( + sample_params={ + "normalize_gold": helm_normalizer, + "normalize_pred": helm_normalizer, + "type_exact_match": "prefix", + } + ), ], stop_sequence=["\n"], trust_dataset=True, @@ -21546,7 +23795,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, @@ -21562,7 +23811,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.exact_match({"strip_strings": False})], + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], stop_sequence=["\n"], trust_dataset=True, version=0, From bc4bb7efb8131dfa1ca28b7dd821d0b4eafca357 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 13:30:18 +0000 Subject: [PATCH 10/38] updating test suite given the new names --- examples/nanotron/custom_evaluation_tasks.py | 2 -- examples/nanotron/custom_task.py | 4 ++-- src/lighteval/metrics/utils/metric_utils.py | 2 +- src/lighteval/tasks/default_tasks.py | 2 +- tests/reference_scores/harness_metrics.json | 4 ++-- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index aa41c2d91..26fdf8dd2 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -406,7 +406,6 @@ def __init__( prompt_function=mmlu_prompt, hf_repo="lighteval/mmlu", hf_subset=None, - # metric=[Metrics.loglikelihood_acc_single_token], metric=[ Metrics.loglikelihood_acc, Metrics.loglikelihood_acc( @@ -656,7 +655,6 @@ def __init__( prompt_function=agi_eval_prompt_no_letters, hf_repo="lighteval/agi_eval_en", hf_subset=None, - # metric=[Metrics.loglikelihood_acc_single_token], 
metric=[ Metrics.loglikelihood_acc, Metrics.loglikelihood_acc( diff --git a/examples/nanotron/custom_task.py b/examples/nanotron/custom_task.py index feaa849ba..55a318edd 100644 --- a/examples/nanotron/custom_task.py +++ b/examples/nanotron/custom_task.py @@ -80,7 +80,7 @@ def mmlu_anatomy(line): few_shots_split="dev", few_shots_select="sequential", generation_size=5, - metric=[Metrics.loglikelihood_acc_single_token], + metric=[Metrics.loglikelihood_acc], stop_sequence=["\n"], ), LightevalTaskConfig( @@ -94,7 +94,7 @@ def mmlu_anatomy(line): few_shots_split="dev", few_shots_select="sequential", generation_size=5, - metric=[Metrics.loglikelihood_acc_single_token], + metric=[Metrics.loglikelihood_acc], stop_sequence=["\n"], ), ] diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 75af6cd5c..60c91cbb8 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -82,7 +82,7 @@ def __call__(self, sample_params: dict | None): if isinstance(self, MetricGrouping): if hasattr(self.sample_level_fn, "metric_names"): # this is mostly for the gpass@k metrics - self.metric_name = self.sample_level_fn.metric_names() + self.metric_name = self.sample_level_fn.metric_names else: self.metric_name = [metric + str(sample_params) for metric in self.metric_name] else: diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 6f848b48a..ffc8a36b5 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -8756,7 +8756,7 @@ few_shots_split=None, few_shots_select=None, generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.mcc_single_token], + metrics=[Metrics.loglikelihood_acc, Metrics.mcc], stop_sequence=["\n"], trust_dataset=True, version=0, diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 9679e6592..f1af834a9 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5dffe1e990e1e839322b74ff02f306ea468ad7602492f62f987cae1bb546b84 -size 48376580 +oid sha256:fc21a4e01203f073f3f3a3e3a5628d2e19801132b34615204964cefb4771daaa +size 10763318 From 5d85a6e8907744a9f1bc387c4fe563993a252802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 15:41:12 +0200 Subject: [PATCH 11/38] manual update of file --- tests/reference_scores/harness_metrics.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index f1af834a9..345f49f1f 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc21a4e01203f073f3f3a3e3a5628d2e19801132b34615204964cefb4771daaa -size 10763318 +oid sha256:27bc1a90b557ed1818f96fa205eee0ca3e102cddf2d1fd8263aeb75bc4153e5a +size 48376328 From 917fb791bee9b74f97222ac4100e9a0ff85a0d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 15:48:38 +0200 Subject: [PATCH 12/38] manual update of file --- tests/reference_scores/harness_metrics.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 345f49f1f..35fdaafa3 100644 --- 
a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27bc1a90b557ed1818f96fa205eee0ca3e102cddf2d1fd8263aeb75bc4153e5a -size 48376328 +oid sha256:a5365e9e5b7c503be35e0fcd3e31986ec79037220832c85e4615f950a7dc35c0 +size 48376329 From a367d73efc536ddd1cb776aa451e6a9d11f1f07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 16:04:26 +0200 Subject: [PATCH 13/38] fix mcc single token --- tests/reference_scores/harness_metrics.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 35fdaafa3..b541145a0 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5365e9e5b7c503be35e0fcd3e31986ec79037220832c85e4615f950a7dc35c0 -size 48376329 +oid sha256:e3acf20a4f30005f8a8650c2575b49522bc64f6daefac0f49c5e7c73cfd38f6c +size 48376316 From 404d00f8bc3f04dd9606a77211295d57c3c13571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 16:12:07 +0200 Subject: [PATCH 14/38] now metrics are em --- tests/reference_scores/harness_metrics.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index b541145a0..0861a9b78 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3acf20a4f30005f8a8650c2575b49522bc64f6daefac0f49c5e7c73cfd38f6c -size 48376316 +oid sha256:8ba58bea942b6284fa8b978020997493c0ab4fdf2c668efcf78b810dcd4239f9 +size 48375592 From f90575043b8ad18f0c6c8489af3cdcb939aff6cc Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 15:10:59 +0000 Subject: [PATCH 15/38] some fixs for tests --- docs/source/saving-and-reading-results.mdx | 10 +++++----- src/lighteval/metrics/harness_compatibility/drop.py | 2 +- src/lighteval/metrics/metrics.py | 6 +++--- src/lighteval/tasks/default_tasks.py | 4 +++- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx index 2a54aeaf4..01761f2f8 100644 --- a/docs/source/saving-and-reading-results.mdx +++ b/docs/source/saving-and-reading-results.mdx @@ -123,14 +123,14 @@ The detail file contains the following columns: }, "results": { "lighteval|gsm8k|0": { - "qem": 0.0, - "qem_stderr": 0.0, + "em": 0.0, + "em_stderr": 0.0, "maj@8": 0.0, "maj@8_stderr": 0.0 }, "all": { - "qem": 0.0, - "qem_stderr": 0.0, + "em": 0.0, + "em_stderr": 0.0, "maj@8": 0.0, "maj@8_stderr": 0.0 } @@ -146,7 +146,7 @@ The detail file contains the following columns: "hf_subset": "main", "metric": [ { - "metric_name": "qem", + "metric_name": "em", "higher_is_better": true, "category": "3", "use_case": "5", diff --git a/src/lighteval/metrics/harness_compatibility/drop.py b/src/lighteval/metrics/harness_compatibility/drop.py index f12828cbe..0a4db3421 100644 --- a/src/lighteval/metrics/harness_compatibility/drop.py +++ b/src/lighteval/metrics/harness_compatibility/drop.py @@ -171,4 +171,4 @@ def _normalize(answer: str): if gold_answer.strip(): max_em = max(max_em, exact_match) max_f1 = max(max_f1, f1_score) - return {"qem": max_em, "f1": max_f1} + return 
{"em": max_em, "f1": max_f1} diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index e2470d5b3..275985778 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -183,11 +183,11 @@ class Metrics(Enum): higher_is_better={"longest_common_prefix_length": True, "edit_distance": False, "edit_similarity": True}, ) drop = SampleLevelMetricGrouping( - metric_name=["qem", "f1"], + metric_name=["em", "f1"], sample_level_fn=drop_metrics, category=SamplingMethod.GENERATIVE, - corpus_level_fn={"qem": max, "f1": max}, - higher_is_better={"qem": True, "f1": True}, + corpus_level_fn={"em": max, "f1": max}, + higher_is_better={"em": True, "f1": True}, ) exact_match = SampleLevelMetric( metric_name="em", diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index ffc8a36b5..988880024 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -18357,7 +18357,9 @@ few_shots_split=None, few_shots_select=None, generation_size=20, - metrics=[Metrics.exact_match(sample_params={"normalize_pred": harness_triviaqa_normalizer})], + metrics=[ + Metrics.exact_match(sample_params={"strip_strings": True, "normalize_pred": harness_triviaqa_normalizer}) + ], stop_sequence=["\n", ".", ","], trust_dataset=True, version=0, From 82b3fe9cda5767dba9d7299d5c19a82ff60b688d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 17:15:16 +0200 Subject: [PATCH 16/38] rm trivia qa outdated --- tests/reference_scores/harness_metrics.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 0861a9b78..f5cf55df6 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ba58bea942b6284fa8b978020997493c0ab4fdf2c668efcf78b810dcd4239f9 -size 48375592 +oid sha256:cb7a1d7fdf76e3915daf2eb951077048ff47914e2ac9ff186fa175a216357976 +size 48360080 From 5c4f9ab67e53e4eb6ce734a0fc4bd53298211213 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 15:37:20 +0000 Subject: [PATCH 17/38] removed dumdum enum overwrite --- src/lighteval/metrics/metrics.py | 13 ++++--------- tests/test_unit_harness_metrics.py | 4 +--- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 275985778..300601c43 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -21,6 +21,8 @@ # SOFTWARE. 
+from copy import deepcopy + import numpy as np from aenum import Enum @@ -77,7 +79,6 @@ SamplingMethod, ) from lighteval.utils.language import Language -from lighteval.utils.utils import as_list class Metrics(Enum): @@ -495,11 +496,5 @@ def __str__(self): def __call__(self, **kwargs): # When parametrizing, we don't look at the Metrics enum, # but at a specific single metric (a value) - return self.value(kwargs) - - @staticmethod - def all_metrics(): - res = [] - for metric in Metrics: - res.extend(as_list(metric.value.metric_name)) - return res + # Be very careful to not change the default value of the enum + return deepcopy(self.value)(kwargs) diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py index 4cc2853ae..9ce2ed228 100644 --- a/tests/test_unit_harness_metrics.py +++ b/tests/test_unit_harness_metrics.py @@ -39,8 +39,6 @@ PATH_TO_HARNESS_METRICS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_metrics.json") -POSSIBLE_METRICS = Metrics.all_metrics() - def pytest_generate_tests(metafunc: pytest.Metafunc): """Initializes the main test setup. This function is automatically called by pytest and @@ -106,7 +104,7 @@ def test_model_prediction(prompt_inputs: tuple[str, str, list]): # noqa: C901 metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()} - metric_reference = {k: v for k, v in example.items() if k in POSSIBLE_METRICS} + metric_reference = {k: v for k, v in example.items() if k in results.keys()} error_msg += f"Prediction: {results}\n" error_msg += f"Reference: {metric_reference}\n" error_msg += f"Returned : {metric_result}" From c31a39ff730ac80418c8071a8170fea2dbb2ca25 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 15:49:04 +0000 Subject: [PATCH 18/38] fix test --- examples/custom_tasks_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py index e30669fb4..465ce7cd0 100644 --- a/examples/custom_tasks_tests.py +++ b/examples/custom_tasks_tests.py @@ -53,7 +53,7 @@ few_shots_split=None, few_shots_select=None, generation_size=2048, - metrics=[Metrics.gpqa_instruct_pass_at_k({"k": 1})], + metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})], stop_sequence=[], # no stop sequence, will use eos token trust_dataset=True, version=0, From 75419c1cbf8ccddba29829914eeba9646f5af2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 17:54:42 +0200 Subject: [PATCH 19/38] rm a space --- tests/reference_scores/harness_metrics.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index f5cf55df6..2a638f9ba 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb7a1d7fdf76e3915daf2eb951077048ff47914e2ac9ff186fa175a216357976 +oid sha256:73597098e72f50abbf6527f018ff2cfc7d45a8b8052a624b4170efef552da71a size 48360080 From 915943f0183e9cbeaf538580daf43beea830649d Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 19 Aug 2025 15:56:23 +0000 Subject: [PATCH 20/38] cleaner loop --- tests/test_unit_harness_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py index 9ce2ed228..6d1764593 100644 --- 
a/tests/test_unit_harness_metrics.py +++ b/tests/test_unit_harness_metrics.py @@ -104,7 +104,7 @@ def test_model_prediction(prompt_inputs: tuple[str, str, list]): # noqa: C901 metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()} - metric_reference = {k: v for k, v in example.items() if k in results.keys()} + metric_reference = {k: example[k] for k in results.keys()} error_msg += f"Prediction: {results}\n" error_msg += f"Reference: {metric_reference}\n" error_msg += f"Returned : {metric_result}" From d047766fd3da61a751e1dbd0acf7adc3506918d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 19 Aug 2025 18:04:16 +0200 Subject: [PATCH 21/38] test --- tests/reference_scores/harness_metrics.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 2a638f9ba..8f4be7807 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73597098e72f50abbf6527f018ff2cfc7d45a8b8052a624b4170efef552da71a +oid sha256:c2080305011a7ac8b0895ec1fbb26b45af4e3dced6272abf67156ebf57656f88 size 48360080 From 9e295109b23bd5aa4ec713e8774ca2f0a9984a11 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 09:17:12 +0000 Subject: [PATCH 22/38] better json encoder + a small naming fix --- src/lighteval/logging/evaluation_tracker.py | 8 +++++++- src/lighteval/metrics/metrics.py | 4 ++-- src/lighteval/metrics/metrics_sample.py | 10 +++++++++- src/lighteval/metrics/utils/metric_utils.py | 5 +++-- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 32a638d1e..9e588e58c 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -81,7 +81,13 @@ def default(self, o): return str(o) if isinstance(o, Enum): return o.name - return super().default(o) + if hasattr(o, "__str__"): + return str(o) + try: + return super().default(o) + except TypeError: + # For classes without json serialization + return type(o).__name__ class EvaluationTracker: diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 300601c43..6b8479da7 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -493,8 +493,8 @@ class Metrics(Enum): def __str__(self): return self.name.replace("_at_", "@") - def __call__(self, **kwargs): + def __call__(self, sample_params): # When parametrizing, we don't look at the Metrics enum, # but at a specific single metric (a value) # Be very careful to not change the default value of the enum - return deepcopy(self.value)(kwargs) + return deepcopy(self.value)(sample_params=sample_params) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 1026a21a1..413893624 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1303,13 +1303,21 @@ def __init__( thresholds (list): Thresholds to control successful attempts in k generate. 
""" super().__init__(kwargs) - self.k = as_list(k) + self._k = k self.n = n self.attribute_must_be_set = ["k"] self.thresholds = thresholds self.name = (f"{name_prefix}_" if name_prefix else "") + "g-pass@" + @property + def k(self): + return as_list(self._k) + + @k.setter + def k(self, new_val): + self._k = as_list(new_val) + def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: """Computes the metric over a list of golds and predictions for one single item with possibly many samples. It applies normalisation (if needed) to model prediction and gold, computes their per prediction score, diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 60c91cbb8..2d9642790 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -79,14 +79,15 @@ def __call__(self, sample_params: dict | None): # Once the parameters are updated, we need to adjust the # metric name to what will be returned + sample_params_name = "&".join(sample_params.keys()) if isinstance(self, MetricGrouping): if hasattr(self.sample_level_fn, "metric_names"): # this is mostly for the gpass@k metrics self.metric_name = self.sample_level_fn.metric_names else: - self.metric_name = [metric + str(sample_params) for metric in self.metric_name] + self.metric_name = [f"{metric}_with_{sample_params_name}" for metric in self.metric_name] else: - self.metric_name = self.metric_name + str(sample_params) + self.metric_name = f"{self.metric_name}_with_{sample_params_name}" return self From 054c6d56498d19d45050709c0f892865573119ee Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 09:17:30 +0000 Subject: [PATCH 23/38] new names --- .../SmolLM2-1.7B-Instruct-results-accelerate.json | 4 ++-- .../reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json index a55fd6f82..7c8c77d79 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json +++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d38c5cdb9dd354222ccd238df2675b0999181b663322dab612655aa12f9ef372 -size 49944 +oid sha256:2fbcbcf4031d545999b8e02afffa2537f642a1239664af16160e5fcd250a4ecc +size 50626 diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json index 7bc559c14..66ab85090 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json +++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be52fd994b9978b91eb057bb72ec6110e2e49016ca0f2b296ba5bf75ba056725 -size 49883 +oid sha256:d1302090702deaf018f21f1dc5ffd2a2a2b93e19b50aa459508146f130aa9ecf +size 50565 From cc305815b123cc17f034cdbfb0b0a1d65e7daa51 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 09:26:33 +0000 Subject: [PATCH 24/38] fix test --- tests/tasks/test_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py index c67ce1eae..cd05fe230 100644 --- a/tests/tasks/test_registry.py +++ b/tests/tasks/test_registry.py @@ -124,7 +124,7 @@ def 
test_cli_sampling_params_fail(): # creation of object should fail with pytest.raises(ValueError): - registry.get_tasks_configs("lighteval|math_500@|0|0") + registry.get_tasks_configs("lighteval|math_500@plop|0|0") def test_task_group_expansion_with_subset_expansion(): From 3dd79e2f66a78e6c282efa5519309288ba350061 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 09:44:20 +0000 Subject: [PATCH 25/38] up doc --- docs/source/metric-list.mdx | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx index e089fb8cd..06d3dd069 100644 --- a/docs/source/metric-list.mdx +++ b/docs/source/metric-list.mdx @@ -3,9 +3,7 @@ ## Automatic metrics for multiple-choice tasks These metrics use log-likelihood of the different possible targets. -- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct -- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct -- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored. +- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - we recommend using a normalization by length - `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - `mcc`: Matthew's correlation coefficient (a measure of agreement between statistical distributions). - `recall_at_k`: Fraction of instances where the choice with the k-st best logprob or better was correct @@ -24,17 +22,13 @@ These metrics use log-likelihood of prompt. ## Automatic metrics for generative tasks These metrics need the model to generate an output. They are therefore slower. - Base: - - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly. - - `exact_match`: Fraction of instances where the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both). - - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences. - - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold at the exception of the border whitespaces (= after a `strip` has been applied to both). - - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). - - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed. - - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both being normalized first. - - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation. + - `exact_match`: Fraction of instances where the prediction matches the gold. Several variations can be made through parametrization: + - normalization on string pre-comparision on whitespace, articles, capitalization, .... + - comparing the full string, or only subsets (prefix, suffix, ...) + - `maj_at_k`: Model majority vote. 
Samples k generations from the model and assumes the most frequent is the actual prediction. + - `f1_score`: Average F1 score in terms of word overlap between the model output and gold (normalisation optional). - `f1_score_macro`: Corpus level macro F1 score. - `f1_score_macro`: Corpus level micro F1 score. - - `maj_at_k`: Model majority vote. Samples k generations from the model and assumes the most frequent is the actual prediction. - Summarization: - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/). - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap. @@ -60,9 +54,7 @@ These metrics need the model to generate an output. They are therefore slower. - `edit_distance`: Average Levenshtein edit distance between model generation and reference, - `edit_similarity`: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model generation and reference. - Math: - - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed). - - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed). - - `maj_at_k`: Majority choice evaluation can be applied with a math specific normalizer (gsm8k, math, etc) + - Both `exact_match` and `maj_at_k` can be used to evaluate mathematics tasks with math specific normalization to remove and filter latex. ## LLM-as-Judge - `llm_judge_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. From a07cde11829660bbbfc4bb93f93eff5152ccb4d8 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 12:44:18 +0000 Subject: [PATCH 26/38] reorg --- community_tasks/arabic_evals.py | 2 +- src/lighteval/logging/info_loggers.py | 2 +- src/lighteval/metrics/utils/__init__.py | 21 ------------------- .../metrics/{ => utils}/judge_prompts.jsonl | 0 .../metrics/{ => utils}/llm_as_judge.py | 0 src/lighteval/metrics/utils/metric_utils.py | 14 ++++++++----- src/lighteval/metrics/{ => utils}/stderr.py | 0 7 files changed, 11 insertions(+), 28 deletions(-) delete mode 100644 src/lighteval/metrics/utils/__init__.py rename src/lighteval/metrics/{ => utils}/judge_prompts.jsonl (100%) rename src/lighteval/metrics/{ => utils}/llm_as_judge.py (100%) rename src/lighteval/metrics/{ => utils}/stderr.py (100%) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 68cb2a0f0..b239fbbbb 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -31,9 +31,9 @@ import re from typing import Any, Dict, List, Optional, Union -from lighteval.metrics.llm_as_judge import JudgeLM from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.metrics.utils.llm_as_judge import JudgeLM from lighteval.metrics.utils.metric_utils import Metric from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index da7a07c15..5d82d3c38 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -30,7 +30,7 @@ import git import xxhash -from lighteval.metrics.stderr import 
get_stderr_function +from lighteval.metrics.utils.stderr import get_stderr_function from lighteval.models.abstract_model import ModelConfig from lighteval.models.model_output import ModelResponse from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig diff --git a/src/lighteval/metrics/utils/__init__.py b/src/lighteval/metrics/utils/__init__.py deleted file mode 100644 index a732db8d0..000000000 --- a/src/lighteval/metrics/utils/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. diff --git a/src/lighteval/metrics/judge_prompts.jsonl b/src/lighteval/metrics/utils/judge_prompts.jsonl similarity index 100% rename from src/lighteval/metrics/judge_prompts.jsonl rename to src/lighteval/metrics/utils/judge_prompts.jsonl diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py similarity index 100% rename from src/lighteval/metrics/llm_as_judge.py rename to src/lighteval/metrics/utils/llm_as_judge.py diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 2d9642790..2b51185c6 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -23,6 +23,8 @@ from dataclasses import dataclass from typing import Callable +from lighteval.metrics.metrics_corpus import CorpusLevelComputation +from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.metrics.sample_preparator import Preparator from lighteval.tasks.requests import SamplingMethod @@ -32,8 +34,8 @@ class Metric: metric_name: str higher_is_better: bool category: SamplingMethod - sample_level_fn: Callable | Preparator | object - corpus_level_fn: Callable | object + sample_level_fn: SampleLevelComputation | Preparator + corpus_level_fn: Callable | CorpusLevelComputation batched_compute: bool = False @@ -43,12 +45,14 @@ def get_doc(self): def compute_sample( self, **kwargs ) -> dict: # result: Union[list[ModelResponse], ModelResponse], formatted_doc: Doc) -> dict: - if isinstance(self.sample_level_fn, Callable): - sample_level_fn = self.sample_level_fn + if isinstance(self.sample_level_fn, SampleLevelComputation): + sample_level_fn = self.sample_level_fn.compute elif isinstance(self.sample_level_fn, Preparator): sample_level_fn = self.sample_level_fn.prepare else: - sample_level_fn = self.sample_level_fn.compute + raise ValueError( + f"Incorrect type for 
{self.sample_level_fn}, should be a SampleLevelComputation or Preparator" + ) if isinstance(self, MetricGrouping): return sample_level_fn(**kwargs) # result, formatted_doc, diff --git a/src/lighteval/metrics/stderr.py b/src/lighteval/metrics/utils/stderr.py similarity index 100% rename from src/lighteval/metrics/stderr.py rename to src/lighteval/metrics/utils/stderr.py From 61506917f3d24fde034e115c49dd59eea0d906ef Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 12:45:34 +0000 Subject: [PATCH 27/38] enforce correct classes --- .../metrics/harness_compatibility/drop.py | 100 +++++++++--------- .../harness_compatibility/truthful_qa.py | 48 +++++---- src/lighteval/metrics/metrics.py | 16 +-- src/lighteval/metrics/metrics_corpus.py | 25 ++--- src/lighteval/metrics/metrics_sample.py | 64 ++++++----- 5 files changed, 133 insertions(+), 120 deletions(-) diff --git a/src/lighteval/metrics/harness_compatibility/drop.py b/src/lighteval/metrics/harness_compatibility/drop.py index 0a4db3421..382d9ad08 100644 --- a/src/lighteval/metrics/harness_compatibility/drop.py +++ b/src/lighteval/metrics/harness_compatibility/drop.py @@ -27,28 +27,40 @@ import numpy as np from scipy.optimize import linear_sum_assignment +from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc -def drop_metrics(doc: Doc, model_response: ModelResponse): # noqa: C901 - """F1 score from bag of words: comes from Harness Drop. DROP offers two metrics, - a quasi exact match and a numeracy-focused F1 score. Quasi in the sense that it - does some normalizations before matching and numeracy-focused in the sense that - if there's number mismatch between the target and prediction F1 score is set to 0. - F1 score is computed using the intersection of target and prediction's BoW - representations with the additional spice that if the answer and/or prediction is - comprised of multiple spans, a greedy matching is done between the two sets of spans - (based on the very BoW overlap) and the average over F1 of pairs is returned. - DROP also accepts multiple answers in which case, the maximum of F1/ Exact Match - between prediction and the different answers is taken. +class DropMetrics(SampleLevelComputation): + def compute(self, doc: Doc, model_response: ModelResponse): # noqa: C901 + """F1 score from bag of words: comes from Harness Drop. DROP offers two metrics, + a quasi exact match and a numeracy-focused F1 score. Quasi in the sense that it + does some normalizations before matching and numeracy-focused in the sense that + if there's number mismatch between the target and prediction F1 score is set to 0. + F1 score is computed using the intersection of target and prediction's BoW + representations with the additional spice that if the answer and/or prediction is + comprised of multiple spans, a greedy matching is done between the two sets of spans + (based on the very BoW overlap) and the average over F1 of pairs is returned. + DROP also accepts multiple answers in which case, the maximum of F1/ Exact Match + between prediction and the different answers is taken. - For more information, please refer to the section 5 of the DROP paper (https://aclanthology.org/N19-1246/). + For more information, please refer to the section 5 of the DROP paper (https://aclanthology.org/N19-1246/). 
- Todo: this code is really hard to follow, simplify when possible - """ - - def _answer_to_bags(answer: List[str]) -> Tuple[List[str], List[Set[str]]]: + Todo: this code is really hard to follow, simplify when possible + """ + max_em = 0 + max_f1 = 0 + for gold_answer in doc.specific["golds_no_preprocessing"]: + exact_match, f1_score = self._get_metrics(model_response.text, gold_answer) + if isinstance(gold_answer, list): + gold_answer = gold_answer[0] + if gold_answer.strip(): + max_em = max(max_em, exact_match) + max_f1 = max(max_f1, f1_score) + return {"em": max_em, "f1": max_f1} + + def _answer_to_bags(self, answer: List[str]) -> Tuple[List[str], List[Set[str]]]: if isinstance(answer, (list, tuple)): raw_spans = answer else: @@ -56,12 +68,12 @@ def _answer_to_bags(answer: List[str]) -> Tuple[List[str], List[Set[str]]]: normalized_spans = [] token_bags = [] for raw_span in raw_spans: - normalized_span = _normalize(raw_span) + normalized_span = self._normalize(raw_span) normalized_spans.append(normalized_span) token_bags.append(set(normalized_span.split())) return normalized_spans, token_bags - def _get_metrics(predicted: List[str], gold: List[str]): + def _get_metrics(self, predicted: List[str], gold: List[str]): """ Takes a predicted answer and a gold answer (that are both either a string or a list of strings), and returns exact match and the DROP F1 metric for the prediction. If you are @@ -69,8 +81,8 @@ def _get_metrics(predicted: List[str], gold: List[str]): validation, or while training), this is the function you want to call, after using :func:`answer_json_to_strings` when reading the gold answer from the released data file. """ - pred_normalized_spans, pred_bags = _answer_to_bags(predicted) - gold_normalized_spans, gold_bags = _answer_to_bags(gold) + pred_normalized_spans, pred_bags = self._answer_to_bags(predicted) + gold_normalized_spans, gold_bags = self._answer_to_bags(gold) if set(pred_normalized_spans) == set(gold_normalized_spans) and len(gold_normalized_spans) == len( gold_normalized_spans @@ -79,32 +91,32 @@ def _get_metrics(predicted: List[str], gold: List[str]): else: exact_match = 0.0 - f1_per_bag = _align_bags(pred_bags, gold_bags) + f1_per_bag = self._align_bags(pred_bags, gold_bags) f1 = np.mean(f1_per_bag) f1 = round(f1, 2) return exact_match, f1 - def _is_number(text): + def _is_number(self, text): try: float(text) return True except ValueError: return False - def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]): + def _match_numbers_if_present(self, gold_bag: Set[str], predicted_bag: Set[str]): gold_numbers = set() predicted_numbers = set() for word in gold_bag: - if _is_number(word): + if self._is_number(word): gold_numbers.add(word) for word in predicted_bag: - if _is_number(word): + if self._is_number(word): predicted_numbers.add(word) if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): return True return False - def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: + def _align_bags(self, predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: """ Takes gold and predicted answer sets and first finds the optimal 1-1 alignment between them and gets maximum metric values over all the answers. 
@@ -112,8 +124,8 @@ def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: scores = np.zeros([len(gold), len(predicted)]) for gold_index, gold_item in enumerate(gold): for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) + if self._match_numbers_if_present(gold_item, pred_item): + scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item) row_ind, col_ind = linear_sum_assignment(-scores) max_scores = np.zeros([max(len(gold), len(predicted))]) @@ -121,7 +133,7 @@ def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array: max_scores[row] = max(max_scores[row], scores[row, column]) return max_scores - def _compute_f1(predicted_bag, gold_bag): + def _compute_f1(self, predicted_bag, gold_bag): intersection = len(gold_bag.intersection(predicted_bag)) if not predicted_bag: precision = 1.0 @@ -135,40 +147,30 @@ def _compute_f1(predicted_bag, gold_bag): return 0 return (2 * precision * recall) / (precision + recall) - def _remove_articles(text): + def _remove_articles(self, text): return re.compile(r"\b(a|an|the)\b", re.UNICODE).sub(" ", text) - def _white_space_fix(text): + def _white_space_fix(self, text): return " ".join(text.split()) - def _remove_punc(text): + def _remove_punc(self, text): exclude = set(string.punctuation) - if not _is_number(text): + if not self._is_number(text): return "".join(ch for ch in text if ch not in exclude) else: return text - def _fix_number(text): - return str(float(text)) if _is_number(text) else text + def _fix_number(self, text): + return str(float(text)) if self._is_number(text) else text - def _tokenize(text): + def _tokenize(self, text): return re.split(" |-", text) - def _normalize(answer: str): + def _normalize(self, answer: str): tokens = [ - _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) for token in _tokenize(answer) + self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower())))) + for token in self._tokenize(answer) ] tokens = [token for token in tokens if token.strip()] normalized = " ".join(tokens).strip() return normalized - - max_em = 0 - max_f1 = 0 - for gold_answer in doc.specific["golds_no_preprocessing"]: - exact_match, f1_score = _get_metrics(model_response.text, gold_answer) - if isinstance(gold_answer, list): - gold_answer = gold_answer[0] - if gold_answer.strip(): - max_em = max(max_em, exact_match) - max_f1 = max(max_f1, f1_score) - return {"em": max_em, "f1": max_f1} diff --git a/src/lighteval/metrics/harness_compatibility/truthful_qa.py b/src/lighteval/metrics/harness_compatibility/truthful_qa.py index 771077222..d8cbc3662 100644 --- a/src/lighteval/metrics/harness_compatibility/truthful_qa.py +++ b/src/lighteval/metrics/harness_compatibility/truthful_qa.py @@ -22,39 +22,41 @@ import numpy as np +from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list # Comes from the harness -def truthfulqa_mc_metrics(doc: Doc, model_response: ModelResponse): - def mc1(lls): +class TruthfulqaMCMetrics(SampleLevelComputation): + def compute(self, doc: Doc, model_response: ModelResponse): + gold_ixs = as_list(doc.gold_index) + choices_logprob = model_response.logprobs + + # The harness assumes that all items are gold before the last one, but that is not always the case + 
# For gold ix 5, 6, 8, the harness will look at the first "gap" (7) and consider that the following + # items are not gold (even though here, 8 is gold). Example at item 371 of the dataset. + # This is broken and will have to be fixed once we OSS this, by actually separating + # gold and not gold items for mc2 computations + len_mc1 = doc.specific["len_mc1"] + last_harness_gold = gold_ixs[1] - 1 # fake value to init the loop + for g in gold_ixs[1:]: # we ignore the first item, which is the gold for mc1 + if last_harness_gold == g - 1: + last_harness_gold = g + else: + break + # TODO: This completely ignores any normalization, but keeping it as is + mc2_last_gold_ix = last_harness_gold - len_mc1 + 1 + mc1_lls, mc2_lls = choices_logprob[:len_mc1], choices_logprob[len_mc1:] + return {"truthfulqa_mc1": self.mc1(mc1_lls), "truthfulqa_mc2": self.mc2(mc2_lls, mc2_last_gold_ix)} + + def mc1(self, lls): # The gold answers in `mc1_targets` are always first (index = `0`). return np.argmax(lls) == 0 - def mc2(lls, split_idx): + def mc2(self, lls, split_idx): ll_true, ll_false = lls[:split_idx], lls[split_idx:] p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) p_true = p_true / (sum(p_true) + sum(p_false)) return sum(p_true) - - gold_ixs = as_list(doc.gold_index) - choices_logprob = model_response.logprobs - - # The harness assumes that all items are gold before the last one, but that is not always the case - # For gold ix 5, 6, 8, the harness will look at the first "gap" (7) and consider that the following - # items are not gold (even though here, 8 is gold). Example at item 371 of the dataset. - # This is broken and will have to be fixed once we OSS this, by actually separating - # gold and not gold items for mc2 computations - len_mc1 = doc.specific["len_mc1"] - last_harness_gold = gold_ixs[1] - 1 # fake value to init the loop - for g in gold_ixs[1:]: # we ignore the first item, which is the gold for mc1 - if last_harness_gold == g - 1: - last_harness_gold = g - else: - break - # TODO: This completely ignores any normalization, but keeping it as is - mc2_last_gold_ix = last_harness_gold - len_mc1 + 1 - mc1_lls, mc2_lls = choices_logprob[:len_mc1], choices_logprob[len_mc1:] - return {"truthfulqa_mc1": mc1(mc1_lls), "truthfulqa_mc2": mc2(mc2_lls, mc2_last_gold_ix)} diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 6b8479da7..651a023bf 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -32,19 +32,20 @@ IndicesExtractionConfig, LatexExtractionConfig, ) -from lighteval.metrics.harness_compatibility.drop import drop_metrics -from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics +from lighteval.metrics.harness_compatibility.drop import DropMetrics +from lighteval.metrics.harness_compatibility.truthful_qa import TruthfulqaMCMetrics from lighteval.metrics.metrics_corpus import ( CorpusLevelF1Score, CorpusLevelPerplexityMetric, CorpusLevelTranslationMetric, - matthews_corrcoef, + MatthewsCorrCoef, ) from lighteval.metrics.metrics_sample import ( BLEU, BLEURT, MRR, ROUGE, + AccGoldLikelihood, AvgAtK, BertScore, ExactMatches, @@ -58,7 +59,6 @@ PassAtK, Recall, StringDistance, - acc_golds_likelihood, ) from lighteval.metrics.normalizations import ( bigbench_normalizer, @@ -84,7 +84,7 @@ class Metrics(Enum): acc_golds_likelihood = SampleLevelMetric( # todo: we need a better name for this! 
metric_name="acc", - sample_level_fn=acc_golds_likelihood, + sample_level_fn=AccGoldLikelihood(), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, @@ -185,7 +185,7 @@ class Metrics(Enum): ) drop = SampleLevelMetricGrouping( metric_name=["em", "f1"], - sample_level_fn=drop_metrics, + sample_level_fn=DropMetrics(), category=SamplingMethod.GENERATIVE, corpus_level_fn={"em": max, "f1": max}, higher_is_better={"em": True, "f1": True}, @@ -323,7 +323,7 @@ class Metrics(Enum): metric_name="mcc", sample_level_fn=LoglikelihoodPreparator(), category=SamplingMethod.LOGPROBS, - corpus_level_fn=matthews_corrcoef, + corpus_level_fn=MatthewsCorrCoef(), higher_is_better=True, ) mrr = SampleLevelMetric( @@ -457,7 +457,7 @@ class Metrics(Enum): ) truthfulqa_mc_metrics = SampleLevelMetricGrouping( metric_name=["truthfulqa_mc1", "truthfulqa_mc2"], - sample_level_fn=truthfulqa_mc_metrics, + sample_level_fn=TruthfulqaMCMetrics(), category=SamplingMethod.LOGPROBS, corpus_level_fn={"truthfulqa_mc1": np.mean, "truthfulqa_mc2": np.mean}, higher_is_better={"truthfulqa_mc1": True, "truthfulqa_mc2": True}, diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 0ac99f764..710e073dd 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -50,18 +50,19 @@ def compute_corpus(self): # General aggregations -def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float: - """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). - - Args: - items (list[dict]): List of GenerativeCorpusMetricInput - - Returns: - float: Score - """ - golds = [i.golds for i in items] - preds = [i.preds for i in items] - return sklearn.metrics.matthews_corrcoef(golds, preds) +class MatthewsCorrCoef(CorpusLevelComputation): + def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float: + """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). 
+ + Args: + items (list[dict]): List of GenerativeCorpusMetricInput + + Returns: + float: Score + """ + golds = [i.golds for i in items] + preds = [i.preds for i in items] + return sklearn.metrics.matthews_corrcoef(golds, preds) class CorpusLevelF1Score(CorpusLevelComputation): diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 413893624..8e796fc9b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -43,7 +43,6 @@ from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.imports.data_stats_metric import DataStatsMetric from lighteval.metrics.imports.summac import SummaCZS -from lighteval.metrics.llm_as_judge import JudgeLM from lighteval.metrics.normalizations import ( LogProbNormalization, LogProbTokenNorm, @@ -52,6 +51,7 @@ remove_braces_and_strip, ) from lighteval.metrics.utils.judge_utils import get_judge_prompt_simpleqa, process_judge_response_simpleqa +from lighteval.metrics.utils.llm_as_judge import JudgeLM from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list, safe_divide @@ -60,7 +60,12 @@ logger = logging.getLogger(__name__) -class ExactMatches: +class SampleLevelComputation: + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): + raise NotImplementedError + + +class ExactMatches(SampleLevelComputation): def __init__( self, aggregation_function: Callable[[list[float]], float] = max, @@ -148,7 +153,7 @@ def compute_one_item( return 1 if gold == pred else 0 -class F1_score: +class F1_score(SampleLevelComputation): def __init__( self, aggregation_function: Callable[[list[float]], float] = max, @@ -220,7 +225,7 @@ def compute_one_item(self, gold: str, pred: str) -> float: return ret -class LoglikelihoodAcc: +class LoglikelihoodAcc(SampleLevelComputation): def __init__(self, logprob_normalization: LogProbNormalization | None = None): """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices is actually in the gold ones. @@ -277,7 +282,7 @@ def compute( return int(best_choice in gold_ixs) -class NormalizedMultiChoiceProbability: +class NormalizedMultiChoiceProbability(SampleLevelComputation): def __init__( self, log_prob_normalization: LogProbNormalization | None = None, @@ -340,7 +345,7 @@ def compute( return gold_idx_agg_prob -class Probability: +class Probability(SampleLevelComputation): def __init__( self, normalization: LogProbTokenNorm | None = None, @@ -393,7 +398,7 @@ def compute( return self.aggregation_function(probs) -class Recall: +class Recall(SampleLevelComputation): def __init__(self, k: int) -> None: """Recall metric class. It checks if the top `k` best choices include one of the golds or not. @@ -422,7 +427,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> int: return int(any(ix in gold_ixs for ix in np.array(choices_logprobs).argsort()[::-1][: self.recall_depth])) -class MRR: +class MRR(SampleLevelComputation): def __init__(self, length_normalization: bool = False): """A mean reciprocal rank class. @@ -456,19 +461,20 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: return 1.0 / (min(ranked_choices) + 1) -def acc_golds_likelihood(doc, model_response, **kwargs) -> int: - """Tests if at least one of predicted gold targets' argmax of logits equals the gold. 
+class AccGoldLikelihood(SampleLevelComputation): + def compute(self, doc, model_response, **kwargs) -> int: + """Tests if at least one of predicted gold targets' argmax of logits equals the gold. - Args: - argmax_logits_eq_gold_list (list[int]): List of scores 1/0 indicating whether the argmax of logits equals the gold + Args: + argmax_logits_eq_gold_list (list[int]): List of scores 1/0 indicating whether the argmax of logits equals the gold - Returns: - int: 1 if at least one of the possible golds has argmax of logits == gold, 0 otherwise - """ - return int(any(model_response.argmax_logits_eq_gold)) + Returns: + int: 1 if at least one of the possible golds has argmax of logits == gold, 0 otherwise + """ + return int(any(model_response.argmax_logits_eq_gold)) -class ROUGE: +class ROUGE(SampleLevelComputation): ALLOWED_ROUGE_METHODS = ["rouge1", "rouge2", "rougeL", "rougeLsum"] def __init__( @@ -579,7 +585,7 @@ def _rouge_score_with_bootsrap(self, golds: list[str], predictions: list[str]): return {method: result[method].mid.fmeasure * 100 for method in self.methods} -class BertScore: +class BertScore(SampleLevelComputation): def __init__( self, normalize_gold: Callable | None = None, @@ -641,7 +647,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()} -class Extractiveness: +class Extractiveness(SampleLevelComputation): def __init__( self, normalize_input: callable = remove_braces, @@ -695,7 +701,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str } -class Faithfulness: +class Faithfulness(SampleLevelComputation): def __init__( self, normalize_input: Callable = remove_braces, @@ -744,7 +750,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str return self.summac.score_one(inp, prediction)["score"] -class BLEURT: +class BLEURT(SampleLevelComputation): def __init__(self): """Creates a BLEURT scorer using a light bleurt-tiny-512 model. For more complex use cases, could also be Elron/bleurt-base-128 @@ -783,7 +789,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: return scores.item() -class BLEU: +class BLEU(SampleLevelComputation): def __init__(self, n_gram: int): """BLEU scorer class. Relies on `nltk`'s sentencebleu for scoring. TODO: Will have to move this to sacrebleu. 
@@ -821,7 +827,7 @@ def _bleu_score(self, gold: list[str], pred: str): return sentence_bleu([word_tokenize(g) for g in gold], word_tokenize(pred), weights=weights) -class StringDistance: +class StringDistance(SampleLevelComputation): def __init__( self, metric_types: list[str] | str, @@ -912,7 +918,7 @@ def edit_similarity(self, s1, s2): return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0 -class JudgeLLM: +class JudgeLLM(SampleLevelComputation): available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4", "gpt-4o-2024-08-06"] def __init__( @@ -1077,6 +1083,8 @@ def compute(self, model_responses: list[ModelResponse], docs: list[Doc], **kwarg class SamplingMetric: + """Handles normalization for sampling based metrics""" + def __init__( self, normalize: Callable | str | None = None, @@ -1136,7 +1144,7 @@ def name_metrics(self) -> str | list[str]: raise NotImplementedError -class AvgAtK(SamplingMetric): +class AvgAtK(SamplingMetric, SampleLevelComputation): def __init__(self, k: int | None = None, **kwargs): """Sample score averages all the individual k predictions scores. @@ -1176,7 +1184,7 @@ def num_samples(self): return self.k -class MajAtK(SamplingMetric): +class MajAtK(SamplingMetric, SampleLevelComputation): def __init__(self, k: int = None, **kwargs): """An exact match class.""" super().__init__(kwargs) @@ -1221,7 +1229,7 @@ def num_samples(self): return self.k -class PassAtK(SamplingMetric): +class PassAtK(SamplingMetric, SampleLevelComputation): def __init__(self, k: int | None = None, n: int | None = None, **kwargs): """Computing pass at k @@ -1286,7 +1294,7 @@ def num_samples(self): return self.n if self.n is not None else self.k -class GPassAtK(SamplingMetric): +class GPassAtK(SamplingMetric, SampleLevelComputation): def __init__( self, k: Union[int, list[int]] | None = None, From 5805b304e7c1b03a0d44cb5459ae5ec77111fe46 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 12:50:30 +0000 Subject: [PATCH 28/38] fix --- docs/source/package_reference/metrics.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/package_reference/metrics.mdx b/docs/source/package_reference/metrics.mdx index 57c656966..70e9f5502 100644 --- a/docs/source/package_reference/metrics.mdx +++ b/docs/source/package_reference/metrics.mdx @@ -24,8 +24,8 @@ [[autodoc]] metrics.metrics_corpus.CorpusLevelPerplexityMetric ### CorpusLevelTranslationMetric [[autodoc]] metrics.metrics_corpus.CorpusLevelTranslationMetric -### matthews_corrcoef -[[autodoc]] metrics.metrics_corpus.matthews_corrcoef +### MatthewsCorrCoef +[[autodoc]] metrics.metrics_corpus.MatthewsCorrCoef ## Sample Metrics ### ExactMatches From ac5e0424cbf8a98c2f354dec1e3b7e2710f764a4 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 12:55:22 +0000 Subject: [PATCH 29/38] forgot to update extended tasks --- src/lighteval/metrics/dynamic_metrics.py | 6 ++---- src/lighteval/tasks/extended/hle/main.py | 4 ++-- src/lighteval/tasks/extended/mix_eval/main.py | 8 ++++---- src/lighteval/tasks/extended/mt_bench/main.py | 2 +- src/lighteval/tasks/extended/tiny_benchmarks/main.py | 9 +++++---- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index c9b8d73bc..d4e58ea3a 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -101,9 +101,7 @@ def __init__( """ super().__init__( 
metric_name="prob" + (f"_{normalization.name}" if normalization else ""), - sample_level_fn=Probability( - normalization=normalization, aggregation_function=aggregation_function - ).compute, + sample_level_fn=Probability(normalization=normalization, aggregation_function=aggregation_function), category=SamplingMethod.LOGPROBS, corpus_level_fn=np.mean, higher_is_better=True, @@ -214,7 +212,7 @@ def __init__( """ super().__init__( metric_name="extractive_match", - sample_level_fn=self.compute, + sample_level_fn=self, category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, higher_is_better=True, diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py index 2c36607da..7e17b2d5a 100644 --- a/src/lighteval/tasks/extended/hle/main.py +++ b/src/lighteval/tasks/extended/hle/main.py @@ -206,8 +206,8 @@ def hle_text_only(line, task_name: str = None): metric_name=["accuracy", "confidence_half_width", "calibration_error"], higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True), category=SamplingMethod.GENERATIVE, - sample_level_fn=JudgeLLMHLE().compute, - corpus_level_fn=JudgeLLMHLE().compute_corpus, + sample_level_fn=JudgeLLMHLE(), + corpus_level_fn=JudgeLLMHLE(), ) extend_enum(Metrics, "hle_metrics", hle_metrics) diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/extended/mix_eval/main.py index 0e108f90c..2d9b7569a 100644 --- a/src/lighteval/tasks/extended/mix_eval/main.py +++ b/src/lighteval/tasks/extended/mix_eval/main.py @@ -111,7 +111,7 @@ def process_judge_response_freeform_gpt(x): process_judge_response=process_judge_response, judge_backend="vllm", short_judge_name="flow", - ).compute, + ), corpus_level_fn={ "judge_score_flow": np.mean, }, @@ -127,7 +127,7 @@ def process_judge_response_freeform_gpt(x): process_judge_response=process_judge_response_multichoice_gpt, judge_backend="openai", short_judge_name="gpt-3.5", - ).compute, + ), corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, @@ -148,7 +148,7 @@ def mean_dv_5(x): process_judge_response=process_judge_response, judge_backend="vllm", short_judge_name="flow", - ).compute, + ), corpus_level_fn={ "judge_score_flow": mean_dv_5, }, @@ -164,7 +164,7 @@ def mean_dv_5(x): process_judge_response=process_judge_response_freeform_gpt, judge_backend="openai", short_judge_name="gpt-3.5", - ).compute, + ), corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 1756fb212..e32194747 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -70,7 +70,7 @@ def flow_judge_mt_bench_prompt(question, answer, options, gold): template=flow_judge_mt_bench_prompt, process_judge_response=process_judge_response, judge_backend="vllm", - ).compute, + ), corpus_level_fn={ "judge_score_turn_1": np.mean, "judge_score_turn_2": np.mean, diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index d195bc89b..0aa1ca678 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -38,7 +38,8 @@ import lighteval.tasks.default_prompts as prompt from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics -from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc +from lighteval.metrics.metrics_corpus import CorpusLevelComputation +from 
lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation from lighteval.metrics.normalizations import gsm8k_normalizer from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import SamplingMethod @@ -71,7 +72,7 @@ def neg_log_like(x): # Evaluation function -class TinyCorpusAggregator: +class TinyCorpusAggregator(SampleLevelComputation, CorpusLevelComputation): LEADEBRBOARD_SCENARIOS = ["truthfulqa", "gsm8k", "winogrande", "arc", "hellaswag"] BENCHS = ["lb", "mmlu"] METRICS = ["irt", "pirt", "gpirt"] @@ -111,7 +112,7 @@ def compute(self, **args): res = LoglikelihoodAcc().compute(**args) return dict.fromkeys(self.METRICS, res) - def aggregate(self, y_input): + def compute_corpus(self, y_input): if len(y_input) == self.num_samples and self.estimates is not None: return self.estimates[self.task] @@ -276,7 +277,7 @@ def aggregate(self, y_input): CorpusLevelMetricGrouping( metric_name=TinyCorpusAggregator.METRICS, higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), - sample_level_fn=TinyCorpusAggregator(name).compute, + sample_level_fn=TinyCorpusAggregator(name), category=category, corpus_level_fn=TinyCorpusAggregator(name).aggregate, ), From 2cde9016774415f9e92f2c46b967bb3905ffd559 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 13:09:51 +0000 Subject: [PATCH 30/38] fix multilingual again --- .../custom_yourbench_task_mcq.py | 10 +--- src/lighteval/metrics/dynamic_metrics.py | 10 +--- src/lighteval/metrics/metrics.py | 52 ++++++++++++------- .../tasks/extended/olympiade_bench/main.py | 21 +++++--- tests/metrics/test_extractive_match.py | 4 +- 5 files changed, 51 insertions(+), 46 deletions(-) diff --git a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py index 2697380ad..010928550 100644 --- a/examples/custom_tasks_templates/custom_yourbench_task_mcq.py +++ b/examples/custom_tasks_templates/custom_yourbench_task_mcq.py @@ -25,12 +25,9 @@ from aenum import extend_enum -from lighteval.metrics.dynamic_metrics import DynamicMultilingualExtractiveMatch from lighteval.metrics.metrics import Metrics -from lighteval.metrics.utils.extractive_match_utils import IndicesExtractionConfig from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc -from lighteval.utils.language import Language logger = logging.getLogger(__name__) @@ -74,12 +71,7 @@ def yourbench_prompt(line, task_name: str = ""): ) -yourbench_metrics = DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, -) +yourbench_metrics = Metrics.gpqa_instruct_metric extend_enum(Metrics, "yourbench_metrics", yourbench_metrics) diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index d4e58ea3a..35eef6a61 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -163,7 +163,7 @@ def __init__( ) -class DynamicMultilingualExtractiveMatch(SampleLevelMetric): +class MultilingualExtractiveMatchMetric(SampleLevelMetric): def __init__( self, language: Language = Language.ENGLISH, @@ -210,14 +210,6 @@ def __init__( A sample level metric that extracts and compares mathematical expressions. 
""" - super().__init__( - metric_name="extractive_match", - sample_level_fn=self, - category=SamplingMethod.GENERATIVE, - corpus_level_fn=np.mean, - higher_is_better=True, - ) - self.language = language self.gold_extraction_target = gold_extraction_target self.pred_extraction_target = pred_extraction_target diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 651a023bf..50e1aabcf 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -27,10 +27,10 @@ from aenum import Enum from lighteval.metrics.dynamic_metrics import ( - DynamicMultilingualExtractiveMatch, ExprExtractionConfig, IndicesExtractionConfig, LatexExtractionConfig, + MultilingualExtractiveMatchMetric, ) from lighteval.metrics.harness_compatibility.drop import DropMetrics from lighteval.metrics.harness_compatibility.truthful_qa import TruthfulqaMCMetrics @@ -99,7 +99,7 @@ class Metrics(Enum): avg_at_k_math = SampleLevelMetric( metric_name="avg@k", sample_level_fn=AvgAtK( - sample_scoring_function=DynamicMultilingualExtractiveMatch( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], @@ -197,14 +197,20 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - expr_gold_metric = DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(ExprExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, + expr_gold_metric = SampleLevelMetric( + metric_name="extractive_match", + sample_level_fn=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + fallback_mode="first_match", + precision=5, + gold_extraction_target=(ExprExtractionConfig(),), + # Match boxed first before trying other regexes + pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + aggregation_function=max, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, ) extractiveness = SampleLevelMetricGrouping( metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], @@ -265,7 +271,7 @@ class Metrics(Enum): sample_level_fn=GPassAtK( name_prefix="math", strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, fallback_mode="first_match", precision=5, @@ -284,7 +290,7 @@ class Metrics(Enum): sample_level_fn=GPassAtK( name_prefix="latex", strip_strings=True, - sample_scoring_function=DynamicMultilingualExtractiveMatch( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, fallback_mode="first_match", precision=5, @@ -352,7 +358,7 @@ class Metrics(Enum): sample_level_fn=PassAtK( strip_strings=True, # Extracting mathematical expressions and latex expressions - sample_scoring_function=DynamicMultilingualExtractiveMatch( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], @@ -366,7 +372,7 @@ class Metrics(Enum): pass_at_k_letters = SampleLevelMetric( 
metric_name="pass@k", sample_level_fn=PassAtK( - sample_scoring_function=DynamicMultilingualExtractiveMatch( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], @@ -469,16 +475,22 @@ class Metrics(Enum): corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity"), higher_is_better=False, ) - gpqa_instruct_metric = DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=6, + gpqa_instruct_metric = SampleLevelMetric( + metric_name="extractive_match", + sample_level_fn=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, ) gpqa_instruct_pass_at_k = SampleLevelMetric( metric_name="gpqa_pass@k", sample_level_fn=PassAtK( - sample_scoring_function=DynamicMultilingualExtractiveMatch( + sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/extended/olympiade_bench/main.py index 369911969..d9fe0d2bc 100644 --- a/src/lighteval/tasks/extended/olympiade_bench/main.py +++ b/src/lighteval/tasks/extended/olympiade_bench/main.py @@ -21,11 +21,14 @@ # SOFTWARE. 
+import numpy as np + from lighteval.metrics.dynamic_metrics import ( - DynamicMultilingualExtractiveMatch, ExprExtractionConfig, LatexExtractionConfig, + MultilingualExtractiveMatchMetric, ) +from lighteval.metrics.metrics import SampleLevelMetric, SamplingMethod from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc from lighteval.utils.language import Language @@ -200,11 +203,17 @@ def olympiad_bench_prompt(line, task_name: str = None): extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()] -metric = DynamicMultilingualExtractiveMatch( - language=Language.ENGLISH, - gold_extraction_target=extraction_targets, - pred_extraction_target=extraction_targets, - precision=6, +metric = SampleLevelMetric( + metric_name="extractive_match", + sample_level_fn=MultilingualExtractiveMatchMetric( + language=Language.ENGLISH, + gold_extraction_target=extraction_targets, + pred_extraction_target=extraction_targets, + precision=6, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, ) task_configs = [] diff --git a/tests/metrics/test_extractive_match.py b/tests/metrics/test_extractive_match.py index b7fc65ce9..7504fcceb 100644 --- a/tests/metrics/test_extractive_match.py +++ b/tests/metrics/test_extractive_match.py @@ -24,10 +24,10 @@ import sympy from lighteval.metrics.dynamic_metrics import ( - DynamicMultilingualExtractiveMatch, ExprExtractionConfig, IndicesExtractionConfig, LatexExtractionConfig, + MultilingualExtractiveMatchMetric, ) from lighteval.metrics.utils.math_comparison import sympy_expr_eq from lighteval.models.model_output import ModelResponse @@ -66,7 +66,7 @@ def compare_strings( model_response = ModelResponse(text=[pred]) doc = Doc(choices=[gold, "", "", ""], query="", gold_index=0) - return DynamicMultilingualExtractiveMatch( + return MultilingualExtractiveMatchMetric( language=language, gold_extraction_target=extraction_targets, pred_extraction_target=extraction_targets, From e8274c97dc5ececbdd0a79cc3ebd61febf697dc6 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 13:25:02 +0000 Subject: [PATCH 31/38] updated --- src/lighteval/metrics/dynamic_metrics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index 35eef6a61..9ced582c7 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -40,13 +40,12 @@ from lighteval.metrics.utils.extractive_match_utils import ( # noqa: F401 ExprExtractionConfig, ExtractionTarget, - IndicesExtractionConfig, LatexExtractionConfig, extract_target_from_pred, get_extraction_regexes, ) from lighteval.metrics.utils.math_comparison import compare_gold_target -from lighteval.metrics.utils.metric_utils import SampleLevelMetric +from lighteval.metrics.utils.metric_utils import SampleLevelComputation, SampleLevelMetric from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc, SamplingMethod from lighteval.utils.language import Language @@ -163,7 +162,7 @@ def __init__( ) -class MultilingualExtractiveMatchMetric(SampleLevelMetric): +class MultilingualExtractiveMatchMetric(SampleLevelComputation): def __init__( self, language: Language = Language.ENGLISH, From e043030e12a54071904d8d7b41d08172ddf218a0 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 13:50:06 +0000 Subject: [PATCH 32/38] fix --- 
examples/custom_tasks_templates/custom_yourbench_task.py | 2 +- src/lighteval/metrics/metrics.py | 8 +++++--- src/lighteval/tasks/extended/tiny_benchmarks/main.py | 2 +- tests/metrics/test_extractive_match.py | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/custom_tasks_templates/custom_yourbench_task.py b/examples/custom_tasks_templates/custom_yourbench_task.py index c223ea378..d2246a1d0 100644 --- a/examples/custom_tasks_templates/custom_yourbench_task.py +++ b/examples/custom_tasks_templates/custom_yourbench_task.py @@ -240,7 +240,7 @@ def yourbench_prompt(line, task_name: str = ""): metric_name=["accuracy"], higher_is_better={"accuracy": True}, category=SamplingMethod.GENERATIVE, - sample_level_fn=JudgeLLMYourBench().compute, + sample_level_fn=JudgeLLMYourBench(), corpus_level_fn={"accuracy": np.mean}, ) extend_enum(Metrics, "yourbench_metrics", yourbench_metrics) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 50e1aabcf..a0c75c133 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -27,9 +27,6 @@ from aenum import Enum from lighteval.metrics.dynamic_metrics import ( - ExprExtractionConfig, - IndicesExtractionConfig, - LatexExtractionConfig, MultilingualExtractiveMatchMetric, ) from lighteval.metrics.harness_compatibility.drop import DropMetrics @@ -71,6 +68,11 @@ PerplexityPreparator, TargetPerplexityPreparator, ) +from lighteval.metrics.utils.extractive_match_utils import ( + ExprExtractionConfig, + IndicesExtractionConfig, + LatexExtractionConfig, +) from lighteval.metrics.utils.metric_utils import ( CorpusLevelMetric, CorpusLevelMetricGrouping, diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index 0aa1ca678..bf65ac530 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -279,6 +279,6 @@ def compute_corpus(self, y_input): higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), sample_level_fn=TinyCorpusAggregator(name), category=category, - corpus_level_fn=TinyCorpusAggregator(name).aggregate, + corpus_level_fn=TinyCorpusAggregator(name), ), ) diff --git a/tests/metrics/test_extractive_match.py b/tests/metrics/test_extractive_match.py index 7504fcceb..d2fd71606 100644 --- a/tests/metrics/test_extractive_match.py +++ b/tests/metrics/test_extractive_match.py @@ -23,11 +23,11 @@ import pytest import sympy -from lighteval.metrics.dynamic_metrics import ( +from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric +from lighteval.metrics.utils.extractive_match_utils import ( ExprExtractionConfig, IndicesExtractionConfig, LatexExtractionConfig, - MultilingualExtractiveMatchMetric, ) from lighteval.metrics.utils.math_comparison import sympy_expr_eq from lighteval.models.model_output import ModelResponse From 31d7d787b82d04e1e2133ece33a52e21ca63b361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:01:17 +0200 Subject: [PATCH 33/38] Apply suggestions from code review Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- community_tasks/aimo_evals.py | 2 +- src/lighteval/metrics/utils/metric_utils.py | 4 ++-- src/lighteval/tasks/registry.py | 13 +++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 
f0a01c16c..535711918 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -50,7 +50,7 @@ def aimo_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="train", few_shots_select="sequential", - metric=[Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})], + metrics=[Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})], generation_size=2048, stop_sequence=None, ) diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 2b51185c6..6e253472a 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -35,7 +35,7 @@ class Metric: higher_is_better: bool category: SamplingMethod sample_level_fn: SampleLevelComputation | Preparator - corpus_level_fn: Callable | CorpusLevelComputation + corpus_level_fn: CorpusLevelComputation | Callable batched_compute: bool = False @@ -77,7 +77,7 @@ def get_corpus_aggregations(self) -> dict: def __call__(self, sample_params: dict | None): """Allow creating new instances with modified parameters""" - if sample_params: + if sample_params is not None: for k, v in sample_params.items(): setattr(self.sample_level_fn, k, v) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index a0ce627b9..f2c17fbc0 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -114,12 +114,13 @@ def get_tasks_configs(self, task: str) -> list[LightevalTaskConfig]: for metric in [m for m in config.metrics if "@" in m.metric_name]: # parametrizable metric for attribute, value in subtask_param["metric_params"].items(): setattr(metric.sample_level_fn, attribute, value) - if hasattr(metric.sample_level_fn, "attribute_must_be_set"): - for attribute in metric.sample_level_fn.attribute_must_be_set: - if getattr(metric.sample_level_fn, attribute) is None: - raise ValueError( - f"Metric {metric.metric_name} for task {task_name} was not correctly parametrized. Forgot to set {attribute}." - ) + required = getattr(metric.sample_level_fn, "attribute_must_be_set", []) + for attribute in required: + if getattr(metric.sample_level_fn, attribute) is None: + raise ValueError( + f"Metric {metric.metric_name} for task {task_name} " + f"was not correctly parametrized. Forgot to set '{attribute}'." 
+ ) configs.append(config) From aeabbf9a5b7e6ae923662c6d395e8c83e12aa47d Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 14:01:02 +0000 Subject: [PATCH 34/38] review comments --- src/lighteval/metrics/metrics_corpus.py | 4 +++- src/lighteval/metrics/metrics_sample.py | 4 +++- src/lighteval/metrics/utils/metric_utils.py | 4 ++-- tests/tasks/test_registry.py | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 710e073dd..09018bf70 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -27,6 +27,7 @@ import logging import math +from abc import ABC, abstractmethod from typing import Literal import numpy as np @@ -44,7 +45,8 @@ logger = logging.getLogger(__name__) -class CorpusLevelComputation: +class CorpusLevelComputation(ABC): + @abstractmethod def compute_corpus(self): raise NotImplementedError diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 8e796fc9b..08341781a 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -27,6 +27,7 @@ import inspect import logging import os +from abc import ABC, abstractmethod from typing import Callable, Literal, Union import nltk @@ -60,7 +61,8 @@ logger = logging.getLogger(__name__) -class SampleLevelComputation: +class SampleLevelComputation(ABC): + @abstractmethod def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): raise NotImplementedError diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py index 6e253472a..85b1e2bc6 100644 --- a/src/lighteval/metrics/utils/metric_utils.py +++ b/src/lighteval/metrics/utils/metric_utils.py @@ -55,8 +55,8 @@ def compute_sample( ) if isinstance(self, MetricGrouping): - return sample_level_fn(**kwargs) # result, formatted_doc, - return {self.metric_name: sample_level_fn(**kwargs)} # result, formatted_doc, + return sample_level_fn(**kwargs) + return {self.metric_name: sample_level_fn(**kwargs)} diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py index cd05fe230..caeb4e787 100644 --- a/tests/tasks/test_registry.py +++ b/tests/tasks/test_registry.py @@ -106,7 +106,7 @@ def test_superset_with_subset_task(): def test_cli_sampling_params(): """ - Tests that task info selector correctly handles supersets. + Tests setting the sampling parameters from the CLI. """ registry = Registry() @@ -118,7 +118,7 @@ def test_cli_sampling_params(): def test_cli_sampling_params_fail(): """ - Tests that task info selector correctly handles supersets. + Tests that setting invalid sampling parameters from the CLI fails.
""" registry = Registry() From 90f6b3754c75720f24a60e2b0c803e3b068e2942 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 14:17:25 +0000 Subject: [PATCH 35/38] fix dco --- docs/source/package_reference/metrics.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/package_reference/metrics.mdx b/docs/source/package_reference/metrics.mdx index 70e9f5502..1b946a82e 100644 --- a/docs/source/package_reference/metrics.mdx +++ b/docs/source/package_reference/metrics.mdx @@ -67,4 +67,4 @@ ## LLM-as-a-Judge ### JudgeLM -[[autodoc]] metrics.llm_as_judge.JudgeLM +[[autodoc]] metrics.utils.llm_as_judge.JudgeLM From af2cd81cdc1848cd47dd6bced9e138a788b79223 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 14:20:34 +0000 Subject: [PATCH 36/38] style --- community_tasks/aimo_evals.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 535711918..7895cabff 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -50,7 +50,9 @@ def aimo_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="train", few_shots_select="sequential", - metrics=[Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})], + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) + ], generation_size=2048, stop_sequence=None, ) From 2f2dcfb22e35b1e329229c0d69cd67471c0e85c0 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 14:32:11 +0000 Subject: [PATCH 37/38] doc --- src/lighteval/metrics/metrics_sample.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 08341781a..ce2005c1b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1085,7 +1085,9 @@ def compute(self, model_responses: list[ModelResponse], docs: list[Doc], **kwarg class SamplingMetric: - """Handles normalization for sampling based metrics""" + """All sampling metrics we have defined below use the same set of normalization parameters and same behavior for the default sample_scoring_function. + This class just holds the normalization and applies it to all samples passed to preprocess, then uses the default sample function if not provided. + """ def __init__( self, From b1039e35871918111de99f43dab73aee13b2aec2 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 20 Aug 2025 14:42:22 +0000 Subject: [PATCH 38/38] updated quick tour --- docs/source/quicktour.mdx | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 7daea73da..de2059f49 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -44,6 +44,17 @@ The syntax for the task specification might be a bit hard to grasp at first. The If the fourth value is set to 1, lighteval will check if the prompt (including the few-shot examples) is too long for the context size of the task or the model. If so, the number of few shot examples is automatically reduced. +Tasks have a function applied at the sample level and one at the corpus level. 
For example, +- an exact match can be applied per sample, then averaged over the corpus to give the final score +- samples can be left untouched before applying Corpus BLEU at the corpus level, etc. + +If the task you are looking at has a sample-level function (`sample_level_fn`) that can be parametrized, you can pass parameters directly from the CLI. +For example: +```txt +{suite}|{task}@{parameter_name1}={value1},{parameter_name2}={value2},...|0|0 +``` + All officially supported tasks can be found at the [tasks_list](available-tasks) and in the [extended folder](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/extended). Moreover, community-provided tasks can be found in the