
Commit 1314d64

Authored by hynky1999 (Hynek Kydlicek) and NathanHB
Probability Metric + New Normalization (#276)
What does this implement/fix? Explain your changes.
---------------------------------------------------
This PR adds two new features:

1) A new Probability metric, which collects the probability of the correct answer. This can be either the raw probability or the probability mass (normalized over the other choices).
2) A revamp of acc/prob normalization, with two new normalizations:
   a) Token normalization, which we found to work better than acc_norm on most non-English languages.
   b) Pointwise mutual information (PMI) normalization, which is a good way to test tasks with unlikely tokens; see https://arxiv.org/abs/2406.08446.

Lastly, I made some small changes to request processing, removing parts that are not needed and can easily cause bugs.

Comments
----------
- I am not really content with having a new category just for normalization, but I didn't find a better way in the current system. The problem is that when creating requests we only have access to the sample function and nothing else, so we can't really do any kind of structural decomposition.
- These new norms are only added for non-single-token task types. Adding them to single-token tasks would require reworking the request-creation logic to keep it maintainable, and can be done in another PR.

PS: Relevant discussion about token norm: EleutherAI/lm-evaluation-harness#1396

---------

Co-authored-by: Hynek Kydlicek <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
1 parent 12b0698 commit 1314d64

17 files changed: +939, -92 lines changed
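As a rough, self-contained illustration of the scoring rules described in the commit message above (a minimal sketch with made-up logprob values, not lighteval's actual implementation):

import math

# Hypothetical summed log-probabilities of three answer choices, conditioned on the prompt.
choice_logprobs = [-12.0, -9.0, -15.0]
char_lens = [24, 48, 30]    # character length of each choice text
token_lens = [6, 11, 8]     # token count of each choice
# Log-probabilities of the same choices after an "unconditioned" prompt (e.g. just "Answer:").
uncond_logprobs = [-10.0, -11.0, -14.0]

# Accuracy-style rules: pick the argmax under a given normalization.
acc = choice_logprobs                                                   # no normalization
acc_char = [lp / n for lp, n in zip(choice_logprobs, char_lens)]        # acc_norm (char length)
acc_token = [lp / n for lp, n in zip(choice_logprobs, token_lens)]      # token normalization
acc_pmi = [lp - u for lp, u in zip(choice_logprobs, uncond_logprobs)]   # PMI normalization

# Probability metric: raw probability of the gold choice, or its probability mass
# renormalized over all choices.
gold = 1
prob_raw = math.exp(choice_logprobs[gold])
prob_mass = math.exp(choice_logprobs[gold]) / sum(math.exp(lp) for lp in choice_logprobs)

for name, scores in [("acc", acc), ("char", acc_char), ("token", acc_token), ("pmi", acc_pmi)]:
    print(name, "picks choice", max(range(len(scores)), key=scores.__getitem__))
print(f"prob_raw={prob_raw:.2e}  prob_mass={prob_mass:.3f}")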

src/lighteval/logging/info_loggers.py

Lines changed: 5 additions & 0 deletions
@@ -374,6 +374,11 @@ def log(
             detail.choices = doc.choices
             detail.gold_index = as_list(doc.gold_index)
             pred_saved = True
+        if task.has_metric_category[MetricCategory.MULTICHOICE_PMI]:
+            detail.choices = doc.choices
+            detail.gold_index = as_list(doc.gold_index)
+            doc.specific = {**(doc.specific or {}), **{"unconditioned_query": doc.unconditioned_query}}
+            pred_saved = True
         if (
             task.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
             or task.has_metric_category[MetricCategory.LLM_AS_JUDGE]

src/lighteval/metrics/__init__.py

Lines changed: 49 additions & 13 deletions
@@ -30,16 +30,22 @@
 
 def apply_target_perplexity_metric(results: list[ModelResponse], formatted_doc: Doc, metrics: list[Metric]):
     outputs = {}
-    # We only consider the best choice, to check if its logprobs are above 0.5
-    results = results[formatted_doc.gold_index]
-    target_logprob = results.result[0]
-    target_acc = results.result[1]
-    reference_text = formatted_doc.get_golds()[0]
+
+    target_golds = formatted_doc.get_golds()
+    assert len(results) == len(target_golds), "You should return as many results as there are golds"
+    target_logprobs = [res.result[0] for res in results]
+    argmax_logits_eq_gold_list = [res.result[1] for res in results]
+    target_tokens = [res.generated_tokens for res in results]
 
     for metric in metrics:
         if metric.category == MetricCategory.TARGET_PERPLEXITY:
             outputs.update(
-                metric.compute(logprobs=target_logprob, target_acc=target_acc, reference_text=reference_text)
+                metric.compute(
+                    logprobs=target_logprobs,
+                    argmax_logits_eq_gold_list=argmax_logits_eq_gold_list,
+                    reference_texts=target_golds,
+                    target_tokens=target_tokens,
+                )
             )
 
     return outputs

@@ -61,7 +67,7 @@ def apply_perplexity_metric(results: list[ModelResponse], formatted_doc: Doc, me
 
     for metric in metrics:
         if metric.category == MetricCategory.PERPLEXITY:
-            outputs.update(metric.compute(logprobs=results.result, reference_text=reference_text))
+            outputs.update(metric.compute(logprobs=[results.result], reference_texts=[reference_text]))
 
     return outputs
 

@@ -124,23 +130,44 @@ def apply_generative_metric(
 
 def apply_multichoice_metric(results: list[ModelResponse], formatted_doc: Doc, metrics: list[Metric]):
     outputs = {}
-    if len(formatted_doc.choices) <= 1:
+    n_choices = len(formatted_doc.choices)
+    is_pmi_category = all(metric.category == MetricCategory.MULTICHOICE_PMI for metric in metrics)
+
+    if n_choices <= 1:
         raise ValueError(
             "You can't use a multi choice metric with only one choice. Use `acc_golds_likelihood` instead."
         )
-    if len(results) != len(formatted_doc.choices):
+
+    if not is_pmi_category and len(results) != len(formatted_doc.choices):
         raise Exception(
             f"You shoud have returned as many model outputs as choices when using an multi choice metric. Returned {len(results)} instead of {len(formatted_doc.choices)}"
         )
 
+    if is_pmi_category and len(results) != n_choices * 2:
+        raise Exception(
+            f"You shoud have returned twice as many model outputs as choices when using an probability multi choice metric. Returned {len(results)} instead of {n_choices * 2} (conditioned and unconditioned)"
+        )
+
+    mc_results = results[:n_choices]
     # Todo: make better system with return_bool_score instead of taking first element
-    choices_logprob = [results[i].result[0] for i in range(len(formatted_doc.choices))]
+    conditioned_lp = [res.result[0] for res in mc_results]
+    unconditioned_lp = None
+    if is_pmi_category:
+        unconditioned_lp = [res.result[0] for res in results[n_choices : n_choices * 2]]
+
     gold_ixs = as_list(formatted_doc.gold_index)
+    choices_tokens = [res.generated_tokens for res in mc_results]
 
     for metric in metrics:
-        if metric.category == MetricCategory.MULTICHOICE:
+        if metric.category == MetricCategory.MULTICHOICE_PMI or metric.category == MetricCategory.MULTICHOICE:
             outputs.update(
-                metric.compute(choices_logprob=choices_logprob, gold_ixs=gold_ixs, formatted_doc=formatted_doc)
+                metric.compute(
+                    gold_ixs=gold_ixs,
+                    choices_logprob=conditioned_lp,
+                    unconditioned_logprob=unconditioned_lp,
+                    choices_tokens=choices_tokens,
+                    formatted_doc=formatted_doc,
+                )
             )
     return outputs
 

@@ -151,12 +178,21 @@ def apply_multichoice_metric_one_token(results: list[ModelResponse], formatted_d
         raise Exception("You returned more than one result for a sample with a gmultichoice metric on only one token.")
     results = results[0]
     choices_logprob = results.result
+    choices_texts = formatted_doc.choices
     gold_ixs = as_list(formatted_doc.gold_index)
 
     for metric in metrics:
         if metric.category == MetricCategory.MULTICHOICE_ONE_TOKEN:
             outputs.update(
-                metric.compute(choices_logprob=choices_logprob, gold_ixs=gold_ixs, formatted_doc=formatted_doc)
+                metric.compute(
+                    choices_logprob=choices_logprob,
+                    # Neither token or PMI are supported for this metric
+                    unconditioned_logprob=None,
+                    choices_tokens=None,
+                    choices_texts=choices_texts,
+                    gold_ixs=gold_ixs,
+                    formatted_doc=formatted_doc,
+                )
             )
 
     return outputs
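The PMI branch above relies on the model having been queried twice per choice: the first n_choices results are conditioned on the real query, the next n_choices on the unconditioned query, and the PMI score is the difference of the two log-probabilities. A toy sketch of that slicing (values and names are made up for illustration):

n_choices = 4
# result[0] of each response: first the conditioned pass, then the unconditioned one.
logprobs = [
    -3.1, -1.2, -4.0, -2.5,  # log P(choice_i | query)
    -2.0, -2.2, -3.8, -2.6,  # log P(choice_i | unconditioned_query)
]

conditioned_lp = logprobs[:n_choices]
unconditioned_lp = logprobs[n_choices : n_choices * 2]

# PMI-normalized score: log P(choice | query) - log P(choice | unconditioned query)
pmi_scores = [c - u for c, u in zip(conditioned_lp, unconditioned_lp)]
print(pmi_scores)                                          # approx. [-1.1, 1.0, -0.2, 0.1]
print(max(range(n_choices), key=pmi_scores.__getitem__))   # choice 1 wins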
Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import Callable
+
+import numpy as np
+
+from lighteval.metrics.metrics_sample import LoglikelihoodAcc, NormalizedMultiChoiceProbability, Probability
+from lighteval.metrics.normalizations import LogProbNormalization, LogProbPMINorm, LogProbTokenNorm
+from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric
+
+
+def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None) -> SampleLevelMetric:
+    """
+    Creates a accuracy (loglikelihood) metric, which returns accuracy given normalization.
+    """
+
+    normalization_str = normalization.name if normalization else ""
+    metric_name = f"acc_{normalization_str}"
+    return SampleLevelMetric(
+        metric_name=metric_name,
+        sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization).compute,
+        category=MetricCategory.MULTICHOICE
+        if not normalization == LogProbPMINorm()
+        else MetricCategory.MULTICHOICE_PMI,
+        use_case=MetricUseCase.ACCURACY,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+
+
+def normalized_multi_choice_prob_metric(
+    normalization: LogProbNormalization | None = None,
+    aggregation_function: Callable[[np.ndarray], float] = np.max,
+) -> SampleLevelMetric:
+    """
+    Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized).
+    """
+
+    normalization_str = normalization.name if normalization else ""
+    metric_name = "_".join(filter(None, ["normalized_mc_prob_", normalization_str]))
+
+    return SampleLevelMetric(
+        metric_name=metric_name,
+        sample_level_fn=NormalizedMultiChoiceProbability(
+            log_prob_normalization=normalization, aggregation_function=aggregation_function
+        ).compute,
+        category=MetricCategory.MULTICHOICE
+        if not normalization == LogProbPMINorm()
+        else MetricCategory.MULTICHOICE_PMI,
+        use_case=MetricUseCase.ACCURACY,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+
+
+def probability_metric(
+    normalization: LogProbTokenNorm | None = None,
+    aggregation_function: Callable[[np.ndarray], float] = np.max,
+) -> SampleLevelMetric:
+    """
+    Creates a probability metric, which returns the probability of the gold choice given normalization.
+    """
+
+    normalization_str = normalization.name if normalization else ""
+    metric_name = "_".join(filter(None, ["prob", normalization_str]))
+
+    return SampleLevelMetric(
+        metric_name=metric_name,
+        sample_level_fn=Probability(normalization=normalization, aggregation_function=aggregation_function).compute,
+        category=MetricCategory.TARGET_PERPLEXITY,
+        use_case=MetricUseCase.PERPLEXITY,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
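If these factories were exercised directly (for example from within the same module, reusing its imports), usage might look roughly like the sketch below. This assumes the normalization classes can be constructed without arguments, as LogProbPMINorm() is above, and that SampleLevelMetric exposes its fields as attributes; the exact metric_name depends on each normalization's .name attribute.

# Sketch only; relies on the imports already present at the top of the new module above.
acc_token = loglikelihood_acc_metric(normalization=LogProbTokenNorm())
acc_pmi = loglikelihood_acc_metric(normalization=LogProbPMINorm())
prob_gold = probability_metric(normalization=LogProbTokenNorm(), aggregation_function=np.min)

print(acc_token.metric_name, acc_token.category)   # "acc_<norm name>", MULTICHOICE
print(acc_pmi.metric_name, acc_pmi.category)       # "acc_<norm name>", MULTICHOICE_PMI (doubles the requests)
print(prob_gold.metric_name, prob_gold.category)   # "prob_<norm name>", TARGET_PERPLEXITY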

src/lighteval/metrics/harness_compatibility/truthful_qa.py

Lines changed: 10 additions & 2 deletions
@@ -22,9 +22,17 @@
 
 import numpy as np
 
+from lighteval.tasks.requests import Doc
+
 
 # Comes from the harness
-def truthfulqa_mc_metrics(gold_ixs, choices_logprob, formatted_doc):
+def truthfulqa_mc_metrics(
+    gold_ixs: list[int],
+    choices_logprob: list[float],
+    unconditioned_logprob: list[float] | None,
+    choices_tokens: list[list[int]] | None,
+    formatted_doc: Doc,
+):
     def mc1(lls):
         # The gold answers in `mc1_targets` are always first (index = `0`).
         return np.argmax(lls) == 0

@@ -47,7 +55,7 @@ def mc2(lls, split_idx):
             last_harness_gold = g
         else:
             break
-
+    # TODO: This completely ignores any normalization, but keeping it as is
     mc2_last_gold_ix = last_harness_gold - len_mc1 + 1
     mc1_lls, mc2_lls = choices_logprob[:len_mc1], choices_logprob[len_mc1:]
     return {"truthfulqa_mc1": mc1(mc1_lls), "truthfulqa_mc2": mc2(mc2_lls, mc2_last_gold_ix)}

src/lighteval/metrics/metrics.py

Lines changed: 6 additions & 5 deletions
@@ -51,6 +51,7 @@
     faithfulness,
 )
 from lighteval.metrics.normalizations import (
+    LogProbCharNorm,
     bigbench_normalizer,
     gsm8k_normalizer,
     harness_triviaqa_normalizer,

@@ -288,39 +289,39 @@ class Metrics(Enum):
     )
     loglikelihood_acc = SampleLevelMetric(
         metric_name="acc",
-        sample_level_fn=LoglikelihoodAcc().compute,
+        sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute,
         category=MetricCategory.MULTICHOICE,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
     loglikelihood_acc_norm = SampleLevelMetric(
         metric_name="acc_norm",
-        sample_level_fn=LoglikelihoodAcc(length_normalization=True).compute,
+        sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()).compute,
         category=MetricCategory.MULTICHOICE,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
     loglikelihood_acc_norm_nospace = SampleLevelMetric(
         metric_name="acc_norm",
-        sample_level_fn=LoglikelihoodAcc(length_normalization=True, ignore_first_space=True).compute,
+        sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm(ignore_first_space=True)).compute,
         category=MetricCategory.MULTICHOICE,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
     loglikelihood_acc_norm_single_token = SampleLevelMetric(
         metric_name="acc_norm",
-        sample_level_fn=LoglikelihoodAcc(length_normalization=True).compute,
+        sample_level_fn=LoglikelihoodAcc(logprob_normalization=LogProbCharNorm()).compute,
         category=MetricCategory.MULTICHOICE_ONE_TOKEN,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
     loglikelihood_acc_single_token = SampleLevelMetric(
         metric_name="acc",
-        sample_level_fn=LoglikelihoodAcc().compute,
+        sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute,
         category=MetricCategory.MULTICHOICE_ONE_TOKEN,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,