From d8cfc2e3b935cec89874b43411c6d453fcd09a4d Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 27 Aug 2025 09:28:16 +0000
Subject: [PATCH 01/26] Fixe Sampling Metrics and Evals

---
 src/lighteval/metrics/dynamic_metrics.py |  4 +--
 src/lighteval/metrics/metrics_sample.py  | 36 +++++++++++++-----------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py
index 9ced582c7..39f1010bc 100644
--- a/src/lighteval/metrics/dynamic_metrics.py
+++ b/src/lighteval/metrics/dynamic_metrics.py
@@ -220,7 +220,7 @@ def __init__(
 
     @timeout(2)
     def add_to_specifics_with_timeout(
-        formatted_doc: Doc, extracted_predictions: list[list[str]], extracted_golds: list[list[str]]
+        self, formatted_doc: Doc, extracted_predictions: list[list[str]], extracted_golds: list[list[str]]
     ) -> None:
         if formatted_doc.specific is None:
             formatted_doc.specific = {}
@@ -263,7 +263,7 @@ def compute(self, doc: Doc, model_response: ModelResponse) -> float:
         # We have to use timeout because the sypmy to str conversion can be very slow
         try:
             self.add_to_specifics_with_timeout(doc, extracted_predictions, extracted_golds)
-        except Exception:  # noqa: E722
+        except TimeoutError:  # noqa: E722
             logger.warning("Timeout when adding extracted predictions and golds to specific")
 
         return self.aggregation_function(
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index ce2005c1b..e0c7b5552 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -63,7 +63,7 @@
 
 class SampleLevelComputation(ABC):
     @abstractmethod
-    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
+    def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
         raise NotImplementedError
 
 
@@ -1112,6 +1112,8 @@ def __init__(
         if callable(sample_scoring_function):
             self.score_sample = sample_scoring_function
             self.type_exact_match = None
+        elif isinstance(sample_scoring_function, SampleLevelComputation):
+            self.score_sample = sample_scoring_function.compute
         else:
             if isinstance(sample_scoring_function, str):
                 if sample_scoring_function not in ["prefix", "suffix", "full"]:
@@ -1119,6 +1121,7 @@ def __init__(
                         f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
                     )
                 self.type_exact_match = sample_scoring_function
+                self.score_sample = self.default_sample_scoring
             else:
                 self.type_exact_match = "full"
             self.compute_score = self.default_sample_scoring
@@ -1130,7 +1133,7 @@ def preprocess(self, text: str) -> str:
         if self.strip_strings:
             text = text.strip()
 
-        if self.normalize:
+        if self.normalize is not None:
             text = self.normalize(text)
 
         return text
@@ -1161,11 +1164,11 @@ def __init__(self, k: int | None = None, **kwargs):
             sample_scoring_function (callable | str, optional): Function to use to compute the score for each sample.
                 If None, uses the default scoring function which is a simple exact match.
         """
-        super().__init__(kwargs)
+        super().__init__(**kwargs)
         self.k = k
         self.attribute_must_be_set = ["k"]
 
-    def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
+    def compute(self, model_response: ModelResponse, doc: Doc):
         """Computes the metric over a list of golds and predictions for one single sample.
         It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
         then compares it to the gold.
@@ -1189,14 +1192,14 @@ def num_samples(self):
 
 
 class MajAtK(SamplingMetric, SampleLevelComputation):
-    def __init__(self, k: int = None, **kwargs):
+    def __init__(self, k: int | None = None, **kwargs):
         """An exact match class."""
-        super().__init__(kwargs)
+        super().__init__(**kwargs)
 
         self.k = k
         self.attribute_must_be_set = ["k"]
 
-    def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
+    def compute(self, model_response: ModelResponse, docs: Doc):
         """Computes the metric over a list of golds and predictions for one single sample.
         It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
         then compares it to the gold.
@@ -1214,7 +1217,7 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
         if len(golds) > 1:
             raise Exception("Cannot compute maj@k with several golds")
 
-        processed_choices = [self.preprocess(gold=g) for g in docs.get_golds()]
+        processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
         new_doc = Doc(
             choices=processed_choices,
             query=docs.query,
@@ -1222,7 +1225,7 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
         )
         all_answers = []
         for pred in model_response.final_text[: self.k]:
-            all_answers.append(self.preprocess(pred=pred))
+            all_answers.append(self.preprocess(text=pred))
         majority_prediction = max(all_answers, key=all_answers.count)
         new_model_response = ModelResponse(
             text=[majority_prediction],
@@ -1241,7 +1244,7 @@ def __init__(self, k: int | None = None, n: int | None = None, **kwargs):
             k (int): Threshold for the number of successful attempts.
             n (int): Number of samples to generate
         """
-        super().__init__(kwargs)
+        super().__init__(**kwargs)
         self.k = k
         self.n = n
         self.attribute_must_be_set = ["k"]
@@ -1269,7 +1272,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
         elif len(predictions) < self.n:
             logger.warning(f"Number of predictions is less than {self.n} for pass@k.")
 
-        processed_choices = [self.preprocess(gold=g) for g in doc.choices]
+        processed_choices = [self.preprocess(text=g) for g in doc.choices]
         new_doc = Doc(
             choices=processed_choices,
             query=doc.query,
@@ -1278,11 +1281,12 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
 
         all_scores = []
         for pred in predictions[: self.n]:
-            cur_pred = self.preprocess(pred=pred)
+            cur_pred = self.preprocess(text=pred)
             new_model_response = ModelResponse(
                 text=[cur_pred],
             )
-            all_scores.append(self.score_sample(new_doc, new_model_response))
+            breakpoint()
+            all_scores.append(self.score_sample(doc=new_doc, model_response=new_model_response))
 
         return self.pass_at_k(all_scores)
 
@@ -1314,7 +1318,7 @@ def __init__(
             n (int): Number of samples to generate.
             thresholds (list): Thresholds to control successful attempts in k generate.
         """
-        super().__init__(kwargs)
+        super().__init__(**kwargs)
         self._k = k
         self.n = n
         self.attribute_must_be_set = ["k"]
@@ -1356,7 +1360,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:
         elif len(predictions) < self.n:
             logger.warning(f"Number of predictions is less than {self.n} for G-Pass@k.")
 
-        processed_choices = [self.preprocess(gold=g) for g in doc.choices]
+        processed_choices = [self.preprocess(text=g) for g in doc.choices]
         new_doc = Doc(
             choices=processed_choices,
             query=doc.query,
@@ -1365,7 +1369,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:
 
         all_scores = []
         for pred in predictions[: self.n]:
-            cur_pred = self.preprocess(pred=pred)
+            cur_pred = self.preprocess(text=pred)
             new_model_response = ModelResponse(
                 text=[cur_pred],
             )

From 7ae5da53503a206aebeff1c9dfb66b9a44d47ffc Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 27 Aug 2025 09:29:53 +0000
Subject: [PATCH 02/26] remove breakpoint

---
 src/lighteval/metrics/metrics_sample.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index e0c7b5552..99deaff2c 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1285,7 +1285,6 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
             new_model_response = ModelResponse(
                 text=[cur_pred],
             )
-            breakpoint()
             all_scores.append(self.score_sample(doc=new_doc, model_response=new_model_response))
 
         return self.pass_at_k(all_scores)

From a00f3c03e94035e726d3b4bbfd02677286a59f52 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 27 Aug 2025 14:07:59 +0000
Subject: [PATCH 03/26] add auto tests for metrics

---
 pyproject.toml                                |  10 +-
 src/lighteval/metrics/metrics_sample.py       |  15 +-
 tests/unit/metrics/pytest.ini                 |  18 +
 .../tasks/templates/test_continuation.py      |   0
 .../metrics}/tasks/templates/test_copa.py     |   0
 .../tasks/templates/test_hellaswag.py         |   0
 .../tasks/templates/test_multichoice.py       |   0
 .../metrics}/tasks/templates/test_nli.py      |   0
 .../tasks/templates/test_translation.py       |   0
 .../metrics}/tasks/test_lighteval_task.py     |   0
 .../{ => unit/metrics}/tasks/test_registry.py |   0
 .../metrics/test_automated_metrics_pytest.py  | 104 +++++
 tests/unit/metrics/test_cases/README.md       | 116 +++++
 .../test_cases/acc_golds_likelihood.json      |   3 +
 tests/unit/metrics/test_cases/avg_at_k.json   |   3 +
 .../metrics/test_cases/avg_at_k_math.json     |   3 +
 tests/unit/metrics/test_cases/bert_score.json |   3 +
 .../metrics/test_cases/bits_per_byte.json     |   3 +
 tests/unit/metrics/test_cases/bleu.json       |   3 +
 tests/unit/metrics/test_cases/bleu_1.json     |   3 +
 tests/unit/metrics/test_cases/bleu_4.json     |   3 +
 tests/unit/metrics/test_cases/bleurt.json     |   3 +
 .../metrics/test_cases/byte_perplexity.json   |   3 +
 tests/unit/metrics/test_cases/chrf.json       |   3 +
 tests/unit/metrics/test_cases/chrf_plus.json  |   3 +
 tests/unit/metrics/test_cases/copyright.json  |   3 +
 tests/unit/metrics/test_cases/drop.json       |   3 +
 .../unit/metrics/test_cases/exact_match.json  |   3 +
 .../metrics/test_cases/expr_gold_metric.json  |   3 +
 .../metrics/test_cases/extractiveness.json    |   3 +
 tests/unit/metrics/test_cases/f1_score.json   |   3 +
 .../metrics/test_cases/f1_score_macro.json    |   3 +
 .../metrics/test_cases/f1_score_micro.json    |   3 +
 .../unit/metrics/test_cases/faithfulness.json |   3 +
 .../unit/metrics/test_cases/g_pass_at_k.json  |   3 +
 .../metrics/test_cases/g_pass_at_k_latex.json |   3 +
 .../metrics/test_cases/g_pass_at_k_math.json  |   3 +
 .../test_cases/gpqa_instruct_metric.json      |   3 +
 .../test_cases/gpqa_instruct_pass_at_k.json   |   3 +
 .../metrics/test_cases/loglikelihood_acc.json |   3 +
 .../metrics/test_cases/loglikelihood_f1.json  |   3 +
 tests/unit/metrics/test_cases/maj_at_k.json   |   3 +
 tests/unit/metrics/test_cases/mcc.json        |   3 +
 tests/unit/metrics/test_cases/mrr.json        |   3 +
 .../metrics/test_cases/multi_f1_numeric.json  |   3 +
 tests/unit/metrics/test_cases/pass_at_k.json  |   3 +
 .../metrics/test_cases/pass_at_k_letters.json |   3 +
 .../metrics/test_cases/pass_at_k_math.json    |   3 +
 .../test_cases/prediction_perplexity.json     |   3 +
 .../unit/metrics/test_cases/recall_at_k.json  |   3 +
 tests/unit/metrics/test_cases/rouge1.json     |   3 +
 tests/unit/metrics/test_cases/rouge2.json     |   3 +
 tests/unit/metrics/test_cases/rougeL.json     |   3 +
 tests/unit/metrics/test_cases/rougeLsum.json  |   3 +
 tests/unit/metrics/test_cases/rouge_t5.json   |   3 +
 .../metrics/test_cases/simpleqa_judge.json    |   3 +
 .../metrics/test_cases/target_perplexity.json |   3 +
 tests/unit/metrics/test_cases/ter.json        |   3 +
 .../test_cases/truthfulqa_mc_metrics.json     |   3 +
 .../metrics/test_cases/word_perplexity.json   |   3 +
 .../metrics/test_extractive_match.py          |   0
 .../metrics/test_metric_requests.py           |   0
 tests/unit/metrics/test_metrics_automated.py  | 406 ++++++++++++++++++
 .../{ => unit}/metrics/test_normalizations.py |   0
 .../unit/metrics/test_unit_harness_metrics.py | 139 ++++++
 65 files changed, 939 insertions(+), 10 deletions(-)
 create mode 100644 tests/unit/metrics/pytest.ini
 rename tests/{ => unit/metrics}/tasks/templates/test_continuation.py (100%)
 rename tests/{ => unit/metrics}/tasks/templates/test_copa.py (100%)
 rename tests/{ => unit/metrics}/tasks/templates/test_hellaswag.py (100%)
 rename tests/{ => unit/metrics}/tasks/templates/test_multichoice.py (100%)
 rename tests/{ => unit/metrics}/tasks/templates/test_nli.py (100%)
 rename tests/{ => unit/metrics}/tasks/templates/test_translation.py (100%)
 rename tests/{ => unit/metrics}/tasks/test_lighteval_task.py (100%)
 rename tests/{ => unit/metrics}/tasks/test_registry.py (100%)
 create mode 100644 tests/unit/metrics/test_automated_metrics_pytest.py
 create mode 100644 tests/unit/metrics/test_cases/README.md
 create mode 100644 tests/unit/metrics/test_cases/acc_golds_likelihood.json
 create mode 100644 tests/unit/metrics/test_cases/avg_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/avg_at_k_math.json
 create mode 100644 tests/unit/metrics/test_cases/bert_score.json
 create mode 100644 tests/unit/metrics/test_cases/bits_per_byte.json
 create mode 100644 tests/unit/metrics/test_cases/bleu.json
 create mode 100644 tests/unit/metrics/test_cases/bleu_1.json
 create mode 100644 tests/unit/metrics/test_cases/bleu_4.json
 create mode 100644 tests/unit/metrics/test_cases/bleurt.json
 create mode 100644 tests/unit/metrics/test_cases/byte_perplexity.json
 create mode 100644 tests/unit/metrics/test_cases/chrf.json
 create mode 100644 tests/unit/metrics/test_cases/chrf_plus.json
 create mode 100644 tests/unit/metrics/test_cases/copyright.json
 create mode 100644 tests/unit/metrics/test_cases/drop.json
 create mode 100644 tests/unit/metrics/test_cases/exact_match.json
 create mode 100644 tests/unit/metrics/test_cases/expr_gold_metric.json
 create mode 100644 tests/unit/metrics/test_cases/extractiveness.json
 create mode 100644 tests/unit/metrics/test_cases/f1_score.json
 create mode 100644 tests/unit/metrics/test_cases/f1_score_macro.json
 create mode 100644 tests/unit/metrics/test_cases/f1_score_micro.json
 create mode 100644 tests/unit/metrics/test_cases/faithfulness.json
 create mode 100644 tests/unit/metrics/test_cases/g_pass_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/g_pass_at_k_latex.json
 create mode 100644 tests/unit/metrics/test_cases/g_pass_at_k_math.json
 create mode 100644 tests/unit/metrics/test_cases/gpqa_instruct_metric.json
 create mode 100644 tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/loglikelihood_acc.json
 create mode 100644 tests/unit/metrics/test_cases/loglikelihood_f1.json
 create mode 100644 tests/unit/metrics/test_cases/maj_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/mcc.json
 create mode 100644 tests/unit/metrics/test_cases/mrr.json
 create mode 100644 tests/unit/metrics/test_cases/multi_f1_numeric.json
 create mode 100644 tests/unit/metrics/test_cases/pass_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/pass_at_k_letters.json
 create mode 100644 tests/unit/metrics/test_cases/pass_at_k_math.json
 create mode 100644 tests/unit/metrics/test_cases/prediction_perplexity.json
 create mode 100644 tests/unit/metrics/test_cases/recall_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/rouge1.json
 create mode 100644 tests/unit/metrics/test_cases/rouge2.json
 create mode 100644 tests/unit/metrics/test_cases/rougeL.json
 create mode 100644 tests/unit/metrics/test_cases/rougeLsum.json
 create mode 100644 tests/unit/metrics/test_cases/rouge_t5.json
 create mode 100644 tests/unit/metrics/test_cases/simpleqa_judge.json
 create mode 100644 tests/unit/metrics/test_cases/target_perplexity.json
 create mode 100644 tests/unit/metrics/test_cases/ter.json
 create mode 100644 tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
 create mode 100644 tests/unit/metrics/test_cases/word_perplexity.json
 rename tests/{ => unit}/metrics/test_extractive_match.py (100%)
 rename tests/{ => unit}/metrics/test_metric_requests.py (100%)
 create mode 100644 tests/unit/metrics/test_metrics_automated.py
 rename tests/{ => unit}/metrics/test_normalizations.py (100%)
 create mode 100644 tests/unit/metrics/test_unit_harness_metrics.py

diff --git a/pyproject.toml b/pyproject.toml
index 04da22e55..797a7f36b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,11 +58,14 @@ dependencies = [
     "accelerate",
     "huggingface_hub[hf_xet]>=0.30.2",
     "torch>=2.0,<3.0",
-    "GitPython>=3.1.41", # for logging
+    "GitPython>=3.1.41",
+    # for logging
     "datasets>=4.0.0",
     "pydantic",
-    "numpy>=2",  # pinned to avoid incompatibilities
-    "hf-xet>=1.1.8",  # pinned to avoid failing test suite
+    "numpy>=2",
+    # pinned to avoid incompatibilities
+    "hf-xet>=1.1.8",
+    # pinned to avoid failing test suite
     # Prettiness
     "typer",
     "termcolor==2.3.0",
@@ -82,6 +85,7 @@ dependencies = [
     "fsspec>=2023.12.2",
     "httpx>=0.27.2",
     "latex2sympy2_extended==1.0.6",
+    "pip>=25.2",
 ]
 
 [project.optional-dependencies]
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 99deaff2c..17179899e 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1125,6 +1125,7 @@ def __init__(
             else:
                 self.type_exact_match = "full"
             self.compute_score = self.default_sample_scoring
+            self.score_sample = self.default_sample_scoring
 
     def preprocess(self, text: str) -> str:
         if not text:
@@ -1182,7 +1183,7 @@ def compute(self, model_response: ModelResponse, doc: Doc):
         """
         all_scores = []
         for i in range(self.k):
-            all_scores.append(self.compute_score(doc, model_response[i]))
+            all_scores.append(self.score_sample(doc, model_response[i]))
 
         avg_score = np.mean(all_scores)
         return avg_score
@@ -1199,7 +1200,7 @@ def __init__(self, k: int | None = None, **kwargs):
         self.k = k
         self.attribute_must_be_set = ["k"]
 
-    def compute(self, model_response: ModelResponse, docs: Doc):
+    def compute(self, doc: Doc, model_response: ModelResponse):
         """Computes the metric over a list of golds and predictions for one single sample.
         It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
         then compares it to the gold.
@@ -1213,15 +1214,15 @@ def compute(self, model_response: ModelResponse, docs: Doc):
         """
         if self.k is None:
             raise Exception("You did not set the value of k")
-        golds = docs.get_golds()
+        golds = doc.get_golds()
         if len(golds) > 1:
             raise Exception("Cannot compute maj@k with several golds")
 
-        processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
+        processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
         new_doc = Doc(
             choices=processed_choices,
-            query=docs.query,
-            gold_index=docs.gold_index,
+            query=doc.query,
+            gold_index=doc.gold_index,
         )
         all_answers = []
         for pred in model_response.final_text[: self.k]:
@@ -1230,7 +1231,7 @@ def compute(self, model_response: ModelResponse, docs: Doc):
         new_model_response = ModelResponse(
             text=[majority_prediction],
         )
-        return self.compute_score(new_model_response, new_doc)
+        return self.compute_score(new_doc, new_model_response)
 
     def num_samples(self):
         return self.k
diff --git a/tests/unit/metrics/pytest.ini b/tests/unit/metrics/pytest.ini
new file mode 100644
index 000000000..f5198f45c
--- /dev/null
+++ b/tests/unit/metrics/pytest.ini
@@ -0,0 +1,18 @@
+[tool:pytest]
+testpaths = .
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts =
+    -v
+    --tb=short
+    --strict-markers
+    --disable-warnings
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    unit: marks tests as unit tests
+    integration: marks tests as integration tests
+    automated: marks tests as automated metric tests
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
diff --git a/tests/tasks/templates/test_continuation.py b/tests/unit/metrics/tasks/templates/test_continuation.py
similarity index 100%
rename from tests/tasks/templates/test_continuation.py
rename to tests/unit/metrics/tasks/templates/test_continuation.py
diff --git a/tests/tasks/templates/test_copa.py b/tests/unit/metrics/tasks/templates/test_copa.py
similarity index 100%
rename from tests/tasks/templates/test_copa.py
rename to tests/unit/metrics/tasks/templates/test_copa.py
diff --git a/tests/tasks/templates/test_hellaswag.py b/tests/unit/metrics/tasks/templates/test_hellaswag.py
similarity index 100%
rename from tests/tasks/templates/test_hellaswag.py
rename to tests/unit/metrics/tasks/templates/test_hellaswag.py
diff --git a/tests/tasks/templates/test_multichoice.py b/tests/unit/metrics/tasks/templates/test_multichoice.py
similarity index 100%
rename from tests/tasks/templates/test_multichoice.py
rename to tests/unit/metrics/tasks/templates/test_multichoice.py
diff --git a/tests/tasks/templates/test_nli.py b/tests/unit/metrics/tasks/templates/test_nli.py
similarity index 100%
rename from tests/tasks/templates/test_nli.py
rename to tests/unit/metrics/tasks/templates/test_nli.py
diff --git a/tests/tasks/templates/test_translation.py b/tests/unit/metrics/tasks/templates/test_translation.py
similarity index 100%
rename from tests/tasks/templates/test_translation.py
rename to tests/unit/metrics/tasks/templates/test_translation.py
diff --git a/tests/tasks/test_lighteval_task.py b/tests/unit/metrics/tasks/test_lighteval_task.py
similarity index 100%
rename from tests/tasks/test_lighteval_task.py
rename to tests/unit/metrics/tasks/test_lighteval_task.py
diff --git a/tests/tasks/test_registry.py b/tests/unit/metrics/tasks/test_registry.py
similarity index 100%
rename from tests/tasks/test_registry.py
rename to tests/unit/metrics/tasks/test_registry.py
diff --git a/tests/unit/metrics/test_automated_metrics_pytest.py b/tests/unit/metrics/test_automated_metrics_pytest.py
new file mode 100644
index 000000000..eb441e3bc
--- /dev/null
+++ b/tests/unit/metrics/test_automated_metrics_pytest.py
@@ -0,0 +1,104 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""
+Pytest integration for the automated metric testing framework.
+
+This module provides pytest fixtures and test functions that can load and run
+test cases from JSON files.
+"""
+
+import json
+from pathlib import Path
+from typing import List
+
+import pytest
+from test_metrics_automated import AutomatedMetricTester, MetricTestSuite
+
+
+@pytest.fixture
+def metric_tester():
+    """Fixture providing an AutomatedMetricTester instance."""
+    return AutomatedMetricTester()
+
+
+def load_test_suite_from_file(file_path: str) -> MetricTestSuite:
+    """Load a test suite from a JSON file."""
+    with open(file_path, "r") as f:
+        data = json.load(f)
+    return MetricTestSuite(**data)
+
+
+def get_test_suite_files() -> List[str]:
+    """Get all test suite JSON files from the test_cases directory."""
+    test_cases_dir = Path(__file__).parent / "test_cases"
+    if not test_cases_dir.exists():
+        return []
+
+    json_files = list(test_cases_dir.glob("*.json"))
+    return [str(f) for f in json_files]
+
+
+def parametrize_test_suites():
+    """Create parametrized test cases for all test suite files."""
+    test_files = get_test_suite_files()
+    if not test_files:
+        pytest.skip("No test suite files found")
+
+    return test_files
+
+
+class TestAutomatedMetrics:
+    """Test class for automated metric testing with pytest."""
+
+    @pytest.mark.parametrize("test_file", parametrize_test_suites())
+    def test_metric_suite(self, metric_tester, test_file):
+        """Test a complete metric test suite from a JSON file."""
+        test_suite = load_test_suite_from_file(test_file)
+
+        # Run all test cases in the suite
+        results = metric_tester.run_test_suite(test_suite)
+
+        # Separate failed tests from skipped tests
+        failed_tests = [r for r in results if not r["success"] and not r.get("skipped", False)]
+        skipped_tests = [r for r in results if r.get("skipped", False)]
+
+        if failed_tests:
+            # Create detailed error message
+            error_msg = f"Test suite '{test_suite.name}' failed with {len(failed_tests)} failed tests:\n"
+            for result in failed_tests:
+                error_msg += f"\n  - {result['test_case']}: "
+                if result["error"]:
+                    error_msg += f"Error: {result['error']}"
+                else:
+                    error_msg += f"Expected {result['expected']}, got {result['actual']}"
+
+            pytest.fail(error_msg)
+
+        # Log skipped tests
+        if skipped_tests:
+            print(f"\nSkipped {len(skipped_tests)} tests in '{test_suite.name}':")
+            for result in skipped_tests:
+                print(f"  - {result['test_case']}: {result.get('skip_reason', 'Unknown reason')}")
+
+        # All non-skipped tests passed
+        assert len(failed_tests) == 0, f"Expected all non-skipped tests to pass, but {len(failed_tests)} failed"
diff --git a/tests/unit/metrics/test_cases/README.md b/tests/unit/metrics/test_cases/README.md
new file mode 100644
index 000000000..3010cf1d2
--- /dev/null
+++ b/tests/unit/metrics/test_cases/README.md
@@ -0,0 +1,116 @@
+# Metric Test Cases
+
+This directory contains individual JSON files for each metric tested in the LightEval framework. Each file contains all test cases for a specific metric.
+
+## Structure
+
+Each JSON file follows this structure:
+
+```json
+{
+  "name": "Metric Name Test Suite",
+  "description": "Description of the test suite",
+  "test_cases": [
+    {
+      "name": "Test Case Name",
+      "metric_class": "metric_name",
+      "metric_params": {},
+      "doc": {
+        "query": "Input query",
+        "choices": ["choice1", "choice2", "choice3"],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": ["model_output"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "metric_key": expected_value
+      },
+      "tolerance": 0.01,
+      "description": "Test case description"
+    }
+  ]
+}
+```
+
+## Available Test Files
+
+All 47 metrics from the `METRIC_CLASSES` dictionary have their own JSON test files:
+
+### Text Generation Metrics
+- `exact_match.json` - Exact match metric (2 test cases)
+- `f1_score.json` - F1 score metric (1 test case)
+- `f1_score_macro.json` - F1 score macro metric
+- `f1_score_micro.json` - F1 score micro metric
+- `rouge1.json` - ROUGE1 metric (1 test case)
+- `rouge2.json` - ROUGE2 metric
+- `rougeL.json` - ROUGE-L metric
+- `rougeLsum.json` - ROUGE-Lsum metric
+- `rouge_t5.json` - ROUGE-T5 metric
+- `bert_score.json` - BERT Score metric
+- `bleu.json` - BLEU metric
+- `bleu_1.json` - BLEU-1 metric
+- `bleu_4.json` - BLEU-4 metric
+- `bleurt.json` - BLEURT metric
+- `chrf.json` - ChrF metric
+- `chrf_plus.json` - ChrF+ metric
+- `ter.json` - TER metric
+
+### Perplexity Metrics
+- `bits_per_byte.json` - Bits per byte metric
+- `byte_perplexity.json` - Byte perplexity metric
+- `word_perplexity.json` - Word perplexity metric
+- `prediction_perplexity.json` - Prediction perplexity metric
+- `target_perplexity.json` - Target perplexity metric
+
+### Likelihood Metrics
+- `loglikelihood_acc.json` - Loglikelihood accuracy metric (1 test case)
+- `loglikelihood_f1.json` - Loglikelihood F1 metric
+- `acc_golds_likelihood.json` - Accuracy golds likelihood metric
+
+### Pass-at-k Metrics
+- `pass_at_k.json` - Pass at k metric
+- `pass_at_k_math.json` - Pass at k math metric
+- `pass_at_k_letters.json` - Pass at k letters metric
+- `g_pass_at_k.json` - G-pass at k metric
+- `g_pass_at_k_math.json` - G-pass at k math metric
+- `g_pass_at_k_latex.json` - G-pass at k latex metric
+- `gpqa_instruct_pass_at_k.json` - GPQA instruct pass at k metric
+
+### Other Metrics
+- `recall_at_k.json` - Recall at k metric
+- `mrr.json` - Mean Reciprocal Rank metric
+- `avg_at_k.json` - Average at k metric
+- `avg_at_k_math.json` - Average at k math metric
+- `maj_at_k.json` - Majority at k metric
+- `extractiveness.json` - Extractiveness metric
+- `faithfulness.json` - Faithfulness metric
+- `copyright.json` - Copyright metric
+- `drop.json` - DROP metric
+- `gpqa_instruct_metric.json` - GPQA instruct metric
+- `expr_gold_metric.json` - Expression gold metric
+- `truthfulqa_mc_metrics.json` - TruthfulQA multiple choice metrics
+- `simpleqa_judge.json` - SimpleQA judge metric
+- `multi_f1_numeric.json` - Multi F1 numeric metric
+- `mcc.json` - Matthews Correlation Coefficient metric
+
+## Usage
+
+These test files can be used with the `AutomatedMetricTester` class in `test_metrics_automated.py`:
+
+```python
+tester = AutomatedMetricTester()
+results = tester.run_test_suites_from_file("tests/metrics/test_cases/exact_match.json")
+```
+
+## Adding New Test Cases
+
+To add new test cases for a metric:
+
+1. Open the corresponding JSON file for that metric
+2. Add a new test case object to the `test_cases` array
+3. Follow the same structure as existing test cases
+4. Ensure the `metric_class` matches the metric being tested
diff --git a/tests/unit/metrics/test_cases/acc_golds_likelihood.json b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
new file mode 100644
index 000000000..5d0063739
--- /dev/null
+++ b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75ac8d94b83730e83e9b4b7a3d34ef579a92ca0382f5806a75e469b428215b4c
+size 986
diff --git a/tests/unit/metrics/test_cases/avg_at_k.json b/tests/unit/metrics/test_cases/avg_at_k.json
new file mode 100644
index 000000000..275d0ccb0
--- /dev/null
+++ b/tests/unit/metrics/test_cases/avg_at_k.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:925eaea4ae4fc9a773f5628916524116e666a91ffe15a2949123abd3295ceea1
+size 929
diff --git a/tests/unit/metrics/test_cases/avg_at_k_math.json b/tests/unit/metrics/test_cases/avg_at_k_math.json
new file mode 100644
index 000000000..c62f7f8b1
--- /dev/null
+++ b/tests/unit/metrics/test_cases/avg_at_k_math.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50f538b5160294a12d0340e1e7f0a867e61bb0491d3ea3b66ef8e565e30e1526
+size 959
diff --git a/tests/unit/metrics/test_cases/bert_score.json b/tests/unit/metrics/test_cases/bert_score.json
new file mode 100644
index 000000000..fd9b329e7
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bert_score.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f32c2eae678b162629ee1a17cb11c85e29ed774b19a0e769feb3761266a09a2
+size 929
diff --git a/tests/unit/metrics/test_cases/bits_per_byte.json b/tests/unit/metrics/test_cases/bits_per_byte.json
new file mode 100644
index 000000000..8aa7007e8
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bits_per_byte.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba7c2f558287c1cbed6ec62ce42eee3e3864ce3d59fcf20d20b22b21e94e5a17
+size 954
diff --git a/tests/unit/metrics/test_cases/bleu.json b/tests/unit/metrics/test_cases/bleu.json
new file mode 100644
index 000000000..15e03d907
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleu.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bac803950c223280611f63dda6d0bbc6e78bac0b270a7674429311406ddc5035
+size 891
diff --git a/tests/unit/metrics/test_cases/bleu_1.json b/tests/unit/metrics/test_cases/bleu_1.json
new file mode 100644
index 000000000..238a62928
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleu_1.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7c63beea1027629eb285c861b5850fc04740106a568ecf8d19622163706283e
+size 903
diff --git a/tests/unit/metrics/test_cases/bleu_4.json b/tests/unit/metrics/test_cases/bleu_4.json
new file mode 100644
index 000000000..252c4b02e
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleu_4.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0191660dc5bbdf7dd04cd58b2910ec8c741a93c6252d5cb8c2686382137da073
+size 903
diff --git a/tests/unit/metrics/test_cases/bleurt.json b/tests/unit/metrics/test_cases/bleurt.json
new file mode 100644
index 000000000..fa28d1606
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleurt.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac1081a08f33547bd1158bb4eb535c8ae1dd90d05d1db5de6e99ee21e6abd97c
+size 907
diff --git a/tests/unit/metrics/test_cases/byte_perplexity.json b/tests/unit/metrics/test_cases/byte_perplexity.json
new file mode 100644
index 000000000..88419852d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/byte_perplexity.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4116e450910250997b6a24b4e51149a88cd0f29da2c6a160d9a4e3a05de8b830
+size 968
diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json
new file mode 100644
index 000000000..6d8613f29
--- /dev/null
+++ b/tests/unit/metrics/test_cases/chrf.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e144f94ef8e119ec32454573c11d969090c6ddf0aa85b17354543223b2d1a92
+size 891
diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json
new file mode 100644
index 000000000..fb63d59e4
--- /dev/null
+++ b/tests/unit/metrics/test_cases/chrf_plus.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c87e1da3227bcd0ce18af1463f47c0c19299350ec247b1813233b0cc139de145
+size 923
diff --git a/tests/unit/metrics/test_cases/copyright.json b/tests/unit/metrics/test_cases/copyright.json
new file mode 100644
index 000000000..56c7da7b9
--- /dev/null
+++ b/tests/unit/metrics/test_cases/copyright.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31866d73fe46f534ec8eb8232151657a0f266b7f8251b81d7124dbb2c56da7f4
+size 1007
diff --git a/tests/unit/metrics/test_cases/drop.json b/tests/unit/metrics/test_cases/drop.json
new file mode 100644
index 000000000..9a15ce295
--- /dev/null
+++ b/tests/unit/metrics/test_cases/drop.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7fd23b2a4d60de9ed7e550021a7f943479117d3234c2191b2ba94872fe5c264
+size 1077
diff --git a/tests/unit/metrics/test_cases/exact_match.json b/tests/unit/metrics/test_cases/exact_match.json
new file mode 100644
index 000000000..8f028902b
--- /dev/null
+++ b/tests/unit/metrics/test_cases/exact_match.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:710acbfe499fbe88f152b50efaef99c091813fb529b67dcd602007ea277c3060
+size 1223
diff --git a/tests/unit/metrics/test_cases/expr_gold_metric.json b/tests/unit/metrics/test_cases/expr_gold_metric.json
new file mode 100644
index 000000000..5e360ad51
--- /dev/null
+++ b/tests/unit/metrics/test_cases/expr_gold_metric.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae16455625d67590bdf24fdb28b91684f732952db8110d53145b16295d5883fd
+size 975
diff --git a/tests/unit/metrics/test_cases/extractiveness.json b/tests/unit/metrics/test_cases/extractiveness.json
new file mode 100644
index 000000000..e473d6d8a
--- /dev/null
+++ b/tests/unit/metrics/test_cases/extractiveness.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7603583d63d162186c8e46be3ca4b8ba1dc15afdef99d2009c8172f8360d798e
+size 946
diff --git a/tests/unit/metrics/test_cases/f1_score.json b/tests/unit/metrics/test_cases/f1_score.json
new file mode 100644
index 000000000..507d6806b
--- /dev/null
+++ b/tests/unit/metrics/test_cases/f1_score.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f1e9e4123ac0aabf5588b726c52fd0fa76c9a6a72001eb50eb6549b982e55d1
+size 693
diff --git a/tests/unit/metrics/test_cases/f1_score_macro.json b/tests/unit/metrics/test_cases/f1_score_macro.json
new file mode 100644
index 000000000..219b3815e
--- /dev/null
+++ b/tests/unit/metrics/test_cases/f1_score_macro.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb1c48d29ea568c0b3e1928fc7852f0dc58205ba17bb2caf849d7390e6d52e2
+size 949
diff --git a/tests/unit/metrics/test_cases/f1_score_micro.json b/tests/unit/metrics/test_cases/f1_score_micro.json
new file mode 100644
index 000000000..bffa0896f
--- /dev/null
+++ b/tests/unit/metrics/test_cases/f1_score_micro.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ff067c9e17d82788867c4bff4c4e4fcc9390da0d2d327a5b5c3ec9c4a102fcc
+size 949
diff --git a/tests/unit/metrics/test_cases/faithfulness.json b/tests/unit/metrics/test_cases/faithfulness.json
new file mode 100644
index 000000000..7baddec23
--- /dev/null
+++ b/tests/unit/metrics/test_cases/faithfulness.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:157f820c24bfee8ec961df6d57844fc170c5e52f8a463669918640256f53c361
+size 1022
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k.json b/tests/unit/metrics/test_cases/g_pass_at_k.json
new file mode 100644
index 000000000..b164628e4
--- /dev/null
+++ b/tests/unit/metrics/test_cases/g_pass_at_k.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfd2b8f9b839368eebc90e624081301945d8b4f238b23d2f1aba25328577deab
+size 905
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_latex.json b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
new file mode 100644
index 000000000..c94a9b7c7
--- /dev/null
+++ b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5300d1c0ba4e886e27efa190449b4ef9afc9cae8ad32d7a84259ac0562c04b5
+size 1130
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_math.json b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
new file mode 100644
index 000000000..dcae880bb
--- /dev/null
+++ b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9a23faf6fa94e35e4ef147a08dfcccefcf3d6296e99f51ffa0fd74bebc983a7
+size 1108
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
new file mode 100644
index 000000000..e9b421e91
--- /dev/null
+++ b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11d94ac03ce4c4d4f6704d3f7e12c2569c8cf55bd64f5fc90170c4052fa6ba51
+size 999
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
new file mode 100644
index 000000000..655f270bc
--- /dev/null
+++ b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:357a97f311d4421e6575e96524b119ff02aa04d9e2fb7899ec8e4725a2307f94
+size 1025
diff --git a/tests/unit/metrics/test_cases/loglikelihood_acc.json b/tests/unit/metrics/test_cases/loglikelihood_acc.json
new file mode 100644
index 000000000..3046bb396
--- /dev/null
+++ b/tests/unit/metrics/test_cases/loglikelihood_acc.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e48acb928cc759b938e2f8d3acd5a65b26bbbef39acd100f580f20aa4d75421
+size 721
diff --git a/tests/unit/metrics/test_cases/loglikelihood_f1.json b/tests/unit/metrics/test_cases/loglikelihood_f1.json
new file mode 100644
index 000000000..5deb7a3ae
--- /dev/null
+++ b/tests/unit/metrics/test_cases/loglikelihood_f1.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea1a1da0d5651cca5268172136a7a1951dd6f68c6fda93464fd2ba9dd3e151c7
+size 965
diff --git a/tests/unit/metrics/test_cases/maj_at_k.json b/tests/unit/metrics/test_cases/maj_at_k.json
new file mode 100644
index 000000000..8bbf1c6e8
--- /dev/null
+++ b/tests/unit/metrics/test_cases/maj_at_k.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c0a1a99a62f391296510cc8d7b2c30de6ba9a4cc672a12605ca7d44b73cae29
+size 698
diff --git a/tests/unit/metrics/test_cases/mcc.json b/tests/unit/metrics/test_cases/mcc.json
new file mode 100644
index 000000000..7fe61d007
--- /dev/null
+++ b/tests/unit/metrics/test_cases/mcc.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e689b7971e13f8dcec41c5f873158b32d2e0646feba762fe92405dd0bd39215c
+size 884
diff --git a/tests/unit/metrics/test_cases/mrr.json b/tests/unit/metrics/test_cases/mrr.json
new file mode 100644
index 000000000..654dbbc35
--- /dev/null
+++ b/tests/unit/metrics/test_cases/mrr.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20d4a5e143b068600bc2ad3e345061128c53a90eb8580840fd3da4776f3e989e
+size 884
diff --git a/tests/unit/metrics/test_cases/multi_f1_numeric.json b/tests/unit/metrics/test_cases/multi_f1_numeric.json
new file mode 100644
index 000000000..17d18c1d7
--- /dev/null
+++ b/tests/unit/metrics/test_cases/multi_f1_numeric.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5033944de260bfe4a0fe14eebb87b1e370f9a92d1c54883722134f60fa032d93
+size 961
diff --git a/tests/unit/metrics/test_cases/pass_at_k.json b/tests/unit/metrics/test_cases/pass_at_k.json
new file mode 100644
index 000000000..3fd01b414
--- /dev/null
+++ b/tests/unit/metrics/test_cases/pass_at_k.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:855466ba73e0faf312b68666169a0077fa2308d1aa0410e7b29d4a1a4d328882
+size 936
diff --git a/tests/unit/metrics/test_cases/pass_at_k_letters.json b/tests/unit/metrics/test_cases/pass_at_k_letters.json
new file mode 100644
index 000000000..ed483a09d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/pass_at_k_letters.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4f0439d333537ae8485d4f6e3553eebfd0365db97460bee2f956f8f1d3bc582
+size 984
diff --git a/tests/unit/metrics/test_cases/pass_at_k_math.json b/tests/unit/metrics/test_cases/pass_at_k_math.json
new file mode 100644
index 000000000..967c62406
--- /dev/null
+++ b/tests/unit/metrics/test_cases/pass_at_k_math.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b266f73f7141d0a97568e9e9cc3bb9b75be94b87b566f27e8fa86cdcfa6663d
+size 637
diff --git a/tests/unit/metrics/test_cases/prediction_perplexity.json b/tests/unit/metrics/test_cases/prediction_perplexity.json
new file mode 100644
index 000000000..3afd599e2
--- /dev/null
+++ b/tests/unit/metrics/test_cases/prediction_perplexity.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6772f57e5e8e144a4c24049441c127fce4daded47081327ae064c6613f94779e
+size 992
diff --git a/tests/unit/metrics/test_cases/recall_at_k.json b/tests/unit/metrics/test_cases/recall_at_k.json
new file mode 100644
index 000000000..8c6e4190f
--- /dev/null
+++ b/tests/unit/metrics/test_cases/recall_at_k.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db8df096318bc9d072bda2dd77c2f43a0ab0ce341928453dc18b4791b89e758a
+size 935
diff --git a/tests/unit/metrics/test_cases/rouge1.json b/tests/unit/metrics/test_cases/rouge1.json
new file mode 100644
index 000000000..92d7f945d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rouge1.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:201cc4f2c59de282b3cc9ccac2dfbb080cb17ccda6c89fa497d4d1e7a1e44052
+size 689
diff --git a/tests/unit/metrics/test_cases/rouge2.json b/tests/unit/metrics/test_cases/rouge2.json
new file mode 100644
index 000000000..6f5ab48f9
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rouge2.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da3f20ce95aae69fc9dfb39f6b64ab1cbc9e9d4df75eafaad5fbd755c8e5db19
+size 903
diff --git a/tests/unit/metrics/test_cases/rougeL.json b/tests/unit/metrics/test_cases/rougeL.json
new file mode 100644
index 000000000..a05067c84
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rougeL.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c12497e66af2359af1f9bebcf96aeb495ce15cde9ab71c37279a68c16b2c07db
+size 903
diff --git a/tests/unit/metrics/test_cases/rougeLsum.json b/tests/unit/metrics/test_cases/rougeLsum.json
new file mode 100644
index 000000000..00a91d02d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rougeLsum.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb44e69dbbb59ac026a9b0e356efdd191e0443a633b8d6e70a16e177338d1b5d
+size 924
diff --git a/tests/unit/metrics/test_cases/rouge_t5.json b/tests/unit/metrics/test_cases/rouge_t5.json
new file mode 100644
index 000000000..0798b3ba8
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rouge_t5.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7d7ec4b45e3c67dbd3431c3aa7cde973d994e79d039031febff027f938b0988
+size 989
diff --git a/tests/unit/metrics/test_cases/simpleqa_judge.json b/tests/unit/metrics/test_cases/simpleqa_judge.json
new file mode 100644
index 000000000..9b565d011
--- /dev/null
+++ b/tests/unit/metrics/test_cases/simpleqa_judge.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd3867c275c1afc6a76bdd7aa1cfc4835d4379f5e1b105167c6738a146854d48
+size 953
diff --git a/tests/unit/metrics/test_cases/target_perplexity.json b/tests/unit/metrics/test_cases/target_perplexity.json
new file mode 100644
index 000000000..1c63104e0
--- /dev/null
+++ b/tests/unit/metrics/test_cases/target_perplexity.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f5d79b4c0f5ef2e65a20974d50fe322b57263bc598599d2a7c257d88b30b38e
+size 982
diff --git a/tests/unit/metrics/test_cases/ter.json b/tests/unit/metrics/test_cases/ter.json
new file mode 100644
index 000000000..3bcf09f7c
--- /dev/null
+++ b/tests/unit/metrics/test_cases/ter.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:628eb548f3cff4994449eb6788ca374bec65b3e20b73dd69f58deefe6522e589
+size 884
diff --git a/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
new file mode 100644
index 000000000..131c42c16
--- /dev/null
+++ b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6e70aa07d9fcdbd5020bc81f14f6e7904f88cc36681d5134df0bd5c5808f0a7
+size 1604
diff --git a/tests/unit/metrics/test_cases/word_perplexity.json b/tests/unit/metrics/test_cases/word_perplexity.json
new file mode 100644
index 000000000..6fd35f398
--- /dev/null
+++ b/tests/unit/metrics/test_cases/word_perplexity.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1389311e25a87a629aef62751d274fc55a743564078f2cbb90e67d159fe8a4e5
+size 968
diff --git a/tests/metrics/test_extractive_match.py b/tests/unit/metrics/test_extractive_match.py
similarity index 100%
rename from tests/metrics/test_extractive_match.py
rename to tests/unit/metrics/test_extractive_match.py
diff --git a/tests/metrics/test_metric_requests.py b/tests/unit/metrics/test_metric_requests.py
similarity index 100%
rename from tests/metrics/test_metric_requests.py
rename to tests/unit/metrics/test_metric_requests.py
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
new file mode 100644
index 000000000..e336f1d0b
--- /dev/null
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -0,0 +1,406 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""
+Automated testing framework for LightEval metrics.
+
+This module provides a simple way to test metrics by providing input/output pairs.
+You can define test cases with expected inputs and outputs, and the framework will
+automatically run them and verify the results.
+"""
+
+import json
+import logging
+from dataclasses import field
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.models.model_output import ModelResponse
+from lighteval.tasks.requests import Doc
+
+
+logger = logging.getLogger(__name__)
+
+
+class MetricTestCase(BaseModel):
+    """A test case for a metric with input and expected output."""
+
+    name: str
+    metric_class: str
+    metric_params: Dict[str, Any] = field(default_factory=dict)
+    doc: Dict[str, Any]
+    model_response: Dict[str, Any]
+    expected_output: Union[float, Dict[str, float]]
+    tolerance: float = 1e-2
+    description: Optional[str] = None
+
+
+class MetricTestSuite(BaseModel):
+    """A collection of test cases for metrics."""
+
+    name: str
+    test_cases: List[MetricTestCase]
+    description: Optional[str] = None
+
+
+class AutomatedMetricTester:
+    """Automated testing framework for LightEval metrics."""
+
+    # Mapping of metric names to Metrics enum values
+    METRIC_CLASSES = {
+        # Map metric names to their corresponding Metrics enum values
+        "exact_match": Metrics.exact_match,
+        "f1_score": Metrics.f1_score,
+        "loglikelihood_acc": Metrics.loglikelihood_acc,
+        "recall_at_k": Metrics.recall_at_k,
+        "mrr": Metrics.mrr,
+        "rouge1": Metrics.rouge1,
+        "rouge2": Metrics.rouge2,
+        "rougeL": Metrics.rougeL,
+        "rougeLsum": Metrics.rougeLsum,
+        "rouge_t5": Metrics.rouge_t5,
+        "extractiveness": Metrics.extractiveness,
+        "bleurt": Metrics.bleurt,
+        "copyright": Metrics.copyright,
+        "drop": Metrics.drop,
+        "avg_at_k": Metrics.avg_at_k,
+        "avg_at_k_math": Metrics.avg_at_k_math,
+        "g_pass_at_k": Metrics.g_pass_at_k,
+        "g_pass_at_k_math": Metrics.g_pass_at_k_math,
+        "g_pass_at_k_latex": Metrics.g_pass_at_k_latex,
+        "maj_at_k": Metrics.maj_at_k,
+        "pass_at_k": Metrics.pass_at_k,
+        "pass_at_k_math": Metrics.pass_at_k_math,
+        "pass_at_k_letters": Metrics.pass_at_k_letters,
+        "gpqa_instruct_metric": Metrics.gpqa_instruct_metric,
+        "gpqa_instruct_pass_at_k": Metrics.gpqa_instruct_pass_at_k,
+        "expr_gold_metric": Metrics.expr_gold_metric,
+        "acc_golds_likelihood": Metrics.acc_golds_likelihood,
+        "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,
+        # "faithfulness": Metrics.faithfulness, issue with tokenizer
+        # "prediction_perplexity": Metrics.prediction_perplexity,
+        # "target_perplexity": Metrics.target_perplexity,
+        # "bert_score": Metrics.bert_score, issue with the scoring function, int too big to convert
+        # "simpleqa_judge": Metrics.simpleqa_judge, Batched metrics not supported yet
+        # "bleu": Metrics.bleu,
+        # "bleu_1": Metrics.bleu_1,
+        # "bleu_4": Metrics.bleu_4,
+        # "bits_per_byte": Metrics.bits_per_byte,
+        # "byte_perplexity": Metrics.byte_perplexity,
+        # "chrf": Metrics.chrf,
+        # "chrf_plus": Metrics.chrf_plus,
+        # "loglikelihood_f1": Metrics.loglikelihood_f1,
+        # "multi_f1_numeric": Metrics.multi_f1_numeric,
+        # "ter": Metrics.ter,
+        # "word_perplexity": Metrics.word_perplexity,
+        # "f1_score_macro": Metrics.f1_score_macro,
+        # "f1_score_micro": Metrics.f1_score_micro,
+        # "mcc": Metrics.mcc,
+    }
+
+    def __init__(self):
+        self.test_results = []
+
+    def create_doc_from_dict(self, doc_dict: Dict[str, Any]) -> Doc:
+        """Create a Doc object from a dictionary representation."""
+        return Doc(
+            query=doc_dict.get("query", ""),
+            choices=doc_dict.get("choices", []),
+            gold_index=doc_dict.get("gold_index", 0),
+            task_name=doc_dict.get("task_name", "test"),
+            specific=doc_dict.get("specific", {}),
+        )
+
+    def create_model_response_from_dict(self, response_dict: Dict[str, Any]) -> ModelResponse:
+        """Create a ModelResponse object from a dictionary representation."""
+        return ModelResponse(
+            text=response_dict.get("text", []),
+            logprobs=response_dict.get("logprobs", []),
+            output_tokens=response_dict.get("output_tokens", []),
+            argmax_logits_eq_gold=response_dict.get("argmax_logits_eq_gold", []),
+        )
+
+    def instantiate_metric(self, metric_class: str, metric_params: Dict[str, Any]):
+        """Get a metric from the Metrics enum with the given parameters."""
+        if metric_class not in self.METRIC_CLASSES:
+            raise ValueError(f"Unknown metric class: {metric_class}")
+
+        # Get the metric from the Metrics enum
+        if metric_params != {}:
+            metric_enum_value = self.METRIC_CLASSES[metric_class].value(metric_params)
+        else:
+            metric_enum_value = self.METRIC_CLASSES[metric_class].value
+
+        # The Metrics enum values are already instantiated, so we just return them
+        # The metric_params are ignored for now since the Metrics enum values are pre-configured
+        return metric_enum_value
+
+    def run_test_case(self, test_case: MetricTestCase) -> Dict[str, Any]:
+        """Run a single test case and return the result."""
+        try:
+            # Check if metric is available in METRIC_CLASSES
+            if test_case.metric_class not in self.METRIC_CLASSES:
+                return {
+                    "test_case": test_case.name,
+                    "success": True,  # Mark as success to skip
+                    "expected": test_case.expected_output,
+                    "actual": None,
+                    "error": None,
+                    "skipped": True,
+                    "skip_reason": f"Metric '{test_case.metric_class}' not available in METRIC_CLASSES",
+                }
+
+            # Get the metric from the Metrics enum
+            metric = self.instantiate_metric(test_case.metric_class, test_case.metric_params)
+
+            # Create input objects
+            doc = self.create_doc_from_dict(test_case.doc)
+            model_response = self.create_model_response_from_dict(test_case.model_response)
+
+            # Create sample_params for the metric
+            sample_params = {
+                "doc": doc,
+                "model_response": model_response,
+            }
+
+            # Run the metric using the Metrics enum value
+            actual_output = metric.compute_sample(**sample_params)
+
+            # Compare with expected output
+            success = self._compare_dict_outputs(actual_output, test_case.expected_output, test_case.tolerance)
+            return {
+                "test_case": test_case.name,
+                "success": success,
+                "expected": test_case.expected_output,
+                "actual": actual_output,
+                "error": None,
+                "skipped": False,
+            }
+
+        except Exception as e:
+            return {
+                "test_case": test_case.name,
+                "success": False,
+                "expected": test_case.expected_output,
+                "actual": None,
+                "error": str(e),
+                "skipped": False,
+            }
+
+    def _compare_scalar_outputs(self, actual: Any, expected: float, tolerance: float) -> bool:
+        """Compare scalar outputs with tolerance."""
+        if isinstance(actual, (int, float)) and isinstance(expected, (int, float)):
+            return abs(actual - expected) <= tolerance
+        return actual == expected
+
+    def _compare_dict_outputs(self, actual: Dict[str, Any], expected: Dict[str, float], tolerance: float) -> bool:
+        """Compare dictionary outputs with tolerance."""
+        if not isinstance(actual, dict) or not isinstance(expected, dict):
+            return actual == expected
+
+        if set(actual.keys()) != set(expected.keys()):
+            return False
+
+        for key in actual.keys():
+            actual_value = actual[key]
+            expected_value = expected[key]
+
+            # Handle corpus metric inputs (objects with specific types)
+            if hasattr(actual_value, "__class__") and "CorpusMetricInput" in str(actual_value.__class__):
+                # For corpus metric inputs, just check that the key exists and the object is created
+                continue
+            elif hasattr(actual_value, "__class__") and "np.float64" in str(actual_value.__class__):
+                # For numpy float64 values, convert to regular float for comparison
+                actual_value = float(actual_value)
+
+            if not self._compare_scalar_outputs(actual_value, expected_value, tolerance):
+                return False
+
+        return True
+
+    def run_test_suite(self, test_suite: MetricTestSuite) -> List[Dict[str, Any]]:
+        """Run a complete test suite and return results."""
+        logger.info(f"Running test suite: {test_suite.name}")
+        if test_suite.description:
+            logger.info(f"Description: {test_suite.description}")
+
+        results = []
+        for test_case in test_suite.test_cases:
+            result = self.run_test_case(test_case)
+            results.append(result)
+
+            if result.get("skipped", False):
+                logger.info(f"⏭ {test_case.name}: SKIPPED - {result.get('skip_reason', 'Unknown reason')}")
+            elif result["success"]:
+                logger.info(f"✓ {test_case.name}: PASSED")
+            else:
+                logger.error(f"✗ {test_case.name}: FAILED")
+                if result["error"]:
+                    logger.error(f"  Error: {result['error']}")
+                else:
+                    logger.error(f"  Expected: {result['expected']}")
+                    logger.error(f"  Actual: {result['actual']}")
+
+        return results
+
+    def run_test_suites_from_file(self, file_path: Union[str, Path]) -> List[Dict[str, Any]]:
+        """Run test suites from a JSON file."""
+        with open(file_path, "r") as f:
+            data = json.load(f)
+
+        if isinstance(data, list):
+            # Multiple test suites
+            all_results = []
+            for suite_data in data:
+                test_suite = MetricTestSuite(**suite_data)
+                results = self.run_test_suite(test_suite)
+                all_results.extend(results)
+            return all_results
+        else:
+            # Single test suite
+            test_suite = MetricTestSuite(**data)
+            return self.run_test_suite(test_suite)
+
+    def save_test_suite_to_file(self, test_suite: MetricTestSuite, file_path: Union[str, Path]):
+        """Save a test suite to a JSON file."""
+        with open(file_path, "w") as f:
+            json.dump(test_suite.dict(), f, indent=2)
+
+    def create_example_test_suite(self) -> MetricTestSuite:
+        """Create an example test suite with various metrics."""
+        return MetricTestSuite(
+            name="Example Test Suite",
+            description="Example test cases for various metrics",
+            test_cases=[
+                MetricTestCase(
+                    name="Exact Match - Perfect Match",
+                    metric_class="exact_match",
+                    metric_params={},
+                    doc={
+                        "query": "What is the capital of France?",
+                        "choices": ["Paris", "London", "Berlin"],
+                        "gold_index": 0,
+                        "task_name": "test",
+                    },
+                    model_response={
+                        "text": ["Paris"],
+                        "logprobs": [],
+                        "output_tokens": [],
+                    },
+                    expected_output={"em": 1.0},
+                    description="Test exact match with perfect prediction",
+                ),
+                MetricTestCase(
+                    name="Exact Match - No Match",
+                    metric_class="exact_match",
+                    metric_params={},
+                    doc={
+                        "query": "What is the capital of France?",
+                        "choices": ["Paris", "London", "Berlin"],
+                        "gold_index": 0,
+                        "task_name": "test",
+                    },
+                    model_response={
+                        "text": ["London"],
+                        "logprobs": [],
+                        "output_tokens": [],
+                    },
+                    expected_output={"em": 0.0},
+                    description="Test exact match with wrong prediction",
+                ),
+                MetricTestCase(
+                    name="F1 Score - Good Match",
+                    metric_class="f1_score",
+                    metric_params={},
+                    doc={
+                        "query": "Summarize the text",
+                        "choices": ["The quick brown fox jumps over the lazy dog"],
+                        "gold_index": 0,
+                        "task_name": "test",
+                    },
+                    model_response={
+                        "text": ["The quick brown fox jumps over the lazy dog"],
+                        "logprobs": [],
+                        "output_tokens": [],
+                    },
+                    expected_output={"f1": 1.0},
+                    description="Test F1 score with perfect match",
+                ),
+                MetricTestCase(
+                    name="Loglikelihood Accuracy - Correct Choice",
+                    metric_class="loglikelihood_acc",
+                    metric_params={},
+                    doc={
+                        "query": "Choose the correct answer",
+                        "choices": ["A", "B", "C"],
+                        "gold_index": 0,
+                        "task_name": "test",
+                    },
+                    model_response={
+                        "text": ["A"],
+                        "logprobs": [0.5, 0.3, 0.2],  # A has highest logprob
+                        "output_tokens": [[1], [2], [3]],
+                    },
+                    expected_output={"acc": 1},
+                    description="Test loglikelihood accuracy with correct choice",
+                ),
+                MetricTestCase(
+                    name="ROUGE Score",
+                    metric_class="rouge1",
+                    metric_params={"methods": ["rouge1"]},
+                    doc={
+                        "query": "Summarize the text",
+                        "choices": ["The quick brown fox jumps over the lazy dog"],
+                        "gold_index": 0,
+                        "task_name": "test",
+                    },
+                    model_response={
+                        "text": ["The quick brown fox jumps over the lazy dog"],
+                        "logprobs": [],
+                        "output_tokens": [],
+                    },
+                    expected_output={"rouge1": 1.0},
+                    description="Test ROUGE score with perfect match",
+                ),
+            ],
+        )
+
+
+if __name__ == "__main__":
+    # Example usage
+    tester = AutomatedMetricTester()
+
+    # Create and run example test suite
+    example_suite = tester.create_example_test_suite()
+    results = tester.run_test_suite(example_suite)
+
+    # Print summary
+    passed = sum(1 for r in results if r["success"])
+    total = len(results)
+    print(f"\nTest Summary: {passed}/{total} tests passed")
+
+    # Save example test suite to file
+    tester.save_test_suite_to_file(example_suite, "example_test_suite.json")
+    print("Example test suite saved to example_test_suite.json")
diff --git a/tests/metrics/test_normalizations.py b/tests/unit/metrics/test_normalizations.py
similarity index 100%
rename from tests/metrics/test_normalizations.py
rename to tests/unit/metrics/test_normalizations.py
diff --git a/tests/unit/metrics/test_unit_harness_metrics.py b/tests/unit/metrics/test_unit_harness_metrics.py
new file mode 100644
index 000000000..6d1764593
--- /dev/null
+++ b/tests/unit/metrics/test_unit_harness_metrics.py
@@ -0,0 +1,139 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import os
+
+import pytest
+
+from lighteval.metrics import apply_metric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.sample_preparator import (
+    GenerativeCorpusMetricInput,
+    LogprobCorpusMetricInput,
+    PerplexityCorpusMetricInput,
+)
+from lighteval.models.model_output import ModelResponse
+from lighteval.tasks.requests import Doc
+from lighteval.utils.utils import as_list
+
+
+PATH_TO_HARNESS_METRICS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_metrics.json")
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc):
+    """Initializes the main test setup. This function is automatically called by pytest and
+    should not be called manually.
+
+    Every function with "model_input" as arguments will be sent the "parameters".
+    This function will be run only once, ensuring that each model is run only once on the selected tasks.
+    (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want).
+    """
+    parameters = []
+
+    # If model_input is a test function argument
+    # (= the function requires a fixture)
+    if "prompt_inputs" in metafunc.fixturenames:
+        with open(PATH_TO_HARNESS_METRICS) as f:
+            metric_to_examples = json.load(f)
+
+            for metric, examples in metric_to_examples.items():
+                for task_name, examples_list in examples.items():
+                    parameters.append((metric, task_name, examples_list))
+        metafunc.parametrize("prompt_inputs", parameters, scope="session")
+
+
+def test_model_prediction(prompt_inputs: tuple[str, str, list]):  # noqa: C901
+    """Evaluates a model on a full task - is parametrized using pytest_generate_test"""
+    metric, task_name, examples = prompt_inputs
+    metric_name = metric
+    metric = Metrics[metric].value
+
+    for example in examples:
+        doc = {
+            k: v
+            for k, v in example.items()
+            if k in ["full_prompt", "choices", "gold_index", "original_query", "specific"]
+        }
+        doc["query"] = doc.pop("full_prompt")
+        doc = Doc(**doc)
+        error_msg = f"Metric {metric_name} failed on input {doc} from task {task_name}.\n"
+
+        match example["predictions"]:
+            case [first_element, *_] if isinstance(first_element, str):
+                # If the predictions are a list of strings, we assume it's a generative task
+                responses = [ModelResponse(text=example["predictions"], output_tokens=[[]], input_tokens=[])]
+            case [first_element, *_] if isinstance(first_element, float):
+                # If the predictions are a list of floats, we assume it's a logprob task
+                responses = [ModelResponse(logprobs=example["predictions"], output_tokens=[[]], input_tokens=[])]
+            case [first_element, *_] if len(first_element) == 2 and isinstance(first_element[1], bool):
+                # If the predictions are a list of lists with two elements, we assume it's a loglikelihood task with argmax
+                responses = [
+                    ModelResponse(
+                        logprobs=[pred[0] for pred in example["predictions"]],
+                        argmax_logits_eq_gold=[pred[1] for pred in example["predictions"]],
+                        output_tokens=[[]],
+                        input_tokens=[],
+                    )
+                ]
+            case _:
+                # If the predictions are not a list of strings or floats, we assume it's a custom task
+                responses = [ModelResponse(logprobs=example["predictions"][0], input_tokens=[])]
+
+        results = apply_metric(responses=responses, docs=[doc], metrics=[metric])[0]
+        assert responses is not None, error_msg
+
+        metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()}
+
+        metric_reference = {k: example[k] for k in results.keys()}
+        error_msg += f"Prediction: {results}\n"
+        error_msg += f"Reference: {metric_reference}\n"
+        error_msg += f"Returned : {metric_result}"
+
+        for key in metric_result.keys():
+            if type(metric_result[key]) in [
+                LogprobCorpusMetricInput,
+                GenerativeCorpusMetricInput,
+                PerplexityCorpusMetricInput,
+            ]:
+                cur_result_list = as_list(metric_result[key].to_dict())
+            else:
+                cur_result_list = as_list(metric_result[key])
+            cur_ref_list = as_list(metric_reference[key])
+
+            # item wise comparison of lists
+            if isinstance(cur_result_list[0], list):
+                for res, ref in zip(cur_result_list, cur_ref_list):
+                    try:
+                        assert res == pytest.approx(ref, rel=1e-8), error_msg
+                    except Exception:
+                        assert False, (
+                            key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
+                        )
+            else:
+                try:
+                    assert cur_result_list == pytest.approx(cur_ref_list, rel=1e-8), error_msg
+                except Exception:
+                    # assert False, error_msg + "\n" + str(e)
+                    assert False, (
+                        key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
+                    )

From bf252114cbbca7d8bea63c43a7da07d1c10661df Mon Sep 17 00:00:00 2001
From: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Date: Wed, 27 Aug 2025 16:12:39 +0200
Subject: [PATCH 04/26] Delete tests/unit/metrics/test_cases/README.md

---
 tests/unit/metrics/test_cases/README.md | 116 ------------------------
 1 file changed, 116 deletions(-)
 delete mode 100644 tests/unit/metrics/test_cases/README.md

diff --git a/tests/unit/metrics/test_cases/README.md b/tests/unit/metrics/test_cases/README.md
deleted file mode 100644
index 3010cf1d2..000000000
--- a/tests/unit/metrics/test_cases/README.md
+++ /dev/null
@@ -1,116 +0,0 @@
-# Metric Test Cases
-
-This directory contains individual JSON files for each metric tested in the LightEval framework. Each file contains all test cases for a specific metric.
-
-## Structure
-
-Each JSON file follows this structure:
-
-```json
-{
-  "name": "Metric Name Test Suite",
-  "description": "Description of the test suite",
-  "test_cases": [
-    {
-      "name": "Test Case Name",
-      "metric_class": "metric_name",
-      "metric_params": {},
-      "doc": {
-        "query": "Input query",
-        "choices": ["choice1", "choice2", "choice3"],
-        "gold_index": 0,
-        "task_name": "test"
-      },
-      "model_response": {
-        "text": ["model_output"],
-        "logprobs": [],
-        "output_tokens": []
-      },
-      "expected_output": {
-        "metric_key": expected_value
-      },
-      "tolerance": 0.01,
-      "description": "Test case description"
-    }
-  ]
-}
-```
-
-## Available Test Files
-
-All 47 metrics from the `METRIC_CLASSES` dictionary have their own JSON test files:
-
-### Text Generation Metrics
-- `exact_match.json` - Exact match metric (2 test cases)
-- `f1_score.json` - F1 score metric (1 test case)
-- `f1_score_macro.json` - F1 score macro metric
-- `f1_score_micro.json` - F1 score micro metric
-- `rouge1.json` - ROUGE1 metric (1 test case)
-- `rouge2.json` - ROUGE2 metric
-- `rougeL.json` - ROUGE-L metric
-- `rougeLsum.json` - ROUGE-Lsum metric
-- `rouge_t5.json` - ROUGE-T5 metric
-- `bert_score.json` - BERT Score metric
-- `bleu.json` - BLEU metric
-- `bleu_1.json` - BLEU-1 metric
-- `bleu_4.json` - BLEU-4 metric
-- `bleurt.json` - BLEURT metric
-- `chrf.json` - ChrF metric
-- `chrf_plus.json` - ChrF+ metric
-- `ter.json` - TER metric
-
-### Perplexity Metrics
-- `bits_per_byte.json` - Bits per byte metric
-- `byte_perplexity.json` - Byte perplexity metric
-- `word_perplexity.json` - Word perplexity metric
-- `prediction_perplexity.json` - Prediction perplexity metric
-- `target_perplexity.json` - Target perplexity metric
-
-### Likelihood Metrics
-- `loglikelihood_acc.json` - Loglikelihood accuracy metric (1 test case)
-- `loglikelihood_f1.json` - Loglikelihood F1 metric
-- `acc_golds_likelihood.json` - Accuracy golds likelihood metric
-
-### Pass-at-k Metrics
-- `pass_at_k.json` - Pass at k metric
-- `pass_at_k_math.json` - Pass at k math metric
-- `pass_at_k_letters.json` - Pass at k letters metric
-- `g_pass_at_k.json` - G-pass at k metric
-- `g_pass_at_k_math.json` - G-pass at k math metric
-- `g_pass_at_k_latex.json` - G-pass at k latex metric
-- `gpqa_instruct_pass_at_k.json` - GPQA instruct pass at k metric
-
-### Other Metrics
-- `recall_at_k.json` - Recall at k metric
-- `mrr.json` - Mean Reciprocal Rank metric
-- `avg_at_k.json` - Average at k metric
-- `avg_at_k_math.json` - Average at k math metric
-- `maj_at_k.json` - Majority at k metric
-- `extractiveness.json` - Extractiveness metric
-- `faithfulness.json` - Faithfulness metric
-- `copyright.json` - Copyright metric
-- `drop.json` - DROP metric
-- `gpqa_instruct_metric.json` - GPQA instruct metric
-- `expr_gold_metric.json` - Expression gold metric
-- `truthfulqa_mc_metrics.json` - TruthfulQA multiple choice metrics
-- `simpleqa_judge.json` - SimpleQA judge metric
-- `multi_f1_numeric.json` - Multi F1 numeric metric
-- `mcc.json` - Matthews Correlation Coefficient metric
-
-## Usage
-
-These test files can be used with the `AutomatedMetricTester` class in `test_metrics_automated.py`:
-
-```python
-tester = AutomatedMetricTester()
-results = tester.run_test_suites_from_file("tests/metrics/test_cases/exact_match.json")
-```
-
-## Adding New Test Cases
-
-To add new test cases for a metric:
-
-1. Open the corresponding JSON file for that metric
-2. Add a new test case object to the `test_cases` array
-3. Follow the same structure as existing test cases
-4. Ensure the `metric_class` matches the metric being tested

From 2b65d084a10efe8d4c4b0fc48ea12be2a0d993f2 Mon Sep 17 00:00:00 2001
From: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Date: Wed, 27 Aug 2025 16:14:03 +0200
Subject: [PATCH 05/26] Delete tests/unit/metrics/test_unit_harness_metrics.py

---
 .../unit/metrics/test_unit_harness_metrics.py | 139 ------------------
 1 file changed, 139 deletions(-)
 delete mode 100644 tests/unit/metrics/test_unit_harness_metrics.py

diff --git a/tests/unit/metrics/test_unit_harness_metrics.py b/tests/unit/metrics/test_unit_harness_metrics.py
deleted file mode 100644
index 6d1764593..000000000
--- a/tests/unit/metrics/test_unit_harness_metrics.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import json
-import os
-
-import pytest
-
-from lighteval.metrics import apply_metric
-from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.sample_preparator import (
-    GenerativeCorpusMetricInput,
-    LogprobCorpusMetricInput,
-    PerplexityCorpusMetricInput,
-)
-from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.requests import Doc
-from lighteval.utils.utils import as_list
-
-
-PATH_TO_HARNESS_METRICS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_metrics.json")
-
-
-def pytest_generate_tests(metafunc: pytest.Metafunc):
-    """Initializes the main test setup. This function is automatically called by pytest and
-    should not be called manually.
-
-    Every function with "model_input" as arguments will be sent the "parameters".
-    This function will be run only once, ensuring that each model is run only once on the selected tasks.
-    (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want).
-    """
-    parameters = []
-
-    # If model_input is a test function argument
-    # (= the function requires a fixture)
-    if "prompt_inputs" in metafunc.fixturenames:
-        with open(PATH_TO_HARNESS_METRICS) as f:
-            metric_to_examples = json.load(f)
-
-            for metric, examples in metric_to_examples.items():
-                for task_name, examples_list in examples.items():
-                    parameters.append((metric, task_name, examples_list))
-        metafunc.parametrize("prompt_inputs", parameters, scope="session")
-
-
-def test_model_prediction(prompt_inputs: tuple[str, str, list]):  # noqa: C901
-    """Evaluates a model on a full task - is parametrized using pytest_generate_test"""
-    metric, task_name, examples = prompt_inputs
-    metric_name = metric
-    metric = Metrics[metric].value
-
-    for example in examples:
-        doc = {
-            k: v
-            for k, v in example.items()
-            if k in ["full_prompt", "choices", "gold_index", "original_query", "specific"]
-        }
-        doc["query"] = doc.pop("full_prompt")
-        doc = Doc(**doc)
-        error_msg = f"Metric {metric_name} failed on input {doc} from task {task_name}.\n"
-
-        match example["predictions"]:
-            case [first_element, *_] if isinstance(first_element, str):
-                # If the predictions are a list of strings, we assume it's a generative task
-                responses = [ModelResponse(text=example["predictions"], output_tokens=[[]], input_tokens=[])]
-            case [first_element, *_] if isinstance(first_element, float):
-                # If the predictions are a list of floats, we assume it's a logprob task
-                responses = [ModelResponse(logprobs=example["predictions"], output_tokens=[[]], input_tokens=[])]
-            case [first_element, *_] if len(first_element) == 2 and isinstance(first_element[1], bool):
-                # If the predictions are a list of lists with two elements, we assume it's a loglikelihood task with argmax
-                responses = [
-                    ModelResponse(
-                        logprobs=[pred[0] for pred in example["predictions"]],
-                        argmax_logits_eq_gold=[pred[1] for pred in example["predictions"]],
-                        output_tokens=[[]],
-                        input_tokens=[],
-                    )
-                ]
-            case _:
-                # If the predictions are not a list of strings or floats, we assume it's a custom task
-                responses = [ModelResponse(logprobs=example["predictions"][0], input_tokens=[])]
-
-        results = apply_metric(responses=responses, docs=[doc], metrics=[metric])[0]
-        assert responses is not None, error_msg
-
-        metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()}
-
-        metric_reference = {k: example[k] for k in results.keys()}
-        error_msg += f"Prediction: {results}\n"
-        error_msg += f"Reference: {metric_reference}\n"
-        error_msg += f"Returned : {metric_result}"
-
-        for key in metric_result.keys():
-            if type(metric_result[key]) in [
-                LogprobCorpusMetricInput,
-                GenerativeCorpusMetricInput,
-                PerplexityCorpusMetricInput,
-            ]:
-                cur_result_list = as_list(metric_result[key].to_dict())
-            else:
-                cur_result_list = as_list(metric_result[key])
-            cur_ref_list = as_list(metric_reference[key])
-
-            # item wise comparison of lists
-            if isinstance(cur_result_list[0], list):
-                for res, ref in zip(cur_result_list, cur_ref_list):
-                    try:
-                        assert res == pytest.approx(ref, rel=1e-8), error_msg
-                    except Exception:
-                        assert False, (
-                            key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
-                        )
-            else:
-                try:
-                    assert cur_result_list == pytest.approx(cur_ref_list, rel=1e-8), error_msg
-                except Exception:
-                    # assert False, error_msg + "\n" + str(e)
-                    assert False, (
-                        key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
-                    )

From 594b9423a3a862005df3791d85758343fb9255e3 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 27 Aug 2025 14:15:26 +0000
Subject: [PATCH 06/26] add pip as test dependency, for spacy to work correctly

---
 pyproject.toml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 797a7f36b..15d28a403 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,14 +58,11 @@ dependencies = [
     "accelerate",
     "huggingface_hub[hf_xet]>=0.30.2",
     "torch>=2.0,<3.0",
-    "GitPython>=3.1.41",
-    # for logging
+    "GitPython>=3.1.41", # for logging
     "datasets>=4.0.0",
     "pydantic",
-    "numpy>=2",
-    # pinned to avoid incompatibilities
-    "hf-xet>=1.1.8",
-    # pinned to avoid failing test suite
+    "numpy>=2",  # pinned to avoid incompatibilities
+    "hf-xet>=1.1.8",  # pinned to avoid failing test suite
     # Prettiness
     "typer",
     "termcolor==2.3.0",
@@ -85,7 +82,6 @@ dependencies = [
     "fsspec>=2023.12.2",
     "httpx>=0.27.2",
     "latex2sympy2_extended==1.0.6",
-    "pip>=25.2",
 ]
 
 [project.optional-dependencies]
@@ -101,7 +97,7 @@ nanotron = [
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.10.0", "ray", "more_itertools"]
 quality = ["ruff>=v0.11.0","pre-commit"]
-tests = ["pytest>=7.4.0","deepdiff"]
+tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [

From 9f7c2be565e5aa65bbcb0e593ea9ee98ccab160f Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Thu, 28 Aug 2025 10:03:01 +0000
Subject: [PATCH 07/26] fix tests and reorg files

---
 src/lighteval/metrics/metrics_sample.py       |   2 +-
 src/lighteval/tasks/extended/lcb/main.py      |   1 +
 tests/slow_tests/test_sglang_model.py         | 101 ++++++
 tests/test_unit_base_metrics.py               | 340 ------------------
 tests/test_unit_harness_metrics.py            | 139 -------
 tests/test_unit_harness_prompts.py            |  75 ----
 .../logging/test_evaluation_tracker.py        |   0
 .../models/endpoints/test_endpoint_model.py   |   0
 .../models/endpoints/test_tgi_model.py        |   0
 .../{ => unit}/models/test_abstract_model.py  |   0
 tests/{ => unit}/models/test_base_model.py    |   0
 tests/{ => unit}/models/test_model_input.py   |   0
 tests/{ => unit}/models/test_model_utils.py   |   0
 .../models/test_transformers_model.py         |   0
 .../{ => unit}/models/vllm/test_vllm_model.py |   0
 .../pipeline/test_reasoning_tags.py           |   0
 .../{ => unit/prompt}/test_prompt_manager.py  |   0
 .../prompt}/test_prompt_manager_class.py      |   0
 .../tasks/templates/test_continuation.py      |   0
 .../tasks/templates/test_copa.py              |   0
 .../tasks/templates/test_hellaswag.py         |   0
 .../tasks/templates/test_multichoice.py       |   0
 .../{metrics => }/tasks/templates/test_nli.py |   0
 .../tasks/templates/test_translation.py       |   0
 .../tasks/test_lighteval_task.py              |   0
 .../unit/{metrics => }/tasks/test_registry.py |   6 +-
 tests/{ => unit}/test_unit_reorder.py         |   0
 tests/{ => unit}/utils/test_caching.py        |   0
 tests/{ => unit}/utils/test_utils.py          |   0
 29 files changed, 106 insertions(+), 558 deletions(-)
 create mode 100644 tests/slow_tests/test_sglang_model.py
 delete mode 100644 tests/test_unit_base_metrics.py
 delete mode 100644 tests/test_unit_harness_metrics.py
 delete mode 100644 tests/test_unit_harness_prompts.py
 rename tests/{ => unit}/logging/test_evaluation_tracker.py (100%)
 rename tests/{ => unit}/models/endpoints/test_endpoint_model.py (100%)
 rename tests/{ => unit}/models/endpoints/test_tgi_model.py (100%)
 rename tests/{ => unit}/models/test_abstract_model.py (100%)
 rename tests/{ => unit}/models/test_base_model.py (100%)
 rename tests/{ => unit}/models/test_model_input.py (100%)
 rename tests/{ => unit}/models/test_model_utils.py (100%)
 rename tests/{ => unit}/models/test_transformers_model.py (100%)
 rename tests/{ => unit}/models/vllm/test_vllm_model.py (100%)
 rename tests/{ => unit}/pipeline/test_reasoning_tags.py (100%)
 rename tests/{ => unit/prompt}/test_prompt_manager.py (100%)
 rename tests/{ => unit/prompt}/test_prompt_manager_class.py (100%)
 rename tests/unit/{metrics => }/tasks/templates/test_continuation.py (100%)
 rename tests/unit/{metrics => }/tasks/templates/test_copa.py (100%)
 rename tests/unit/{metrics => }/tasks/templates/test_hellaswag.py (100%)
 rename tests/unit/{metrics => }/tasks/templates/test_multichoice.py (100%)
 rename tests/unit/{metrics => }/tasks/templates/test_nli.py (100%)
 rename tests/unit/{metrics => }/tasks/templates/test_translation.py (100%)
 rename tests/unit/{metrics => }/tasks/test_lighteval_task.py (100%)
 rename tests/unit/{metrics => }/tasks/test_registry.py (96%)
 rename tests/{ => unit}/test_unit_reorder.py (100%)
 rename tests/{ => unit}/utils/test_caching.py (100%)
 rename tests/{ => unit}/utils/test_utils.py (100%)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 3b3e6288e..17179899e 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1218,7 +1218,7 @@ def compute(self, doc: Doc, model_response: ModelResponse):
         if len(golds) > 1:
             raise Exception("Cannot compute maj@k with several golds")
 
-        processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
+        processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
         new_doc = Doc(
             choices=processed_choices,
             query=doc.query,
diff --git a/src/lighteval/tasks/extended/lcb/main.py b/src/lighteval/tasks/extended/lcb/main.py
index ad49235fb..8ec526f64 100644
--- a/src/lighteval/tasks/extended/lcb/main.py
+++ b/src/lighteval/tasks/extended/lcb/main.py
@@ -113,6 +113,7 @@ def codegen_metric(model_response: ModelResponse, doc: Doc, **kwargs) -> float:
     higher_is_better=True,
     sample_level_fn=codegen_metric,
     corpus_level_fn=np.mean,
+    batched_compute=False,
 )
 
 
diff --git a/tests/slow_tests/test_sglang_model.py b/tests/slow_tests/test_sglang_model.py
new file mode 100644
index 000000000..c98b364ed
--- /dev/null
+++ b/tests/slow_tests/test_sglang_model.py
@@ -0,0 +1,101 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import os
+from functools import lru_cache, partial
+from typing import Callable, Tuple
+
+import pytest
+from deepdiff import DeepDiff
+
+from lighteval.main_sglang import sglang  # noqa: E402
+
+
+# Set env var for deterministic run of models
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+
+MODELS_ARGS = [
+    {
+        "model_name": "examples/model_configs/sglang_model_config.yaml",
+        "use_chat_template": True,
+        "results_file": "tests/reference_scores/Mistral-7B-Instruct-results-sglang.json",
+    }
+]
+
+TASKS_PATH = "examples/test_tasks.txt"
+CUSTOM_TASKS_PATH = "examples/custom_tasks_tests.py"
+
+ModelInput = Tuple[str, Callable[[], dict]]
+
+
+@lru_cache(maxsize=len(MODELS_ARGS))
+def run_model(model_name: str, use_chat_template: bool):
+    """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism"""
+    results = sglang(
+        model_args=model_name,
+        tasks=TASKS_PATH,
+        use_chat_template=use_chat_template,
+        output_dir="",
+        dataset_loading_processes=1,
+        save_details=False,
+        max_samples=10,
+        custom_tasks=CUSTOM_TASKS_PATH,
+    )
+    return results
+
+
+def generate_tests() -> list[ModelInput]:
+    """Generate test parameters for all models and tasks."""
+    tests = []
+    for model_args in MODELS_ARGS:
+        predictions_lite = partial(run_model, model_args["model_name"], model_args["use_chat_template"])
+        tests.append((model_args, predictions_lite))
+    return tests
+
+
+# generates the model predictions parameters at test collection time
+tests: list[ModelInput] = generate_tests()
+ids = [f"{model_input[0]['model_name']}" for model_input in tests]
+
+
+@pytest.mark.parametrize("tests", tests, ids=ids)
+@pytest.mark.skip()
+def test_sglang_model(tests: list[ModelInput]):
+    """Evaluates a SGLang model on a full task - is parametrized using pytest_generate_test"""
+    model_args, get_predictions = tests
+
+    predictions = get_predictions()["results"]
+
+    # Load the reference results
+    with open(model_args["results_file"], "r") as f:
+        reference_results = json.load(f)["results"]
+
+    # Change the key names, replace '|' with ':'
+    reference_results = {k.replace("|", ":"): v for k, v in reference_results.items()}
+
+    # Convert defaultdict values to regular dict for comparison
+    predictions_dict = {k: dict(v) if hasattr(v, "default_factory") else v for k, v in predictions.items()}
+
+    diff = DeepDiff(reference_results, predictions_dict, ignore_numeric_type_changes=True)
+
+    assert diff == {}, f"Differences found: {diff}"
diff --git a/tests/test_unit_base_metrics.py b/tests/test_unit_base_metrics.py
deleted file mode 100644
index 575ebf595..000000000
--- a/tests/test_unit_base_metrics.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import numpy as np
-import pytest
-
-from lighteval.metrics.dynamic_metrics import (
-    LogLikelihoodAccMetric,
-    MultilingualQuasiExactMatchMetric,
-    MultilingualQuasiF1ScoreMetric,
-    NormalizedMultiChoiceProbMetric,
-    ProbabilityMetric,
-)
-from lighteval.metrics.metrics_sample import ExactMatches
-from lighteval.metrics.normalizations import LogProbCharNorm, helm_normalizer
-from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.requests import Doc
-from lighteval.utils.language import Language
-
-
-class TestBaseMetrics:
-    def test_exact_match(self):
-        em = ExactMatches(strip_strings=True)
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            " The quick brown fox jumps over the lazy dog\n",
-            "\n The quick brown fox jumps over the lazy dog ",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog.",
-        )
-        assert res == 0
-
-        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
-        assert res == 0
-
-        res = em.compute_one_item("", "")
-        assert res == 0
-
-    def test_quasi_exact_match(self):
-        em = ExactMatches(normalize_gold=helm_normalizer, normalize_pred=helm_normalizer)
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            " The quick brown fox jumps over the lazy dog\n",
-            "\n The quick brown fox jumps over the lazy dog ",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog.",
-        )
-        assert res == 1
-
-        res = em.compute_one_item("the quick brown fox, jumps over lazy dog", "quick brown fox jumps over lazy dog.")
-        assert res == 1
-
-        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
-        assert res == 0
-
-        res = em.compute_one_item("", "")
-        assert res == 0
-
-    def test_prefix_exact_match(self):
-        em = ExactMatches(
-            strip_strings=True,
-            type_exact_match="prefix",
-        )
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog. And some other stories.",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "  The quick brown fox jumps over the lazy dog\n",
-            "\n The quick brown fox jumps over the lazy dog",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog.",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "the quick brown fox jumps over lazy dog. And some other stories.",
-        )
-        assert res == 0
-
-        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
-        assert res == 0
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "Complete mismatch",
-        )
-        assert res == 0
-
-        res = em.compute_one_item("", "")
-        assert res == 0
-
-    def test_prefix_quasi_exact_match(self):
-        em = ExactMatches(
-            normalize_gold=helm_normalizer,
-            normalize_pred=helm_normalizer,
-            type_exact_match="prefix",
-        )
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog. And some other stories.",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick Brown fox jumps over the lazy dog",
-            "the quick brown fox jumps over lazy dog. And some other stories.",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "  The quick brown fox jumps over the lazy dog\n",
-            "\n The quick brown fox jumps over the lazy dog",
-        )
-        assert res == 1
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown fox jumps over the lazy dog.",
-        )
-        assert res == 1
-
-        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
-        assert res == 0
-
-        res = em.compute_one_item(
-            "The quick brown fox jumps over the lazy dog",
-            "Complete mismatch",
-        )
-        assert res == 0
-
-        res = em.compute_one_item("", "")
-        assert res == 0
-
-    def test_prob(self):
-        doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test")
-
-        # Simple case
-        model_response = ModelResponse(logprobs=np.log([0.7]))
-        prob_metric = ProbabilityMetric()
-        result = prob_metric.compute_sample(doc=doc, model_response=model_response)
-        assert result[prob_metric.metric_name] == pytest.approx(0.7)
-
-        # Aggregation function test
-        model_response = ModelResponse(logprobs=np.log([0.7, 0.1]))
-        prob_min_metric = ProbabilityMetric(aggregation_function=np.min)
-        result = prob_min_metric.compute_sample(doc=doc, model_response=model_response)
-        assert result[prob_metric.metric_name] == pytest.approx(0.1)
-
-    def test_mc_probability_metric(self):
-        doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test")
-        model_response = ModelResponse(logprobs=np.log([0.35, 0.1, 0.05]))
-
-        mc_prob_metric = NormalizedMultiChoiceProbMetric()
-
-        result = mc_prob_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[mc_prob_metric.metric_name] == pytest.approx(0.7)
-
-        doc = Doc(query="Test query", choices=["AA", "BB", "CCC"], gold_index=1, task_name="test")
-        model_response = ModelResponse(logprobs=np.log([0.1**2, 0.35**2, 0.05**3]))
-
-        prob_norm_metric = NormalizedMultiChoiceProbMetric(normalization=LogProbCharNorm())
-        result = prob_norm_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[prob_norm_metric.metric_name] == pytest.approx(0.7)
-
-    def test_acc(self):
-        # Test without normalization
-        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test")
-        model_response = ModelResponse(logprobs=np.log([0.7, 0.2, 0.3, 0.4]))
-
-        acc_metric = LogLikelihoodAccMetric()
-        result = acc_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[acc_metric.metric_name] == 1  # The highest logprob (3.0) is at index 3, which is not in gold_ixs
-
-        # Test 0 acc
-        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test")
-        model_response = ModelResponse(logprobs=np.log([0.1, 0.2, 0.3, 0.4]))
-        result = acc_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[acc_metric.metric_name] == 0
-
-        # Test with normalization
-        doc = Doc(query="Test query", choices=["ABCDE", "AB"], gold_index=0, task_name="test")
-        model_response = ModelResponse(logprobs=np.log([0.5, 0.6]))
-        acc_norm_metric = LogLikelihoodAccMetric(normalization=LogProbCharNorm())
-        result_norm = acc_norm_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert (
-            result_norm[acc_norm_metric.metric_name] == 1
-        )  # After normalization, "ABCDE" should have the highest score
-
-        # Test with multiple correct solutions
-        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 3], task_name="test")
-        model_response = ModelResponse(logprobs=np.log([0.5, 0.6, 0.7, 0.8]))
-        result_multi = acc_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result_multi[acc_metric.metric_name] == 1
-
-        # Test when the highest logprob is not in gold_ixs
-        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 2], task_name="test")
-        model_response = ModelResponse(logprobs=[0.5, 0.6, 0.7, 0.8])
-        result_incorrect = acc_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result_incorrect[acc_metric.metric_name] == 0
-
-    def test_f1_dynamic_metric(self):
-        """
-        Tests that normalization works correctly. We don't test the behavior of the F1_score class as it should be already tested.
-        """
-
-        doc = Doc(query="Test query", choices=["hello world"], gold_index=[0], task_name="test")
-        model_response = ModelResponse(text=["hello, the world"])
-
-        # Normalization test
-        f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH)
-        result = f1_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[f1_metric.metric_name] == 1
-
-        model_response = ModelResponse(text=["hello, the world how"])
-        f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH, aggregation_function=np.min)
-        result = f1_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        # 2 * (precision * recall) / (precision + recall) = 2 * (1 * 2/3) / (1 + 2/3) = 0.8
-        assert result[f1_metric.metric_name] == 0.8
-
-    def test_exact_match_dynamic_metric(self):
-        """
-        Tests that normalization works correctly. We don't test the behavior of the ExactMatch class as it should be already tested.
-        """
-        doc = Doc(query="Test query", choices=["hello world"], gold_index=[0], task_name="test")
-        model_response = ModelResponse(text=["hello, the world"])
-
-        # Normalization test
-        em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full")
-        result = em_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[em_metric.metric_name] == 1
-
-        model_response = ModelResponse(text=["hello, the world how"])
-        em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full")
-        result = em_metric.compute_sample(
-            doc=doc,
-            model_response=model_response,
-        )
-        assert result[em_metric.metric_name] == 0
-
-    @pytest.mark.skip(reason="Need to understand what it does.")
-    def test_pass_at_k_estimator(self):
-        assert False
-
-    @pytest.mark.skip(reason="Using nltk metric function, no need to test.")
-    def test_f1_score_quasi(self):
-        assert False
-
-    @pytest.mark.skip(reason="Using nltk metric function, no need to test.")
-    def test_f1(self):
-        assert False
diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py
deleted file mode 100644
index 6d1764593..000000000
--- a/tests/test_unit_harness_metrics.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import json
-import os
-
-import pytest
-
-from lighteval.metrics import apply_metric
-from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.sample_preparator import (
-    GenerativeCorpusMetricInput,
-    LogprobCorpusMetricInput,
-    PerplexityCorpusMetricInput,
-)
-from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.requests import Doc
-from lighteval.utils.utils import as_list
-
-
-PATH_TO_HARNESS_METRICS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_metrics.json")
-
-
-def pytest_generate_tests(metafunc: pytest.Metafunc):
-    """Initializes the main test setup. This function is automatically called by pytest and
-    should not be called manually.
-
-    Every function with "model_input" as arguments will be sent the "parameters".
-    This function will be run only once, ensuring that each model is run only once on the selected tasks.
-    (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want).
-    """
-    parameters = []
-
-    # If model_input is a test function argument
-    # (= the function requires a fixture)
-    if "prompt_inputs" in metafunc.fixturenames:
-        with open(PATH_TO_HARNESS_METRICS) as f:
-            metric_to_examples = json.load(f)
-
-            for metric, examples in metric_to_examples.items():
-                for task_name, examples_list in examples.items():
-                    parameters.append((metric, task_name, examples_list))
-        metafunc.parametrize("prompt_inputs", parameters, scope="session")
-
-
-def test_model_prediction(prompt_inputs: tuple[str, str, list]):  # noqa: C901
-    """Evaluates a model on a full task - is parametrized using pytest_generate_test"""
-    metric, task_name, examples = prompt_inputs
-    metric_name = metric
-    metric = Metrics[metric].value
-
-    for example in examples:
-        doc = {
-            k: v
-            for k, v in example.items()
-            if k in ["full_prompt", "choices", "gold_index", "original_query", "specific"]
-        }
-        doc["query"] = doc.pop("full_prompt")
-        doc = Doc(**doc)
-        error_msg = f"Metric {metric_name} failed on input {doc} from task {task_name}.\n"
-
-        match example["predictions"]:
-            case [first_element, *_] if isinstance(first_element, str):
-                # If the predictions are a list of strings, we assume it's a generative task
-                responses = [ModelResponse(text=example["predictions"], output_tokens=[[]], input_tokens=[])]
-            case [first_element, *_] if isinstance(first_element, float):
-                # If the predictions are a list of floats, we assume it's a logprob task
-                responses = [ModelResponse(logprobs=example["predictions"], output_tokens=[[]], input_tokens=[])]
-            case [first_element, *_] if len(first_element) == 2 and isinstance(first_element[1], bool):
-                # If the predictions are a list of lists with two elements, we assume it's a loglikelihood task with argmax
-                responses = [
-                    ModelResponse(
-                        logprobs=[pred[0] for pred in example["predictions"]],
-                        argmax_logits_eq_gold=[pred[1] for pred in example["predictions"]],
-                        output_tokens=[[]],
-                        input_tokens=[],
-                    )
-                ]
-            case _:
-                # If the predictions are not a list of strings or floats, we assume it's a custom task
-                responses = [ModelResponse(logprobs=example["predictions"][0], input_tokens=[])]
-
-        results = apply_metric(responses=responses, docs=[doc], metrics=[metric])[0]
-        assert responses is not None, error_msg
-
-        metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()}
-
-        metric_reference = {k: example[k] for k in results.keys()}
-        error_msg += f"Prediction: {results}\n"
-        error_msg += f"Reference: {metric_reference}\n"
-        error_msg += f"Returned : {metric_result}"
-
-        for key in metric_result.keys():
-            if type(metric_result[key]) in [
-                LogprobCorpusMetricInput,
-                GenerativeCorpusMetricInput,
-                PerplexityCorpusMetricInput,
-            ]:
-                cur_result_list = as_list(metric_result[key].to_dict())
-            else:
-                cur_result_list = as_list(metric_result[key])
-            cur_ref_list = as_list(metric_reference[key])
-
-            # item wise comparison of lists
-            if isinstance(cur_result_list[0], list):
-                for res, ref in zip(cur_result_list, cur_ref_list):
-                    try:
-                        assert res == pytest.approx(ref, rel=1e-8), error_msg
-                    except Exception:
-                        assert False, (
-                            key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
-                        )
-            else:
-                try:
-                    assert cur_result_list == pytest.approx(cur_ref_list, rel=1e-8), error_msg
-                except Exception:
-                    # assert False, error_msg + "\n" + str(e)
-                    assert False, (
-                        key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
-                    )
diff --git a/tests/test_unit_harness_prompts.py b/tests/test_unit_harness_prompts.py
deleted file mode 100644
index 6c8233fdc..000000000
--- a/tests/test_unit_harness_prompts.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import json
-import os
-
-import pytest
-
-import lighteval.tasks.default_prompts as default_prompts
-from lighteval.tasks.requests import Doc
-
-
-PATH_TO_HARNESS_PROMPTS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_prompts.json")
-
-
-def pytest_generate_tests(metafunc: pytest.Metafunc):
-    """Initializes the main test setup. This function is automatically called by pytest and
-    should not be called manually.
-
-    Every function with "model_input" as arguments will be sent the "parameters".
-    This function will be run only once, ensuring that each model is run only once on the selected tasks.
-    (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want).
-    """
-    parameters = []
-
-    # If model_input is a test function argument
-    # (= the function requires a fixture)
-    if "prompt_inputs" in metafunc.fixturenames:
-        with open(PATH_TO_HARNESS_PROMPTS) as f:
-            prompt_fn_to_examples = json.load(f)
-
-            for prompt_fn_name, examples in prompt_fn_to_examples.items():
-                formatter_fn = getattr(default_prompts, prompt_fn_name)
-
-                cur_params = []
-
-                for task_name, examples_list in examples.items():
-                    for input_line, reference_line in examples_list:
-                        cur_params.append((formatter_fn, input_line, reference_line, task_name))
-                parameters.append((prompt_fn_name, cur_params))
-        metafunc.parametrize("prompt_inputs", parameters, scope="session")
-
-
-def test_model_prediction(prompt_inputs: tuple[str, list]):
-    """Evaluates a model on a full task - is parametrized using pytest_generate_test"""
-    prompt_fn_name, examples = prompt_inputs
-    for prompt_fn, input_line, reference_line, task_name in examples:
-        formatted_line = prompt_fn(input_line, "")  # task_name)
-        reference_line = Doc(**reference_line)
-
-        error_msg = (
-            f"Prompt formatting function {prompt_fn_name} failed on input {input_line} from task {task_name}.\n"
-        )
-        error_msg += f"Reference: {reference_line}\n"
-        error_msg += f"Returned : {formatted_line}"
-        assert formatted_line == reference_line, error_msg
diff --git a/tests/logging/test_evaluation_tracker.py b/tests/unit/logging/test_evaluation_tracker.py
similarity index 100%
rename from tests/logging/test_evaluation_tracker.py
rename to tests/unit/logging/test_evaluation_tracker.py
diff --git a/tests/models/endpoints/test_endpoint_model.py b/tests/unit/models/endpoints/test_endpoint_model.py
similarity index 100%
rename from tests/models/endpoints/test_endpoint_model.py
rename to tests/unit/models/endpoints/test_endpoint_model.py
diff --git a/tests/models/endpoints/test_tgi_model.py b/tests/unit/models/endpoints/test_tgi_model.py
similarity index 100%
rename from tests/models/endpoints/test_tgi_model.py
rename to tests/unit/models/endpoints/test_tgi_model.py
diff --git a/tests/models/test_abstract_model.py b/tests/unit/models/test_abstract_model.py
similarity index 100%
rename from tests/models/test_abstract_model.py
rename to tests/unit/models/test_abstract_model.py
diff --git a/tests/models/test_base_model.py b/tests/unit/models/test_base_model.py
similarity index 100%
rename from tests/models/test_base_model.py
rename to tests/unit/models/test_base_model.py
diff --git a/tests/models/test_model_input.py b/tests/unit/models/test_model_input.py
similarity index 100%
rename from tests/models/test_model_input.py
rename to tests/unit/models/test_model_input.py
diff --git a/tests/models/test_model_utils.py b/tests/unit/models/test_model_utils.py
similarity index 100%
rename from tests/models/test_model_utils.py
rename to tests/unit/models/test_model_utils.py
diff --git a/tests/models/test_transformers_model.py b/tests/unit/models/test_transformers_model.py
similarity index 100%
rename from tests/models/test_transformers_model.py
rename to tests/unit/models/test_transformers_model.py
diff --git a/tests/models/vllm/test_vllm_model.py b/tests/unit/models/vllm/test_vllm_model.py
similarity index 100%
rename from tests/models/vllm/test_vllm_model.py
rename to tests/unit/models/vllm/test_vllm_model.py
diff --git a/tests/pipeline/test_reasoning_tags.py b/tests/unit/pipeline/test_reasoning_tags.py
similarity index 100%
rename from tests/pipeline/test_reasoning_tags.py
rename to tests/unit/pipeline/test_reasoning_tags.py
diff --git a/tests/test_prompt_manager.py b/tests/unit/prompt/test_prompt_manager.py
similarity index 100%
rename from tests/test_prompt_manager.py
rename to tests/unit/prompt/test_prompt_manager.py
diff --git a/tests/test_prompt_manager_class.py b/tests/unit/prompt/test_prompt_manager_class.py
similarity index 100%
rename from tests/test_prompt_manager_class.py
rename to tests/unit/prompt/test_prompt_manager_class.py
diff --git a/tests/unit/metrics/tasks/templates/test_continuation.py b/tests/unit/tasks/templates/test_continuation.py
similarity index 100%
rename from tests/unit/metrics/tasks/templates/test_continuation.py
rename to tests/unit/tasks/templates/test_continuation.py
diff --git a/tests/unit/metrics/tasks/templates/test_copa.py b/tests/unit/tasks/templates/test_copa.py
similarity index 100%
rename from tests/unit/metrics/tasks/templates/test_copa.py
rename to tests/unit/tasks/templates/test_copa.py
diff --git a/tests/unit/metrics/tasks/templates/test_hellaswag.py b/tests/unit/tasks/templates/test_hellaswag.py
similarity index 100%
rename from tests/unit/metrics/tasks/templates/test_hellaswag.py
rename to tests/unit/tasks/templates/test_hellaswag.py
diff --git a/tests/unit/metrics/tasks/templates/test_multichoice.py b/tests/unit/tasks/templates/test_multichoice.py
similarity index 100%
rename from tests/unit/metrics/tasks/templates/test_multichoice.py
rename to tests/unit/tasks/templates/test_multichoice.py
diff --git a/tests/unit/metrics/tasks/templates/test_nli.py b/tests/unit/tasks/templates/test_nli.py
similarity index 100%
rename from tests/unit/metrics/tasks/templates/test_nli.py
rename to tests/unit/tasks/templates/test_nli.py
diff --git a/tests/unit/metrics/tasks/templates/test_translation.py b/tests/unit/tasks/templates/test_translation.py
similarity index 100%
rename from tests/unit/metrics/tasks/templates/test_translation.py
rename to tests/unit/tasks/templates/test_translation.py
diff --git a/tests/unit/metrics/tasks/test_lighteval_task.py b/tests/unit/tasks/test_lighteval_task.py
similarity index 100%
rename from tests/unit/metrics/tasks/test_lighteval_task.py
rename to tests/unit/tasks/test_lighteval_task.py
diff --git a/tests/unit/metrics/tasks/test_registry.py b/tests/unit/tasks/test_registry.py
similarity index 96%
rename from tests/unit/metrics/tasks/test_registry.py
rename to tests/unit/tasks/test_registry.py
index caeb4e787..1a1f99b9d 100644
--- a/tests/unit/metrics/tasks/test_registry.py
+++ b/tests/unit/tasks/test_registry.py
@@ -48,7 +48,7 @@ def test_custom_task_groups():
     """
     Tests that task info selector correctly handles custom task groups.
     """
-    registry = Registry(custom_tasks="tests.tasks.test_registry")
+    registry = Registry(custom_tasks="tests.unit.tasks.test_registry")
     task_info = registry.taskinfo_selector("zero_and_one")
 
     assert set(task_info.keys()) == {"custom|test_task_revision"}
@@ -62,7 +62,7 @@ def test_custom_tasks():
     """
     Tests that task info selector correctly handles custom tasks.
     """
-    registry = Registry(custom_tasks="tests.tasks.test_registry")
+    registry = Registry(custom_tasks="tests.unit.tasks.test_registry")
     task_info = registry.taskinfo_selector("custom|test_task_revision|0|0")
 
     assert list(task_info.keys()) == ["custom|test_task_revision"]
@@ -131,7 +131,7 @@ def test_task_group_expansion_with_subset_expansion():
     """
     Tests that task info selector correctly handles a group with task superset is provided.
     """
-    registry = Registry(custom_tasks="tests.tasks.test_registry")
+    registry = Registry(custom_tasks="tests.unit.tasks.test_registry")
 
     task_info = registry.taskinfo_selector("all_mmlu")
 
diff --git a/tests/test_unit_reorder.py b/tests/unit/test_unit_reorder.py
similarity index 100%
rename from tests/test_unit_reorder.py
rename to tests/unit/test_unit_reorder.py
diff --git a/tests/utils/test_caching.py b/tests/unit/utils/test_caching.py
similarity index 100%
rename from tests/utils/test_caching.py
rename to tests/unit/utils/test_caching.py
diff --git a/tests/utils/test_utils.py b/tests/unit/utils/test_utils.py
similarity index 100%
rename from tests/utils/test_utils.py
rename to tests/unit/utils/test_utils.py

From e1a55ac48cf5ae37cf9c38156065d199cc9b7c0c Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Thu, 28 Aug 2025 12:26:02 +0000
Subject: [PATCH 08/26] fix tests and reorg files

---
 src/lighteval/metrics/metrics_sample.py       |  4 +-
 .../test_cases/acc_golds_likelihood.json      |  4 +-
 tests/unit/metrics/test_cases/avg_at_k.json   |  4 +-
 .../metrics/test_cases/avg_at_k_math.json     |  4 +-
 tests/unit/metrics/test_cases/copyright.json  |  4 +-
 tests/unit/metrics/test_cases/drop.json       |  4 +-
 .../test_cases/gpqa_instruct_metric.json      |  4 +-
 tests/unit/metrics/test_metrics_automated.py  | 75 ++++++++-----------
 8 files changed, 46 insertions(+), 57 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 17179899e..8d9ec5849 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1182,8 +1182,8 @@ def compute(self, model_response: ModelResponse, doc: Doc):
             float: Aggregated score over the current sample's items.
         """
         all_scores = []
-        for i in range(self.k):
-            all_scores.append(self.score_sample(doc, model_response[i]))
+        for _ in range(self.k):
+            all_scores.append(self.score_sample(doc, model_response))
 
         avg_score = np.mean(all_scores)
         return avg_score
diff --git a/tests/unit/metrics/test_cases/acc_golds_likelihood.json b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
index 5d0063739..fd1b0be02 100644
--- a/tests/unit/metrics/test_cases/acc_golds_likelihood.json
+++ b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:75ac8d94b83730e83e9b4b7a3d34ef579a92ca0382f5806a75e469b428215b4c
-size 986
+oid sha256:a4a390601a185bf4a62ac31a52bfde0064b0b8d5eac34b3683e026e23d489338
+size 824
diff --git a/tests/unit/metrics/test_cases/avg_at_k.json b/tests/unit/metrics/test_cases/avg_at_k.json
index 275d0ccb0..db21a380c 100644
--- a/tests/unit/metrics/test_cases/avg_at_k.json
+++ b/tests/unit/metrics/test_cases/avg_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:925eaea4ae4fc9a773f5628916524116e666a91ffe15a2949123abd3295ceea1
-size 929
+oid sha256:656c2910fb67dc8a5b7ddfb4c2583f8a107cc6bd7962caeec5d94f4815497167
+size 634
diff --git a/tests/unit/metrics/test_cases/avg_at_k_math.json b/tests/unit/metrics/test_cases/avg_at_k_math.json
index c62f7f8b1..567219f1d 100644
--- a/tests/unit/metrics/test_cases/avg_at_k_math.json
+++ b/tests/unit/metrics/test_cases/avg_at_k_math.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50f538b5160294a12d0340e1e7f0a867e61bb0491d3ea3b66ef8e565e30e1526
-size 959
+oid sha256:8e3e39166ce74c9d398736357daffda5c72e5c65c1bd027680ced9cc54e45ba0
+size 728
diff --git a/tests/unit/metrics/test_cases/copyright.json b/tests/unit/metrics/test_cases/copyright.json
index 56c7da7b9..e4491c7a1 100644
--- a/tests/unit/metrics/test_cases/copyright.json
+++ b/tests/unit/metrics/test_cases/copyright.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:31866d73fe46f534ec8eb8232151657a0f266b7f8251b81d7124dbb2c56da7f4
-size 1007
+oid sha256:954d886db79f9217d380eaa717a74e46969f88f632d3e7b608107eaaac89f294
+size 732
diff --git a/tests/unit/metrics/test_cases/drop.json b/tests/unit/metrics/test_cases/drop.json
index 9a15ce295..4fdc1442f 100644
--- a/tests/unit/metrics/test_cases/drop.json
+++ b/tests/unit/metrics/test_cases/drop.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7fd23b2a4d60de9ed7e550021a7f943479117d3234c2191b2ba94872fe5c264
-size 1077
+oid sha256:450f78b0720b5706bcdbf6997cf89adaa5cfd240625b5cb0dd755f4862624393
+size 734
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
index e9b421e91..d70b9dd59 100644
--- a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
+++ b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:11d94ac03ce4c4d4f6704d3f7e12c2569c8cf55bd64f5fc90170c4052fa6ba51
-size 999
+oid sha256:b574a7e5f16a3291f0154f71f929b0f59d896e9d0747f210885ac18d6febb464
+size 19623
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index e336f1d0b..d3b190114 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -159,55 +159,44 @@ def instantiate_metric(self, metric_class: str, metric_params: Dict[str, Any]):
 
     def run_test_case(self, test_case: MetricTestCase) -> Dict[str, Any]:
         """Run a single test case and return the result."""
-        try:
-            # Check if metric is available in METRIC_CLASSES
-            if test_case.metric_class not in self.METRIC_CLASSES:
-                return {
-                    "test_case": test_case.name,
-                    "success": True,  # Mark as success to skip
-                    "expected": test_case.expected_output,
-                    "actual": None,
-                    "error": None,
-                    "skipped": True,
-                    "skip_reason": f"Metric '{test_case.metric_class}' not available in METRIC_CLASSES",
-                }
-
-            # Get the metric from the Metrics enum
-            metric = self.instantiate_metric(test_case.metric_class, test_case.metric_params)
-
-            # Create input objects
-            doc = self.create_doc_from_dict(test_case.doc)
-            model_response = self.create_model_response_from_dict(test_case.model_response)
-
-            # Create sample_params for the metric
-            sample_params = {
-                "doc": doc,
-                "model_response": model_response,
-            }
-
-            # Run the metric using the Metrics enum value
-            actual_output = metric.compute_sample(**sample_params)
-
-            # Compare with expected output
-            success = self._compare_dict_outputs(actual_output, test_case.expected_output, test_case.tolerance)
+        # Check if metric is available in METRIC_CLASSES
+        if test_case.metric_class not in self.METRIC_CLASSES:
             return {
                 "test_case": test_case.name,
-                "success": success,
+                "success": True,  # Mark as success to skip
                 "expected": test_case.expected_output,
-                "actual": actual_output,
+                "actual": None,
                 "error": None,
-                "skipped": False,
+                "skipped": True,
+                "skip_reason": f"Metric '{test_case.metric_class}' not available in METRIC_CLASSES",
             }
 
-        except Exception as e:
-            return {
-                "test_case": test_case.name,
-                "success": False,
-                "expected": test_case.expected_output,
-                "actual": None,
-                "error": str(e),
-                "skipped": False,
-            }
+        # Get the metric from the Metrics enum
+        metric = self.instantiate_metric(test_case.metric_class, test_case.metric_params)
+
+        # Create input objects
+        doc = self.create_doc_from_dict(test_case.doc)
+        model_response = self.create_model_response_from_dict(test_case.model_response)
+
+        # Create sample_params for the metric
+        sample_params = {
+            "doc": doc,
+            "model_response": model_response,
+        }
+
+        # Run the metric using the Metrics enum value
+        actual_output = metric.compute_sample(**sample_params)
+
+        # Compare with expected output
+        success = self._compare_dict_outputs(actual_output, test_case.expected_output, test_case.tolerance)
+        return {
+            "test_case": test_case.name,
+            "success": success,
+            "expected": test_case.expected_output,
+            "actual": actual_output,
+            "error": None,
+            "skipped": False,
+        }
 
     def _compare_scalar_outputs(self, actual: Any, expected: float, tolerance: float) -> bool:
         """Compare scalar outputs with tolerance."""

From c9e7243a9c5092d67a83d278189e588f4812cf86 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 1 Sep 2025 14:20:57 +0000
Subject: [PATCH 09/26] better tests, passing

---
 src/lighteval/metrics/metrics_sample.py       | 12 +++--
 src/lighteval/metrics/utils/metric_utils.py   |  2 +
 .../test_cases/acc_golds_likelihood.json      |  4 +-
 tests/unit/metrics/test_cases/avg_at_k.json   |  4 +-
 .../metrics/test_cases/avg_at_k_math.json     |  4 +-
 tests/unit/metrics/test_cases/bleurt.json     |  4 +-
 tests/unit/metrics/test_cases/copyright.json  |  4 +-
 tests/unit/metrics/test_cases/drop.json       |  4 +-
 .../metrics/test_cases/extractiveness.json    |  4 +-
 tests/unit/metrics/test_cases/f1_score.json   |  4 +-
 .../unit/metrics/test_cases/g_pass_at_k.json  |  4 +-
 .../metrics/test_cases/g_pass_at_k_latex.json |  4 +-
 .../metrics/test_cases/g_pass_at_k_math.json  |  4 +-
 .../test_cases/gpqa_instruct_pass_at_k.json   |  4 +-
 .../metrics/test_cases/loglikelihood_acc.json |  4 +-
 .../metrics/test_cases/loglikelihood_f1.json  |  4 +-
 tests/unit/metrics/test_cases/maj_at_k.json   |  4 +-
 tests/unit/metrics/test_cases/mrr.json        |  4 +-
 tests/unit/metrics/test_cases/pass_at_k.json  |  4 +-
 .../metrics/test_cases/pass_at_k_letters.json |  4 +-
 .../metrics/test_cases/pass_at_k_math.json    |  4 +-
 .../unit/metrics/test_cases/recall_at_k.json  |  4 +-
 tests/unit/metrics/test_cases/rouge2.json     |  4 +-
 tests/unit/metrics/test_cases/rougeL.json     |  4 +-
 tests/unit/metrics/test_cases/rougeLsum.json  |  4 +-
 tests/unit/metrics/test_cases/rouge_t5.json   |  4 +-
 .../test_cases/truthfulqa_mc_metrics.json     |  4 +-
 tests/unit/metrics/test_metrics_automated.py  | 50 ++++++++++---------
 28 files changed, 85 insertions(+), 79 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 8d9ec5849..cf8b7d2ab 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1214,7 +1214,9 @@ def compute(self, doc: Doc, model_response: ModelResponse):
         """
         if self.k is None:
             raise Exception("You did not set the value of k")
+
         golds = doc.get_golds()
+
         if len(golds) > 1:
             raise Exception("Cannot compute maj@k with several golds")
 
@@ -1222,7 +1224,7 @@ def compute(self, doc: Doc, model_response: ModelResponse):
         new_doc = Doc(
             choices=processed_choices,
             query=doc.query,
-            gold_index=doc.gold_index,
+            gold_index=list(range(len(processed_choices))),
         )
         all_answers = []
         for pred in model_response.final_text[: self.k]:
@@ -1406,8 +1408,8 @@ def compute_mg_pass_at_k(n, c, k):
         metrics = {}
         for k in ks:
             for t in thresholds:
-                metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
-            metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)
+                metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
+            metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k)
 
         return metrics
 
@@ -1419,8 +1421,8 @@ def metric_names(self):
         metrics = []
         for k in ks:
             for t in thresholds:
-                metrics.append(f"{self.name}@{k}_{t}")
-            metrics.append(f"m{self.name}@{k}")
+                metrics.append(f"{self.name}{k}_{t}")
+            metrics.append(f"m{self.name}{k}")
 
         return metrics
 
diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py
index 85b1e2bc6..fe9e9f40e 100644
--- a/src/lighteval/metrics/utils/metric_utils.py
+++ b/src/lighteval/metrics/utils/metric_utils.py
@@ -83,6 +83,8 @@ def __call__(self, sample_params: dict | None):
 
         # Once the parameters are updated, we need to adjust the
         # metric name to what will be returned
+        # if "math-g-pass" in self.metric_name:
+        # breakpoint()
         sample_params_name = "&".join(sample_params.keys())
         if isinstance(self, MetricGrouping):
             if hasattr(self.sample_level_fn, "metric_names"):
diff --git a/tests/unit/metrics/test_cases/acc_golds_likelihood.json b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
index fd1b0be02..b41dfd131 100644
--- a/tests/unit/metrics/test_cases/acc_golds_likelihood.json
+++ b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a4a390601a185bf4a62ac31a52bfde0064b0b8d5eac34b3683e026e23d489338
-size 824
+oid sha256:f486ec84db5c556b13368da3317bd91629eb93f6a25f869c4972cfed61977656
+size 2012
diff --git a/tests/unit/metrics/test_cases/avg_at_k.json b/tests/unit/metrics/test_cases/avg_at_k.json
index db21a380c..5e315bc51 100644
--- a/tests/unit/metrics/test_cases/avg_at_k.json
+++ b/tests/unit/metrics/test_cases/avg_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:656c2910fb67dc8a5b7ddfb4c2583f8a107cc6bd7962caeec5d94f4815497167
-size 634
+oid sha256:3e1be6df6efbe74c5bf2c217c81a232e2e154414619e5ffec660ac8a5e0f7aae
+size 1766
diff --git a/tests/unit/metrics/test_cases/avg_at_k_math.json b/tests/unit/metrics/test_cases/avg_at_k_math.json
index 567219f1d..8005cf7d0 100644
--- a/tests/unit/metrics/test_cases/avg_at_k_math.json
+++ b/tests/unit/metrics/test_cases/avg_at_k_math.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e3e39166ce74c9d398736357daffda5c72e5c65c1bd027680ced9cc54e45ba0
-size 728
+oid sha256:7eb34bbc8b34721da79ea6a367160a7f43a16fd5162b5b653f8af67b04c1ca92
+size 1572
diff --git a/tests/unit/metrics/test_cases/bleurt.json b/tests/unit/metrics/test_cases/bleurt.json
index fa28d1606..8774db6bf 100644
--- a/tests/unit/metrics/test_cases/bleurt.json
+++ b/tests/unit/metrics/test_cases/bleurt.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac1081a08f33547bd1158bb4eb535c8ae1dd90d05d1db5de6e99ee21e6abd97c
-size 907
+oid sha256:408bb775a6c12744227254d3f1a7511aee9cbfe2160acd23d79dfeca094d1856
+size 1864
diff --git a/tests/unit/metrics/test_cases/copyright.json b/tests/unit/metrics/test_cases/copyright.json
index e4491c7a1..6459816c6 100644
--- a/tests/unit/metrics/test_cases/copyright.json
+++ b/tests/unit/metrics/test_cases/copyright.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:954d886db79f9217d380eaa717a74e46969f88f632d3e7b608107eaaac89f294
-size 732
+oid sha256:286a7519ab83375e6d8ccf2264fbc55266260d08c7cb88dfca897b598f74b22d
+size 1994
diff --git a/tests/unit/metrics/test_cases/drop.json b/tests/unit/metrics/test_cases/drop.json
index 4fdc1442f..e87bf89b0 100644
--- a/tests/unit/metrics/test_cases/drop.json
+++ b/tests/unit/metrics/test_cases/drop.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:450f78b0720b5706bcdbf6997cf89adaa5cfd240625b5cb0dd755f4862624393
-size 734
+oid sha256:675c6cc4313bb41e8a8d27253dcffde62a25fe659ef8e7b762e26ca667c58851
+size 1714
diff --git a/tests/unit/metrics/test_cases/extractiveness.json b/tests/unit/metrics/test_cases/extractiveness.json
index e473d6d8a..da6232b39 100644
--- a/tests/unit/metrics/test_cases/extractiveness.json
+++ b/tests/unit/metrics/test_cases/extractiveness.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7603583d63d162186c8e46be3ca4b8ba1dc15afdef99d2009c8172f8360d798e
-size 946
+oid sha256:c7357863b5a005819fff204ae0a67287635c2598d2c3948cece0a41c23a1066d
+size 2451
diff --git a/tests/unit/metrics/test_cases/f1_score.json b/tests/unit/metrics/test_cases/f1_score.json
index 507d6806b..2f1a78e15 100644
--- a/tests/unit/metrics/test_cases/f1_score.json
+++ b/tests/unit/metrics/test_cases/f1_score.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1f1e9e4123ac0aabf5588b726c52fd0fa76c9a6a72001eb50eb6549b982e55d1
-size 693
+oid sha256:a141b848bb169c28764742219f077aea9fc60bc6a209ee9b043b8c2614add34b
+size 4358
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k.json b/tests/unit/metrics/test_cases/g_pass_at_k.json
index b164628e4..d8f3870be 100644
--- a/tests/unit/metrics/test_cases/g_pass_at_k.json
+++ b/tests/unit/metrics/test_cases/g_pass_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dfd2b8f9b839368eebc90e624081301945d8b4f238b23d2f1aba25328577deab
-size 905
+oid sha256:3fba8477eaa1cb5efb54d0afb1f5cddb528a1086c15cac79dc6f16fea0012abc
+size 9368
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_latex.json b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
index c94a9b7c7..2491e9e3e 100644
--- a/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
+++ b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5300d1c0ba4e886e27efa190449b4ef9afc9cae8ad32d7a84259ac0562c04b5
-size 1130
+oid sha256:687a25df0c903d98d3fabb433552d69c30630dc634f8f9f1582e641eacf60faa
+size 6911
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_math.json b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
index dcae880bb..97f9aca37 100644
--- a/tests/unit/metrics/test_cases/g_pass_at_k_math.json
+++ b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d9a23faf6fa94e35e4ef147a08dfcccefcf3d6296e99f51ffa0fd74bebc983a7
-size 1108
+oid sha256:33f317039e4adf1ac7a44ac2a94b7e8f37095161ab496c51732e9521bfcd551c
+size 9907
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
index 655f270bc..27de62abc 100644
--- a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
+++ b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:357a97f311d4421e6575e96524b119ff02aa04d9e2fb7899ec8e4725a2307f94
-size 1025
+oid sha256:9b82a383f67eb0d6ef1fe0c35c3d9e17acf1956efe03590015d9882283372ae6
+size 8648
diff --git a/tests/unit/metrics/test_cases/loglikelihood_acc.json b/tests/unit/metrics/test_cases/loglikelihood_acc.json
index 3046bb396..eaa8fb6e2 100644
--- a/tests/unit/metrics/test_cases/loglikelihood_acc.json
+++ b/tests/unit/metrics/test_cases/loglikelihood_acc.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e48acb928cc759b938e2f8d3acd5a65b26bbbef39acd100f580f20aa4d75421
-size 721
+oid sha256:a00ac480425c5b37efb69b5a01d87542dfa96fffeb82d01fda8a7006a66603fb
+size 8133
diff --git a/tests/unit/metrics/test_cases/loglikelihood_f1.json b/tests/unit/metrics/test_cases/loglikelihood_f1.json
index 5deb7a3ae..2ccd76b0f 100644
--- a/tests/unit/metrics/test_cases/loglikelihood_f1.json
+++ b/tests/unit/metrics/test_cases/loglikelihood_f1.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ea1a1da0d5651cca5268172136a7a1951dd6f68c6fda93464fd2ba9dd3e151c7
-size 965
+oid sha256:44675eaa9844cac9e4f71b8b825f114626649d56c46ed14e77f253ab426ef5d1
+size 8828
diff --git a/tests/unit/metrics/test_cases/maj_at_k.json b/tests/unit/metrics/test_cases/maj_at_k.json
index 8bbf1c6e8..9f8cae279 100644
--- a/tests/unit/metrics/test_cases/maj_at_k.json
+++ b/tests/unit/metrics/test_cases/maj_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0c0a1a99a62f391296510cc8d7b2c30de6ba9a4cc672a12605ca7d44b73cae29
-size 698
+oid sha256:4f18b15293b933ded1d24cf5aac842eab03c3604d00b0bb45ed96956a83355c1
+size 2227
diff --git a/tests/unit/metrics/test_cases/mrr.json b/tests/unit/metrics/test_cases/mrr.json
index 654dbbc35..3c5ffd306 100644
--- a/tests/unit/metrics/test_cases/mrr.json
+++ b/tests/unit/metrics/test_cases/mrr.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20d4a5e143b068600bc2ad3e345061128c53a90eb8580840fd3da4776f3e989e
-size 884
+oid sha256:a79c93f65e5c6e419125efaceea598b3e500fb01e7cfa0b57f09f0831f1e140f
+size 2386
diff --git a/tests/unit/metrics/test_cases/pass_at_k.json b/tests/unit/metrics/test_cases/pass_at_k.json
index 3fd01b414..1b67789ca 100644
--- a/tests/unit/metrics/test_cases/pass_at_k.json
+++ b/tests/unit/metrics/test_cases/pass_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:855466ba73e0faf312b68666169a0077fa2308d1aa0410e7b29d4a1a4d328882
-size 936
+oid sha256:a9110dc53c847bc95648b270d3c5622967884ae9cd398c0e75268424fc2d26eb
+size 1905
diff --git a/tests/unit/metrics/test_cases/pass_at_k_letters.json b/tests/unit/metrics/test_cases/pass_at_k_letters.json
index ed483a09d..50e4ed073 100644
--- a/tests/unit/metrics/test_cases/pass_at_k_letters.json
+++ b/tests/unit/metrics/test_cases/pass_at_k_letters.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e4f0439d333537ae8485d4f6e3553eebfd0365db97460bee2f956f8f1d3bc582
-size 984
+oid sha256:d7f9b2aefb62a7b04440759a21323605df76ed30eff9cc99a62f9dc5f667bacc
+size 1878
diff --git a/tests/unit/metrics/test_cases/pass_at_k_math.json b/tests/unit/metrics/test_cases/pass_at_k_math.json
index 967c62406..91db182a6 100644
--- a/tests/unit/metrics/test_cases/pass_at_k_math.json
+++ b/tests/unit/metrics/test_cases/pass_at_k_math.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b266f73f7141d0a97568e9e9cc3bb9b75be94b87b566f27e8fa86cdcfa6663d
-size 637
+oid sha256:330bb04632ce82da1bbfcf57bbb9ff5d36bfe0dc1c0d298706a8a0a24786c420
+size 1633
diff --git a/tests/unit/metrics/test_cases/recall_at_k.json b/tests/unit/metrics/test_cases/recall_at_k.json
index 8c6e4190f..b41ef29ba 100644
--- a/tests/unit/metrics/test_cases/recall_at_k.json
+++ b/tests/unit/metrics/test_cases/recall_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db8df096318bc9d072bda2dd77c2f43a0ab0ce341928453dc18b4791b89e758a
-size 935
+oid sha256:8a786b6a64057501d3d65bb251709595fd1c982e1f533ed12ac968da8c61522e
+size 1977
diff --git a/tests/unit/metrics/test_cases/rouge2.json b/tests/unit/metrics/test_cases/rouge2.json
index 6f5ab48f9..a53038b33 100644
--- a/tests/unit/metrics/test_cases/rouge2.json
+++ b/tests/unit/metrics/test_cases/rouge2.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da3f20ce95aae69fc9dfb39f6b64ab1cbc9e9d4df75eafaad5fbd755c8e5db19
-size 903
+oid sha256:553b4de4f3568fe3907dd067d19c8bbce0004972da9841e010ecf2c05db67fc7
+size 1881
diff --git a/tests/unit/metrics/test_cases/rougeL.json b/tests/unit/metrics/test_cases/rougeL.json
index a05067c84..b3c3e8883 100644
--- a/tests/unit/metrics/test_cases/rougeL.json
+++ b/tests/unit/metrics/test_cases/rougeL.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c12497e66af2359af1f9bebcf96aeb495ce15cde9ab71c37279a68c16b2c07db
-size 903
+oid sha256:b2b219b759e1d3aae2da9c885edb11a55e5e55e38589865894d2498aca4534dd
+size 1877
diff --git a/tests/unit/metrics/test_cases/rougeLsum.json b/tests/unit/metrics/test_cases/rougeLsum.json
index 00a91d02d..8b7f00302 100644
--- a/tests/unit/metrics/test_cases/rougeLsum.json
+++ b/tests/unit/metrics/test_cases/rougeLsum.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb44e69dbbb59ac026a9b0e356efdd191e0443a633b8d6e70a16e177338d1b5d
-size 924
+oid sha256:32f6d4f7261fee58c3da493b6156bf001afa6d501bdfdcf8fcb33169542f8aa8
+size 1958
diff --git a/tests/unit/metrics/test_cases/rouge_t5.json b/tests/unit/metrics/test_cases/rouge_t5.json
index 0798b3ba8..49d2aa56c 100644
--- a/tests/unit/metrics/test_cases/rouge_t5.json
+++ b/tests/unit/metrics/test_cases/rouge_t5.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e7d7ec4b45e3c67dbd3431c3aa7cde973d994e79d039031febff027f938b0988
-size 989
+oid sha256:9792b0ef28716f36663975024a84cfb15284a17e2f5a6648363a6284697e0ad3
+size 2208
diff --git a/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
index 131c42c16..78507add7 100644
--- a/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
+++ b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6e70aa07d9fcdbd5020bc81f14f6e7904f88cc36681d5134df0bd5c5808f0a7
-size 1604
+oid sha256:f91a5be1cd5cb437c35632184a8152f8c44e95001c364b27477e3c6015b949e7
+size 2424
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index d3b190114..64c579fdd 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -28,6 +28,7 @@
 automatically run them and verify the results.
 """
 
+import copy
 import json
 import logging
 from dataclasses import field
@@ -71,34 +72,34 @@ class AutomatedMetricTester:
     # Mapping of metric names to Metrics enum values
     METRIC_CLASSES = {
         # Map metric names to their corresponding Metrics enum values
-        "exact_match": Metrics.exact_match,
+        "exact_match": Metrics.exact_match,  #
         "f1_score": Metrics.f1_score,
-        "loglikelihood_acc": Metrics.loglikelihood_acc,
-        "recall_at_k": Metrics.recall_at_k,
-        "mrr": Metrics.mrr,
+        "loglikelihood_acc": Metrics.loglikelihood_acc,  #
+        "recall_at_k": Metrics.recall_at_k,  #
+        "mrr": Metrics.mrr,  #
         "rouge1": Metrics.rouge1,
-        "rouge2": Metrics.rouge2,
-        "rougeL": Metrics.rougeL,
-        "rougeLsum": Metrics.rougeLsum,
-        "rouge_t5": Metrics.rouge_t5,
-        "extractiveness": Metrics.extractiveness,
-        "bleurt": Metrics.bleurt,
-        "copyright": Metrics.copyright,
-        "drop": Metrics.drop,
-        "avg_at_k": Metrics.avg_at_k,
-        "avg_at_k_math": Metrics.avg_at_k_math,
-        "g_pass_at_k": Metrics.g_pass_at_k,
-        "g_pass_at_k_math": Metrics.g_pass_at_k_math,
-        "g_pass_at_k_latex": Metrics.g_pass_at_k_latex,
-        "maj_at_k": Metrics.maj_at_k,
-        "pass_at_k": Metrics.pass_at_k,
-        "pass_at_k_math": Metrics.pass_at_k_math,
-        "pass_at_k_letters": Metrics.pass_at_k_letters,
+        "rouge2": Metrics.rouge2,  #
+        "rougeL": Metrics.rougeL,  #
+        "rougeLsum": Metrics.rougeLsum,  #
+        "rouge_t5": Metrics.rouge_t5,  #
+        "extractiveness": Metrics.extractiveness,  #
+        "bleurt": Metrics.bleurt,  #
+        "copyright": Metrics.copyright,  #
+        "drop": Metrics.drop,  #
+        "avg_at_k": Metrics.avg_at_k,  #
+        "avg_at_k_math": Metrics.avg_at_k_math,  #
+        "g_pass_at_k": Metrics.g_pass_at_k,  #
+        "g_pass_at_k_math": Metrics.g_pass_at_k_math,  #
+        "g_pass_at_k_latex": Metrics.g_pass_at_k_latex,  #
+        "maj_at_k": Metrics.maj_at_k,  #
+        "pass_at_k": Metrics.pass_at_k,  #
+        "pass_at_k_math": Metrics.pass_at_k_math,  #
+        "pass_at_k_letters": Metrics.pass_at_k_letters,  #
         "gpqa_instruct_metric": Metrics.gpqa_instruct_metric,
         "gpqa_instruct_pass_at_k": Metrics.gpqa_instruct_pass_at_k,
         "expr_gold_metric": Metrics.expr_gold_metric,
-        "acc_golds_likelihood": Metrics.acc_golds_likelihood,
-        "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,
+        "acc_golds_likelihood": Metrics.acc_golds_likelihood,  #
+        "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,  #
         # "faithfulness": Metrics.faithfulness, issue with tokenizer
         # "prediction_perplexity": Metrics.prediction_perplexity,
         # "target_perplexity": Metrics.target_perplexity,
@@ -149,7 +150,8 @@ def instantiate_metric(self, metric_class: str, metric_params: Dict[str, Any]):
 
         # Get the metric from the Metrics enum
         if metric_params != {}:
-            metric_enum_value = self.METRIC_CLASSES[metric_class].value(metric_params)
+            metric = self.METRIC_CLASSES[metric_class].value
+            metric_enum_value = copy.deepcopy(metric)(metric_params)
         else:
             metric_enum_value = self.METRIC_CLASSES[metric_class].value
 

From 3d7b448bd16e0131a0ab29cda184a07061b04724 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 1 Sep 2025 14:39:30 +0000
Subject: [PATCH 10/26] fix tests

---
 tests/unit/tasks/test_registry.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/unit/tasks/test_registry.py b/tests/unit/tasks/test_registry.py
index 106708549..377ea7d6c 100644
--- a/tests/unit/tasks/test_registry.py
+++ b/tests/unit/tasks/test_registry.py
@@ -48,7 +48,7 @@ def test_custom_task_groups():
     """
     Tests that task info selector correctly handles custom task groups.
     """
-    registry = Registry(tasks="zero_and_one", custom_tasks="tests.tasks.test_registry")
+    registry = Registry(tasks="zero_and_one", custom_tasks="tests.unit.tasks.test_registry")
 
     assert set(registry.tasks_list) == {"custom|test_task_revision|0", "custom|test_task_revision|1"}
 
@@ -62,7 +62,7 @@ def test_custom_tasks():
     """
     Tests that task info selector correctly handles custom tasks.
     """
-    registry = Registry(tasks="custom|test_task_revision|0", custom_tasks="tests.tasks.test_registry")
+    registry = Registry(tasks="custom|test_task_revision|0", custom_tasks="tests.unit.tasks.test_registry")
 
     assert registry.tasks_list == ["custom|test_task_revision|0"]
     assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"}
@@ -133,7 +133,7 @@ def test_task_group_expansion_with_subset_expansion():
     """
     Tests that task info selector correctly handles a group with task superset is provided.
     """
-    registry = Registry(tasks="all_mmlu", custom_tasks="tests.tasks.test_registry")
+    registry = Registry(tasks="all_mmlu", custom_tasks="tests.unit.tasks.test_registry")
 
     # We have all mmlu tasks
     assert len(registry.task_to_configs.keys()) == 57
@@ -152,7 +152,7 @@ def test_task_duplicates():
     Tests that task info selector correctly handles if duplicate tasks are provided.
     """
     registry = Registry(
-        tasks="custom|test_task_revision|0,custom|test_task_revision|0", custom_tasks="tests.tasks.test_registry"
+        tasks="custom|test_task_revision|0,custom|test_task_revision|0", custom_tasks="tests.unit.tasks.test_registry"
     )
 
     assert list(registry.tasks_list) == ["custom|test_task_revision|0"]

From 0c4a554437761e0066f3bdba0f9c3ca053c481a9 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Tue, 2 Sep 2025 08:59:17 +0000
Subject: [PATCH 11/26] fix faithfullness metric

---
 src/lighteval/metrics/imports/summac.py       |   1 -
 .../unit/metrics/test_cases/faithfulness.json |   4 +-
 tests/unit/metrics/test_metrics_automated.py  | 170 +++---------------
 3 files changed, 26 insertions(+), 149 deletions(-)

diff --git a/src/lighteval/metrics/imports/summac.py b/src/lighteval/metrics/imports/summac.py
index e64dab863..bda317b79 100644
--- a/src/lighteval/metrics/imports/summac.py
+++ b/src/lighteval/metrics/imports/summac.py
@@ -221,7 +221,6 @@ def build_image(self, original, generated):
                     truncation=True,
                     max_length=self.max_input_length,
                     return_tensors="pt",
-                    truncation_strategy="only_first",
                 )
                 batch_tokens = {k: v.to(self.device) for k, v in batch_tokens.items()}
                 with torch.no_grad():
diff --git a/tests/unit/metrics/test_cases/faithfulness.json b/tests/unit/metrics/test_cases/faithfulness.json
index 7baddec23..a86f256e7 100644
--- a/tests/unit/metrics/test_cases/faithfulness.json
+++ b/tests/unit/metrics/test_cases/faithfulness.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:157f820c24bfee8ec961df6d57844fc170c5e52f8a463669918640256f53c361
-size 1022
+oid sha256:2e98307b93588bce80ac28f1614f432e31a1417abc72d169838b8818650d4f30
+size 2848
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index 64c579fdd..892984307 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -72,35 +72,35 @@ class AutomatedMetricTester:
     # Mapping of metric names to Metrics enum values
     METRIC_CLASSES = {
         # Map metric names to their corresponding Metrics enum values
-        "exact_match": Metrics.exact_match,  #
+        "exact_match": Metrics.exact_match,
         "f1_score": Metrics.f1_score,
-        "loglikelihood_acc": Metrics.loglikelihood_acc,  #
-        "recall_at_k": Metrics.recall_at_k,  #
-        "mrr": Metrics.mrr,  #
+        "loglikelihood_acc": Metrics.loglikelihood_acc,
+        "recall_at_k": Metrics.recall_at_k,
+        "mrr": Metrics.mrr,
         "rouge1": Metrics.rouge1,
-        "rouge2": Metrics.rouge2,  #
-        "rougeL": Metrics.rougeL,  #
-        "rougeLsum": Metrics.rougeLsum,  #
-        "rouge_t5": Metrics.rouge_t5,  #
-        "extractiveness": Metrics.extractiveness,  #
-        "bleurt": Metrics.bleurt,  #
-        "copyright": Metrics.copyright,  #
-        "drop": Metrics.drop,  #
-        "avg_at_k": Metrics.avg_at_k,  #
-        "avg_at_k_math": Metrics.avg_at_k_math,  #
-        "g_pass_at_k": Metrics.g_pass_at_k,  #
-        "g_pass_at_k_math": Metrics.g_pass_at_k_math,  #
-        "g_pass_at_k_latex": Metrics.g_pass_at_k_latex,  #
-        "maj_at_k": Metrics.maj_at_k,  #
-        "pass_at_k": Metrics.pass_at_k,  #
-        "pass_at_k_math": Metrics.pass_at_k_math,  #
-        "pass_at_k_letters": Metrics.pass_at_k_letters,  #
+        "rouge2": Metrics.rouge2,
+        "rougeL": Metrics.rougeL,
+        "rougeLsum": Metrics.rougeLsum,
+        "rouge_t5": Metrics.rouge_t5,
+        "extractiveness": Metrics.extractiveness,
+        "bleurt": Metrics.bleurt,
+        "copyright": Metrics.copyright,
+        "drop": Metrics.drop,
+        "avg_at_k": Metrics.avg_at_k,
+        "avg_at_k_math": Metrics.avg_at_k_math,
+        "g_pass_at_k": Metrics.g_pass_at_k,
+        "g_pass_at_k_math": Metrics.g_pass_at_k_math,
+        "g_pass_at_k_latex": Metrics.g_pass_at_k_latex,
+        "maj_at_k": Metrics.maj_at_k,
+        "pass_at_k": Metrics.pass_at_k,
+        "pass_at_k_math": Metrics.pass_at_k_math,
+        "pass_at_k_letters": Metrics.pass_at_k_letters,
         "gpqa_instruct_metric": Metrics.gpqa_instruct_metric,
         "gpqa_instruct_pass_at_k": Metrics.gpqa_instruct_pass_at_k,
         "expr_gold_metric": Metrics.expr_gold_metric,
-        "acc_golds_likelihood": Metrics.acc_golds_likelihood,  #
-        "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,  #
-        # "faithfulness": Metrics.faithfulness, issue with tokenizer
+        "acc_golds_likelihood": Metrics.acc_golds_likelihood,
+        "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,
+        "faithfulness": Metrics.faithfulness,  # issue with tokenizer
         # "prediction_perplexity": Metrics.prediction_perplexity,
         # "target_perplexity": Metrics.target_perplexity,
         # "bert_score": Metrics.bert_score, issue with the scoring function, int too big to convert
@@ -273,125 +273,3 @@ def run_test_suites_from_file(self, file_path: Union[str, Path]) -> List[Dict[st
             # Single test suite
             test_suite = MetricTestSuite(**data)
             return self.run_test_suite(test_suite)
-
-    def save_test_suite_to_file(self, test_suite: MetricTestSuite, file_path: Union[str, Path]):
-        """Save a test suite to a JSON file."""
-        with open(file_path, "w") as f:
-            json.dump(test_suite.dict(), f, indent=2)
-
-    def create_example_test_suite(self) -> MetricTestSuite:
-        """Create an example test suite with various metrics."""
-        return MetricTestSuite(
-            name="Example Test Suite",
-            description="Example test cases for various metrics",
-            test_cases=[
-                MetricTestCase(
-                    name="Exact Match - Perfect Match",
-                    metric_class="exact_match",
-                    metric_params={},
-                    doc={
-                        "query": "What is the capital of France?",
-                        "choices": ["Paris", "London", "Berlin"],
-                        "gold_index": 0,
-                        "task_name": "test",
-                    },
-                    model_response={
-                        "text": ["Paris"],
-                        "logprobs": [],
-                        "output_tokens": [],
-                    },
-                    expected_output={"em": 1.0},
-                    description="Test exact match with perfect prediction",
-                ),
-                MetricTestCase(
-                    name="Exact Match - No Match",
-                    metric_class="exact_match",
-                    metric_params={},
-                    doc={
-                        "query": "What is the capital of France?",
-                        "choices": ["Paris", "London", "Berlin"],
-                        "gold_index": 0,
-                        "task_name": "test",
-                    },
-                    model_response={
-                        "text": ["London"],
-                        "logprobs": [],
-                        "output_tokens": [],
-                    },
-                    expected_output={"em": 0.0},
-                    description="Test exact match with wrong prediction",
-                ),
-                MetricTestCase(
-                    name="F1 Score - Good Match",
-                    metric_class="f1_score",
-                    metric_params={},
-                    doc={
-                        "query": "Summarize the text",
-                        "choices": ["The quick brown fox jumps over the lazy dog"],
-                        "gold_index": 0,
-                        "task_name": "test",
-                    },
-                    model_response={
-                        "text": ["The quick brown fox jumps over the lazy dog"],
-                        "logprobs": [],
-                        "output_tokens": [],
-                    },
-                    expected_output={"f1": 1.0},
-                    description="Test F1 score with perfect match",
-                ),
-                MetricTestCase(
-                    name="Loglikelihood Accuracy - Correct Choice",
-                    metric_class="loglikelihood_acc",
-                    metric_params={},
-                    doc={
-                        "query": "Choose the correct answer",
-                        "choices": ["A", "B", "C"],
-                        "gold_index": 0,
-                        "task_name": "test",
-                    },
-                    model_response={
-                        "text": ["A"],
-                        "logprobs": [0.5, 0.3, 0.2],  # A has highest logprob
-                        "output_tokens": [[1], [2], [3]],
-                    },
-                    expected_output={"acc": 1},
-                    description="Test loglikelihood accuracy with correct choice",
-                ),
-                MetricTestCase(
-                    name="ROUGE Score",
-                    metric_class="rouge1",
-                    metric_params={"methods": ["rouge1"]},
-                    doc={
-                        "query": "Summarize the text",
-                        "choices": ["The quick brown fox jumps over the lazy dog"],
-                        "gold_index": 0,
-                        "task_name": "test",
-                    },
-                    model_response={
-                        "text": ["The quick brown fox jumps over the lazy dog"],
-                        "logprobs": [],
-                        "output_tokens": [],
-                    },
-                    expected_output={"rouge1": 1.0},
-                    description="Test ROUGE score with perfect match",
-                ),
-            ],
-        )
-
-
-if __name__ == "__main__":
-    # Example usage
-    tester = AutomatedMetricTester()
-
-    # Create and run example test suite
-    example_suite = tester.create_example_test_suite()
-    results = tester.run_test_suite(example_suite)
-
-    # Print summary
-    passed = sum(1 for r in results if r["success"])
-    total = len(results)
-    print(f"\nTest Summary: {passed}/{total} tests passed")
-
-    # Save example test suite to file
-    tester.save_test_suite_to_file(example_suite, "example_test_suite.json")
-    print("Example test suite saved to example_test_suite.json")

From 594c2691728f46bdf09ba011a2be89c8d4fabe27 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 3 Sep 2025 13:57:22 +0000
Subject: [PATCH 12/26] adds corpus level metric testing

---
 src/lighteval/metrics/metrics.py              |   2 +-
 src/lighteval/metrics/metrics_corpus.py       |   6 +-
 src/lighteval/metrics/utils/metric_utils.py   |   2 -
 tests/test_unit_base_metrics.py               | 340 ++++++++++++++++++
 tests/test_unit_harness_metrics.py            | 139 +++++++
 tests/test_unit_harness_prompts.py            |  75 ++++
 tests/unit/metrics/test_cases/bleu.json       |   4 +-
 tests/unit/metrics/test_cases/bleu_1.json     |   4 +-
 tests/unit/metrics/test_cases/bleu_4.json     |   4 +-
 tests/unit/metrics/test_cases/chrf.json       |   4 +-
 tests/unit/metrics/test_cases/chrf_plus.json  |   4 +-
 .../metrics/test_cases/f1_score_macro.json    |   4 +-
 .../metrics/test_cases/f1_score_micro.json    |   4 +-
 tests/unit/metrics/test_cases/mcc.json        |   4 +-
 .../metrics/test_cases/multi_f1_numeric.json  |   4 +-
 .../metrics/test_cases/target_perplexity.json |   4 +-
 tests/unit/metrics/test_cases/ter.json        |   4 +-
 .../metrics/test_cases/word_perplexity.json   |   4 +-
 tests/unit/metrics/test_metrics_automated.py  | 121 +++++--
 19 files changed, 668 insertions(+), 65 deletions(-)
 create mode 100644 tests/test_unit_base_metrics.py
 create mode 100644 tests/test_unit_harness_metrics.py
 create mode 100644 tests/test_unit_harness_prompts.py

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index a0c75c133..b3215a6c1 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -345,7 +345,7 @@ class Metrics(Enum):
         metric_name="mf1",
         sample_level_fn=LoglikelihoodPreparator(is_single_token=True),
         category=SamplingMethod.LOGPROBS,
-        corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
+        corpus_level_fn=CorpusLevelF1Score(average="micro", num_classes=3),
         higher_is_better=True,
     )
     pass_at_k = SampleLevelMetric(
diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index 09018bf70..b7d4290f5 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -94,7 +94,11 @@ def compute_corpus(self, items: list[LogprobCorpusMetricInput]):
         # Multi f1
         f1s = []
         for i in range(self.num_classes):
-            f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
+            f1s.append(
+                sklearn.metrics.f1_score(
+                    y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average
+                )
+            )
         return float(np.mean(f1s))
 
 
diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py
index fe9e9f40e..85b1e2bc6 100644
--- a/src/lighteval/metrics/utils/metric_utils.py
+++ b/src/lighteval/metrics/utils/metric_utils.py
@@ -83,8 +83,6 @@ def __call__(self, sample_params: dict | None):
 
         # Once the parameters are updated, we need to adjust the
         # metric name to what will be returned
-        # if "math-g-pass" in self.metric_name:
-        # breakpoint()
         sample_params_name = "&".join(sample_params.keys())
         if isinstance(self, MetricGrouping):
             if hasattr(self.sample_level_fn, "metric_names"):
diff --git a/tests/test_unit_base_metrics.py b/tests/test_unit_base_metrics.py
new file mode 100644
index 000000000..575ebf595
--- /dev/null
+++ b/tests/test_unit_base_metrics.py
@@ -0,0 +1,340 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import numpy as np
+import pytest
+
+from lighteval.metrics.dynamic_metrics import (
+    LogLikelihoodAccMetric,
+    MultilingualQuasiExactMatchMetric,
+    MultilingualQuasiF1ScoreMetric,
+    NormalizedMultiChoiceProbMetric,
+    ProbabilityMetric,
+)
+from lighteval.metrics.metrics_sample import ExactMatches
+from lighteval.metrics.normalizations import LogProbCharNorm, helm_normalizer
+from lighteval.models.model_output import ModelResponse
+from lighteval.tasks.requests import Doc
+from lighteval.utils.language import Language
+
+
+class TestBaseMetrics:
+    def test_exact_match(self):
+        em = ExactMatches(strip_strings=True)
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            " The quick brown fox jumps over the lazy dog\n",
+            "\n The quick brown fox jumps over the lazy dog ",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog.",
+        )
+        assert res == 0
+
+        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
+        assert res == 0
+
+        res = em.compute_one_item("", "")
+        assert res == 0
+
+    def test_quasi_exact_match(self):
+        em = ExactMatches(normalize_gold=helm_normalizer, normalize_pred=helm_normalizer)
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            " The quick brown fox jumps over the lazy dog\n",
+            "\n The quick brown fox jumps over the lazy dog ",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog.",
+        )
+        assert res == 1
+
+        res = em.compute_one_item("the quick brown fox, jumps over lazy dog", "quick brown fox jumps over lazy dog.")
+        assert res == 1
+
+        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
+        assert res == 0
+
+        res = em.compute_one_item("", "")
+        assert res == 0
+
+    def test_prefix_exact_match(self):
+        em = ExactMatches(
+            strip_strings=True,
+            type_exact_match="prefix",
+        )
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog. And some other stories.",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "  The quick brown fox jumps over the lazy dog\n",
+            "\n The quick brown fox jumps over the lazy dog",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog.",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "the quick brown fox jumps over lazy dog. And some other stories.",
+        )
+        assert res == 0
+
+        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
+        assert res == 0
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "Complete mismatch",
+        )
+        assert res == 0
+
+        res = em.compute_one_item("", "")
+        assert res == 0
+
+    def test_prefix_quasi_exact_match(self):
+        em = ExactMatches(
+            normalize_gold=helm_normalizer,
+            normalize_pred=helm_normalizer,
+            type_exact_match="prefix",
+        )
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog. And some other stories.",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick Brown fox jumps over the lazy dog",
+            "the quick brown fox jumps over lazy dog. And some other stories.",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "  The quick brown fox jumps over the lazy dog\n",
+            "\n The quick brown fox jumps over the lazy dog",
+        )
+        assert res == 1
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog.",
+        )
+        assert res == 1
+
+        res = em.compute_one_item("The quick brown fox jumps over the lazy dog", "")
+        assert res == 0
+
+        res = em.compute_one_item(
+            "The quick brown fox jumps over the lazy dog",
+            "Complete mismatch",
+        )
+        assert res == 0
+
+        res = em.compute_one_item("", "")
+        assert res == 0
+
+    def test_prob(self):
+        doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test")
+
+        # Simple case
+        model_response = ModelResponse(logprobs=np.log([0.7]))
+        prob_metric = ProbabilityMetric()
+        result = prob_metric.compute_sample(doc=doc, model_response=model_response)
+        assert result[prob_metric.metric_name] == pytest.approx(0.7)
+
+        # Aggregation function test
+        model_response = ModelResponse(logprobs=np.log([0.7, 0.1]))
+        prob_min_metric = ProbabilityMetric(aggregation_function=np.min)
+        result = prob_min_metric.compute_sample(doc=doc, model_response=model_response)
+        assert result[prob_metric.metric_name] == pytest.approx(0.1)
+
+    def test_mc_probability_metric(self):
+        doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test")
+        model_response = ModelResponse(logprobs=np.log([0.35, 0.1, 0.05]))
+
+        mc_prob_metric = NormalizedMultiChoiceProbMetric()
+
+        result = mc_prob_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[mc_prob_metric.metric_name] == pytest.approx(0.7)
+
+        doc = Doc(query="Test query", choices=["AA", "BB", "CCC"], gold_index=1, task_name="test")
+        model_response = ModelResponse(logprobs=np.log([0.1**2, 0.35**2, 0.05**3]))
+
+        prob_norm_metric = NormalizedMultiChoiceProbMetric(normalization=LogProbCharNorm())
+        result = prob_norm_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[prob_norm_metric.metric_name] == pytest.approx(0.7)
+
+    def test_acc(self):
+        # Test without normalization
+        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test")
+        model_response = ModelResponse(logprobs=np.log([0.7, 0.2, 0.3, 0.4]))
+
+        acc_metric = LogLikelihoodAccMetric()
+        result = acc_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[acc_metric.metric_name] == 1  # The highest logprob (3.0) is at index 3, which is not in gold_ixs
+
+        # Test 0 acc
+        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=0, task_name="test")
+        model_response = ModelResponse(logprobs=np.log([0.1, 0.2, 0.3, 0.4]))
+        result = acc_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[acc_metric.metric_name] == 0
+
+        # Test with normalization
+        doc = Doc(query="Test query", choices=["ABCDE", "AB"], gold_index=0, task_name="test")
+        model_response = ModelResponse(logprobs=np.log([0.5, 0.6]))
+        acc_norm_metric = LogLikelihoodAccMetric(normalization=LogProbCharNorm())
+        result_norm = acc_norm_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert (
+            result_norm[acc_norm_metric.metric_name] == 1
+        )  # After normalization, "ABCDE" should have the highest score
+
+        # Test with multiple correct solutions
+        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 3], task_name="test")
+        model_response = ModelResponse(logprobs=np.log([0.5, 0.6, 0.7, 0.8]))
+        result_multi = acc_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result_multi[acc_metric.metric_name] == 1
+
+        # Test when the highest logprob is not in gold_ixs
+        doc = Doc(query="Test query", choices=["A", "B", "C", "D"], gold_index=[1, 2], task_name="test")
+        model_response = ModelResponse(logprobs=[0.5, 0.6, 0.7, 0.8])
+        result_incorrect = acc_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result_incorrect[acc_metric.metric_name] == 0
+
+    def test_f1_dynamic_metric(self):
+        """
+        Tests that normalization works correctly. We don't test the behavior of the F1_score class as it should be already tested.
+        """
+
+        doc = Doc(query="Test query", choices=["hello world"], gold_index=[0], task_name="test")
+        model_response = ModelResponse(text=["hello, the world"])
+
+        # Normalization test
+        f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH)
+        result = f1_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[f1_metric.metric_name] == 1
+
+        model_response = ModelResponse(text=["hello, the world how"])
+        f1_metric = MultilingualQuasiF1ScoreMetric(language=Language.ENGLISH, aggregation_function=np.min)
+        result = f1_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        # 2 * (precision * recall) / (precision + recall) = 2 * (1 * 2/3) / (1 + 2/3) = 0.8
+        assert result[f1_metric.metric_name] == 0.8
+
+    def test_exact_match_dynamic_metric(self):
+        """
+        Tests that normalization works correctly. We don't test the behavior of the ExactMatch class as it should be already tested.
+        """
+        doc = Doc(query="Test query", choices=["hello world"], gold_index=[0], task_name="test")
+        model_response = ModelResponse(text=["hello, the world"])
+
+        # Normalization test
+        em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full")
+        result = em_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[em_metric.metric_name] == 1
+
+        model_response = ModelResponse(text=["hello, the world how"])
+        em_metric = MultilingualQuasiExactMatchMetric(language=Language.ENGLISH, match_type="full")
+        result = em_metric.compute_sample(
+            doc=doc,
+            model_response=model_response,
+        )
+        assert result[em_metric.metric_name] == 0
+
+    @pytest.mark.skip(reason="Need to understand what it does.")
+    def test_pass_at_k_estimator(self):
+        assert False
+
+    @pytest.mark.skip(reason="Using nltk metric function, no need to test.")
+    def test_f1_score_quasi(self):
+        assert False
+
+    @pytest.mark.skip(reason="Using nltk metric function, no need to test.")
+    def test_f1(self):
+        assert False
diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py
new file mode 100644
index 000000000..6d1764593
--- /dev/null
+++ b/tests/test_unit_harness_metrics.py
@@ -0,0 +1,139 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import os
+
+import pytest
+
+from lighteval.metrics import apply_metric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.sample_preparator import (
+    GenerativeCorpusMetricInput,
+    LogprobCorpusMetricInput,
+    PerplexityCorpusMetricInput,
+)
+from lighteval.models.model_output import ModelResponse
+from lighteval.tasks.requests import Doc
+from lighteval.utils.utils import as_list
+
+
+PATH_TO_HARNESS_METRICS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_metrics.json")
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc):
+    """Initializes the main test setup. This function is automatically called by pytest and
+    should not be called manually.
+
+    Every function with "model_input" as arguments will be sent the "parameters".
+    This function will be run only once, ensuring that each model is run only once on the selected tasks.
+    (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want).
+    """
+    parameters = []
+
+    # If model_input is a test function argument
+    # (= the function requires a fixture)
+    if "prompt_inputs" in metafunc.fixturenames:
+        with open(PATH_TO_HARNESS_METRICS) as f:
+            metric_to_examples = json.load(f)
+
+            for metric, examples in metric_to_examples.items():
+                for task_name, examples_list in examples.items():
+                    parameters.append((metric, task_name, examples_list))
+        metafunc.parametrize("prompt_inputs", parameters, scope="session")
+
+
+def test_model_prediction(prompt_inputs: tuple[str, str, list]):  # noqa: C901
+    """Evaluates a model on a full task - is parametrized using pytest_generate_test"""
+    metric, task_name, examples = prompt_inputs
+    metric_name = metric
+    metric = Metrics[metric].value
+
+    for example in examples:
+        doc = {
+            k: v
+            for k, v in example.items()
+            if k in ["full_prompt", "choices", "gold_index", "original_query", "specific"]
+        }
+        doc["query"] = doc.pop("full_prompt")
+        doc = Doc(**doc)
+        error_msg = f"Metric {metric_name} failed on input {doc} from task {task_name}.\n"
+
+        match example["predictions"]:
+            case [first_element, *_] if isinstance(first_element, str):
+                # If the predictions are a list of strings, we assume it's a generative task
+                responses = [ModelResponse(text=example["predictions"], output_tokens=[[]], input_tokens=[])]
+            case [first_element, *_] if isinstance(first_element, float):
+                # If the predictions are a list of floats, we assume it's a logprob task
+                responses = [ModelResponse(logprobs=example["predictions"], output_tokens=[[]], input_tokens=[])]
+            case [first_element, *_] if len(first_element) == 2 and isinstance(first_element[1], bool):
+                # If the predictions are a list of lists with two elements, we assume it's a loglikelihood task with argmax
+                responses = [
+                    ModelResponse(
+                        logprobs=[pred[0] for pred in example["predictions"]],
+                        argmax_logits_eq_gold=[pred[1] for pred in example["predictions"]],
+                        output_tokens=[[]],
+                        input_tokens=[],
+                    )
+                ]
+            case _:
+                # If the predictions are not a list of strings or floats, we assume it's a custom task
+                responses = [ModelResponse(logprobs=example["predictions"][0], input_tokens=[])]
+
+        results = apply_metric(responses=responses, docs=[doc], metrics=[metric])[0]
+        assert responses is not None, error_msg
+
+        metric_result = {k: list(v) if isinstance(v, tuple) else v for k, v in results.items()}
+
+        metric_reference = {k: example[k] for k in results.keys()}
+        error_msg += f"Prediction: {results}\n"
+        error_msg += f"Reference: {metric_reference}\n"
+        error_msg += f"Returned : {metric_result}"
+
+        for key in metric_result.keys():
+            if type(metric_result[key]) in [
+                LogprobCorpusMetricInput,
+                GenerativeCorpusMetricInput,
+                PerplexityCorpusMetricInput,
+            ]:
+                cur_result_list = as_list(metric_result[key].to_dict())
+            else:
+                cur_result_list = as_list(metric_result[key])
+            cur_ref_list = as_list(metric_reference[key])
+
+            # item wise comparison of lists
+            if isinstance(cur_result_list[0], list):
+                for res, ref in zip(cur_result_list, cur_ref_list):
+                    try:
+                        assert res == pytest.approx(ref, rel=1e-8), error_msg
+                    except Exception:
+                        assert False, (
+                            key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
+                        )
+            else:
+                try:
+                    assert cur_result_list == pytest.approx(cur_ref_list, rel=1e-8), error_msg
+                except Exception:
+                    # assert False, error_msg + "\n" + str(e)
+                    assert False, (
+                        key + "\n" + str(cur_result_list) + "\n" + str(cur_ref_list) + "\n" + task_name + "\n"
+                    )
diff --git a/tests/test_unit_harness_prompts.py b/tests/test_unit_harness_prompts.py
new file mode 100644
index 000000000..6c8233fdc
--- /dev/null
+++ b/tests/test_unit_harness_prompts.py
@@ -0,0 +1,75 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import os
+
+import pytest
+
+import lighteval.tasks.default_prompts as default_prompts
+from lighteval.tasks.requests import Doc
+
+
+PATH_TO_HARNESS_PROMPTS = os.path.join(os.path.dirname(__file__), "reference_scores/harness_prompts.json")
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc):
+    """Initializes the main test setup. This function is automatically called by pytest and
+    should not be called manually.
+
+    Every function with "model_input" as arguments will be sent the "parameters".
+    This function will be run only once, ensuring that each model is run only once on the selected tasks.
+    (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want).
+    """
+    parameters = []
+
+    # If model_input is a test function argument
+    # (= the function requires a fixture)
+    if "prompt_inputs" in metafunc.fixturenames:
+        with open(PATH_TO_HARNESS_PROMPTS) as f:
+            prompt_fn_to_examples = json.load(f)
+
+            for prompt_fn_name, examples in prompt_fn_to_examples.items():
+                formatter_fn = getattr(default_prompts, prompt_fn_name)
+
+                cur_params = []
+
+                for task_name, examples_list in examples.items():
+                    for input_line, reference_line in examples_list:
+                        cur_params.append((formatter_fn, input_line, reference_line, task_name))
+                parameters.append((prompt_fn_name, cur_params))
+        metafunc.parametrize("prompt_inputs", parameters, scope="session")
+
+
+def test_model_prediction(prompt_inputs: tuple[str, list]):
+    """Evaluates a model on a full task - is parametrized using pytest_generate_test"""
+    prompt_fn_name, examples = prompt_inputs
+    for prompt_fn, input_line, reference_line, task_name in examples:
+        formatted_line = prompt_fn(input_line, "")  # task_name)
+        reference_line = Doc(**reference_line)
+
+        error_msg = (
+            f"Prompt formatting function {prompt_fn_name} failed on input {input_line} from task {task_name}.\n"
+        )
+        error_msg += f"Reference: {reference_line}\n"
+        error_msg += f"Returned : {formatted_line}"
+        assert formatted_line == reference_line, error_msg
diff --git a/tests/unit/metrics/test_cases/bleu.json b/tests/unit/metrics/test_cases/bleu.json
index 15e03d907..444fb8bab 100644
--- a/tests/unit/metrics/test_cases/bleu.json
+++ b/tests/unit/metrics/test_cases/bleu.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bac803950c223280611f63dda6d0bbc6e78bac0b270a7674429311406ddc5035
-size 891
+oid sha256:a828db1108f217aeece39ca279745ac933d706dcd8bd940269b767f40c3c4fe7
+size 4453
diff --git a/tests/unit/metrics/test_cases/bleu_1.json b/tests/unit/metrics/test_cases/bleu_1.json
index 238a62928..645689001 100644
--- a/tests/unit/metrics/test_cases/bleu_1.json
+++ b/tests/unit/metrics/test_cases/bleu_1.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c7c63beea1027629eb285c861b5850fc04740106a568ecf8d19622163706283e
-size 903
+oid sha256:e4b245d309e6a9f6d6bf080b44646153eefe4d56aceab565dcd832fab46cc3a3
+size 2805
diff --git a/tests/unit/metrics/test_cases/bleu_4.json b/tests/unit/metrics/test_cases/bleu_4.json
index 252c4b02e..37cdb4c70 100644
--- a/tests/unit/metrics/test_cases/bleu_4.json
+++ b/tests/unit/metrics/test_cases/bleu_4.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0191660dc5bbdf7dd04cd58b2910ec8c741a93c6252d5cb8c2686382137da073
-size 903
+oid sha256:4e2a2b2381d1d3c0184c11c22c97028313c178a2f94dd58059866695b77c7eac
+size 3432
diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json
index 6d8613f29..d250f2f2b 100644
--- a/tests/unit/metrics/test_cases/chrf.json
+++ b/tests/unit/metrics/test_cases/chrf.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3e144f94ef8e119ec32454573c11d969090c6ddf0aa85b17354543223b2d1a92
-size 891
+oid sha256:14e677f08edfb5075319e10a70756ee1da9a9d6a850fdfb36798aaeb641077c4
+size 5653
diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json
index fb63d59e4..caa14fb1d 100644
--- a/tests/unit/metrics/test_cases/chrf_plus.json
+++ b/tests/unit/metrics/test_cases/chrf_plus.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c87e1da3227bcd0ce18af1463f47c0c19299350ec247b1813233b0cc139de145
-size 923
+oid sha256:e1abfc1c9a2c74215af46cedce6183e9cf519347121f435c9a6706bac70d9d3d
+size 4564
diff --git a/tests/unit/metrics/test_cases/f1_score_macro.json b/tests/unit/metrics/test_cases/f1_score_macro.json
index 219b3815e..3bfe7b48d 100644
--- a/tests/unit/metrics/test_cases/f1_score_macro.json
+++ b/tests/unit/metrics/test_cases/f1_score_macro.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3fb1c48d29ea568c0b3e1928fc7852f0dc58205ba17bb2caf849d7390e6d52e2
-size 949
+oid sha256:16afb1546b7c1d3a45f4e14aea9c537b1249fa6b9281f4550d0e1d858a41eae2
+size 4433
diff --git a/tests/unit/metrics/test_cases/f1_score_micro.json b/tests/unit/metrics/test_cases/f1_score_micro.json
index bffa0896f..0816a25a0 100644
--- a/tests/unit/metrics/test_cases/f1_score_micro.json
+++ b/tests/unit/metrics/test_cases/f1_score_micro.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ff067c9e17d82788867c4bff4c4e4fcc9390da0d2d327a5b5c3ec9c4a102fcc
-size 949
+oid sha256:8c7f8820db3a770299e494ebc051c4892eadcc17c97ffe7e2947299611b1eea2
+size 4435
diff --git a/tests/unit/metrics/test_cases/mcc.json b/tests/unit/metrics/test_cases/mcc.json
index 7fe61d007..d3e983260 100644
--- a/tests/unit/metrics/test_cases/mcc.json
+++ b/tests/unit/metrics/test_cases/mcc.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e689b7971e13f8dcec41c5f873158b32d2e0646feba762fe92405dd0bd39215c
-size 884
+oid sha256:8a788e8bdaed81f8fe63081297b60986ad101b4bd2c6681cef850da64b532a17
+size 1227
diff --git a/tests/unit/metrics/test_cases/multi_f1_numeric.json b/tests/unit/metrics/test_cases/multi_f1_numeric.json
index 17d18c1d7..596f700f8 100644
--- a/tests/unit/metrics/test_cases/multi_f1_numeric.json
+++ b/tests/unit/metrics/test_cases/multi_f1_numeric.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5033944de260bfe4a0fe14eebb87b1e370f9a92d1c54883722134f60fa032d93
-size 961
+oid sha256:f3c67192247f89487d12384b15c95bd4a64ec2cbcf882ad00339c99754b3b794
+size 4955
diff --git a/tests/unit/metrics/test_cases/target_perplexity.json b/tests/unit/metrics/test_cases/target_perplexity.json
index 1c63104e0..f4c859650 100644
--- a/tests/unit/metrics/test_cases/target_perplexity.json
+++ b/tests/unit/metrics/test_cases/target_perplexity.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f5d79b4c0f5ef2e65a20974d50fe322b57263bc598599d2a7c257d88b30b38e
-size 982
+oid sha256:d4176078edb4639416286ca6f12d0b2903f3f232f8d1b7374becbe1da88a52ce
+size 2913
diff --git a/tests/unit/metrics/test_cases/ter.json b/tests/unit/metrics/test_cases/ter.json
index 3bcf09f7c..724103bfa 100644
--- a/tests/unit/metrics/test_cases/ter.json
+++ b/tests/unit/metrics/test_cases/ter.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:628eb548f3cff4994449eb6788ca374bec65b3e20b73dd69f58deefe6522e589
-size 884
+oid sha256:cb94c167efc2fa8da3c58ae0552cbfb87b4cced5bb7474e1d1b7965680fc4d3d
+size 4733
diff --git a/tests/unit/metrics/test_cases/word_perplexity.json b/tests/unit/metrics/test_cases/word_perplexity.json
index 6fd35f398..4aa518a0b 100644
--- a/tests/unit/metrics/test_cases/word_perplexity.json
+++ b/tests/unit/metrics/test_cases/word_perplexity.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1389311e25a87a629aef62751d274fc55a743564078f2cbb90e67d159fe8a4e5
-size 968
+oid sha256:c6c97b916e429463d07d9e8680e392ee757b409c614e758047599b133119bd1c
+size 3421
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index 892984307..7db477920 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -31,9 +31,10 @@
 import copy
 import json
 import logging
+import math
 from dataclasses import field
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -50,20 +51,35 @@ class MetricTestCase(BaseModel):
 
     name: str
     metric_class: str
-    metric_params: Dict[str, Any] = field(default_factory=dict)
-    doc: Dict[str, Any]
-    model_response: Dict[str, Any]
-    expected_output: Union[float, Dict[str, float]]
+    metric_params: dict[str, Any] = field(default_factory=dict)
+    doc: dict[str, Any]
+    model_response: dict[str, Any]
+    expected_output: dict[str, float]
     tolerance: float = 1e-2
-    description: Optional[str] = None
+    description: str | None = None
+
+
+class CorpusLevelMetricTestCase(BaseModel):
+    """A test case for a corpus level metric with input and expected output."""
+
+    name: str
+    metric_class: str
+    metric_name: str
+    metric_params: dict[str, Any] = field(default_factory=dict)
+    docs: list[dict[str, Any]]
+    model_responses: list[dict[str, Any]]
+    expected_output: float
+    tolerance: float = 1e-2
+    description: str | None = None
 
 
 class MetricTestSuite(BaseModel):
     """A collection of test cases for metrics."""
 
     name: str
-    test_cases: List[MetricTestCase]
-    description: Optional[str] = None
+    test_cases: list[MetricTestCase | CorpusLevelMetricTestCase]
+    corpus_level: bool = False
+    description: str | None = None
 
 
 class AutomatedMetricTester:
@@ -100,31 +116,31 @@ class AutomatedMetricTester:
         "expr_gold_metric": Metrics.expr_gold_metric,
         "acc_golds_likelihood": Metrics.acc_golds_likelihood,
         "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,
-        "faithfulness": Metrics.faithfulness,  # issue with tokenizer
-        # "prediction_perplexity": Metrics.prediction_perplexity,
-        # "target_perplexity": Metrics.target_perplexity,
+        # "faithfulness": Metrics.faithfulness,  # need GPU to run
         # "bert_score": Metrics.bert_score, issue with the scoring function, int too big to convert
+        "prediction_perplexity": Metrics.prediction_perplexity,
         # "simpleqa_judge": Metrics.simpleqa_judge, Batched metrics not supported yet
-        # "bleu": Metrics.bleu,
-        # "bleu_1": Metrics.bleu_1,
-        # "bleu_4": Metrics.bleu_4,
-        # "bits_per_byte": Metrics.bits_per_byte,
-        # "byte_perplexity": Metrics.byte_perplexity,
-        # "chrf": Metrics.chrf,
-        # "chrf_plus": Metrics.chrf_plus,
-        # "loglikelihood_f1": Metrics.loglikelihood_f1,
-        # "multi_f1_numeric": Metrics.multi_f1_numeric,
-        # "ter": Metrics.ter,
-        # "word_perplexity": Metrics.word_perplexity,
-        # "f1_score_macro": Metrics.f1_score_macro,
-        # "f1_score_micro": Metrics.f1_score_micro,
-        # "mcc": Metrics.mcc,
+        "bleu": Metrics.bleu,
+        "bleu_1": Metrics.bleu_1,
+        "bleu_4": Metrics.bleu_4,
+        "bits_per_byte": Metrics.bits_per_byte,
+        "byte_perplexity": Metrics.byte_perplexity,
+        "target_perplexity": Metrics.target_perplexity,
+        "chrf": Metrics.chrf,
+        "chrf_plus": Metrics.chrf_plus,
+        "loglikelihood_f1": Metrics.loglikelihood_f1,
+        "multi_f1_numeric": Metrics.multi_f1_numeric,
+        "ter": Metrics.ter,
+        "word_perplexity": Metrics.word_perplexity,
+        "f1_score_macro": Metrics.f1_score_macro,
+        "f1_score_micro": Metrics.f1_score_micro,
+        "mcc": Metrics.mcc,
     }
 
     def __init__(self):
         self.test_results = []
 
-    def create_doc_from_dict(self, doc_dict: Dict[str, Any]) -> Doc:
+    def create_doc_from_dict(self, doc_dict: dict[str, Any]) -> Doc:
         """Create a Doc object from a dictionary representation."""
         return Doc(
             query=doc_dict.get("query", ""),
@@ -134,7 +150,7 @@ def create_doc_from_dict(self, doc_dict: Dict[str, Any]) -> Doc:
             specific=doc_dict.get("specific", {}),
         )
 
-    def create_model_response_from_dict(self, response_dict: Dict[str, Any]) -> ModelResponse:
+    def create_model_response_from_dict(self, response_dict: dict[str, Any]) -> ModelResponse:
         """Create a ModelResponse object from a dictionary representation."""
         return ModelResponse(
             text=response_dict.get("text", []),
@@ -143,7 +159,7 @@ def create_model_response_from_dict(self, response_dict: Dict[str, Any]) -> Mode
             argmax_logits_eq_gold=response_dict.get("argmax_logits_eq_gold", []),
         )
 
-    def instantiate_metric(self, metric_class: str, metric_params: Dict[str, Any]):
+    def instantiate_metric(self, metric_class: str, metric_params: dict[str, Any]):
         """Get a metric from the Metrics enum with the given parameters."""
         if metric_class not in self.METRIC_CLASSES:
             raise ValueError(f"Unknown metric class: {metric_class}")
@@ -159,7 +175,7 @@ def instantiate_metric(self, metric_class: str, metric_params: Dict[str, Any]):
         # The metric_params are ignored for now since the Metrics enum values are pre-configured
         return metric_enum_value
 
-    def run_test_case(self, test_case: MetricTestCase) -> Dict[str, Any]:
+    def run_test_case(self, test_case: MetricTestCase | CorpusLevelMetricTestCase) -> dict[str, Any]:
         """Run a single test case and return the result."""
         # Check if metric is available in METRIC_CLASSES
         if test_case.metric_class not in self.METRIC_CLASSES:
@@ -176,7 +192,30 @@ def run_test_case(self, test_case: MetricTestCase) -> Dict[str, Any]:
         # Get the metric from the Metrics enum
         metric = self.instantiate_metric(test_case.metric_class, test_case.metric_params)
 
-        # Create input objects
+        if isinstance(test_case, CorpusLevelMetricTestCase):
+            docs = [self.create_doc_from_dict(doc) for doc in test_case.docs]
+            model_responses = [
+                self.create_model_response_from_dict(response) for response in test_case.model_responses
+            ]
+            aggregation_function = metric.get_corpus_aggregations()[metric.metric_name]
+            outputs_per_sample = [
+                metric.compute_sample(doc=doc, model_response=model_response)[test_case.metric_name]
+                for doc, model_response in zip(docs, model_responses)
+            ]
+            actual_output = aggregation_function(outputs_per_sample)
+
+            success = self._compare_dict_outputs(actual_output, test_case.expected_output, test_case.tolerance)
+
+            return {
+                "test_case": test_case.name,
+                "success": success,
+                "error": None,
+                "skipped": False,
+                "skip_reason": None,
+                "actual": actual_output,
+                "expected": test_case.expected_output,
+            }
+
         doc = self.create_doc_from_dict(test_case.doc)
         model_response = self.create_model_response_from_dict(test_case.model_response)
 
@@ -200,20 +239,28 @@ def run_test_case(self, test_case: MetricTestCase) -> Dict[str, Any]:
             "skipped": False,
         }
 
-    def _compare_scalar_outputs(self, actual: Any, expected: float, tolerance: float) -> bool:
+    def _compare_scalar_outputs(self, actual: Any, expected: Any, tolerance: float) -> bool:
         """Compare scalar outputs with tolerance."""
         if isinstance(actual, (int, float)) and isinstance(expected, (int, float)):
-            return abs(actual - expected) <= tolerance
+            # For small values, use absolute tolerance only to avoid relative tolerance issues
+            # For values >= 1.0, we can use relative tolerance
+            if abs(expected) < 1.0:
+                return math.isclose(actual, expected, abs_tol=tolerance)
+            else:
+                return math.isclose(actual, expected, rel_tol=tolerance, abs_tol=tolerance)
         return actual == expected
 
-    def _compare_dict_outputs(self, actual: Dict[str, Any], expected: Dict[str, float], tolerance: float) -> bool:
-        """Compare dictionary outputs with tolerance."""
+    def _compare_dict_outputs(self, actual: Any, expected: Any, tolerance: float) -> bool:
+        """Compare outputs with tolerance. Handles both dict and scalar types."""
+        # If either is not a dict, treat as scalar comparison
         if not isinstance(actual, dict) or not isinstance(expected, dict):
-            return actual == expected
+            return self._compare_scalar_outputs(actual, expected, tolerance)
 
+        # Both are dicts, compare keys first
         if set(actual.keys()) != set(expected.keys()):
             return False
 
+        # Compare each value
         for key in actual.keys():
             actual_value = actual[key]
             expected_value = expected[key]
@@ -231,7 +278,7 @@ def _compare_dict_outputs(self, actual: Dict[str, Any], expected: Dict[str, floa
 
         return True
 
-    def run_test_suite(self, test_suite: MetricTestSuite) -> List[Dict[str, Any]]:
+    def run_test_suite(self, test_suite: MetricTestSuite) -> list[dict[str, Any]]:
         """Run a complete test suite and return results."""
         logger.info(f"Running test suite: {test_suite.name}")
         if test_suite.description:
@@ -256,7 +303,7 @@ def run_test_suite(self, test_suite: MetricTestSuite) -> List[Dict[str, Any]]:
 
         return results
 
-    def run_test_suites_from_file(self, file_path: Union[str, Path]) -> List[Dict[str, Any]]:
+    def run_test_suites_from_file(self, file_path: str | Path) -> list[dict[str, Any]]:
         """Run test suites from a JSON file."""
         with open(file_path, "r") as f:
             data = json.load(f)

From fc01e6b70fcb156ab5f6999e94b855c7e4a974e0 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 3 Sep 2025 14:07:54 +0000
Subject: [PATCH 13/26] fix bleu metric

---
 src/lighteval/metrics/metrics_corpus.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index b7d4290f5..238c5ecde 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -114,6 +114,9 @@ def __init__(self, metric_type: str, lang: Literal["zh", "ja", "ko", ""] = ""):
 
     def get_metric(self):
         if self.metric_type == "bleu":
+            import nltk
+
+            nltk.download("punkt_tab")
             return sacrebleu.BLEU(trg_lang=self.lang)
         elif self.metric_type == "chrf":
             return sacrebleu.CHRF()

From c574035611def77d3cd2baa51bbd8baac075b957 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Wed, 3 Sep 2025 14:13:12 +0000
Subject: [PATCH 14/26] fix bleu metric

---
 src/lighteval/metrics/metrics_sample.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index cf8b7d2ab..eb2dac36c 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -811,6 +811,9 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         Returns:
             float: Score over the current sample's items.
         """
+        import nltk
+
+        nltk.download("punkt_tab")
         golds = doc.get_golds()
         predictions = model_response.final_text
         return np.mean([self._bleu_score(golds, p) for p in predictions])

From 51db82806fd69cd826d337f449620ffa254bf7fd Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 8 Sep 2025 12:55:53 +0000
Subject: [PATCH 15/26] fix tests after merge

---
 src/lighteval/metrics/metrics_sample.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index a3bbe330c..4569d4e84 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1109,6 +1109,7 @@ def __init__(
                 raise ValueError(f"Unknown normalization function: {normalize}")
         else:
             self.normalize = normalize
+
         self.strip_strings = strip_strings
 
         if callable(sample_scoring_function):
@@ -1203,19 +1204,18 @@ def __init__(self, k: int | None = None, **kwargs):
             k (int): The number of top choices to consider.
             **kwargs: Additional keyword arguments.
         """
-        super().__init__(kwargs)
+        super().__init__(**kwargs)
 
         self.k = k
         self.attribute_must_be_set = ["k"]
 
-    def compute(self, doc: Doc, model_response: ModelResponse):
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         """Computes the metric over a list of golds and predictions for one single sample.
-        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
-        then compares it to the gold.
+        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold.
 
         Args:
+            doc (Doc): The document containing gold references.
             model_response (ModelResponse): The model's response containing predictions.
-            docs (Doc): The document containing gold references.
             **kwargs: Additional keyword arguments.
 
         Returns:

From 70a5a10a3bb3b4eafe79cdfc356c792085866e27 Mon Sep 17 00:00:00 2001
From: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Date: Mon, 8 Sep 2025 14:59:48 +0200
Subject: [PATCH 16/26] Delete tests/slow_tests/test_sglang_model.py

---
 tests/slow_tests/test_sglang_model.py | 101 --------------------------
 1 file changed, 101 deletions(-)
 delete mode 100644 tests/slow_tests/test_sglang_model.py

diff --git a/tests/slow_tests/test_sglang_model.py b/tests/slow_tests/test_sglang_model.py
deleted file mode 100644
index c98b364ed..000000000
--- a/tests/slow_tests/test_sglang_model.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import json
-import os
-from functools import lru_cache, partial
-from typing import Callable, Tuple
-
-import pytest
-from deepdiff import DeepDiff
-
-from lighteval.main_sglang import sglang  # noqa: E402
-
-
-# Set env var for deterministic run of models
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-
-MODELS_ARGS = [
-    {
-        "model_name": "examples/model_configs/sglang_model_config.yaml",
-        "use_chat_template": True,
-        "results_file": "tests/reference_scores/Mistral-7B-Instruct-results-sglang.json",
-    }
-]
-
-TASKS_PATH = "examples/test_tasks.txt"
-CUSTOM_TASKS_PATH = "examples/custom_tasks_tests.py"
-
-ModelInput = Tuple[str, Callable[[], dict]]
-
-
-@lru_cache(maxsize=len(MODELS_ARGS))
-def run_model(model_name: str, use_chat_template: bool):
-    """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism"""
-    results = sglang(
-        model_args=model_name,
-        tasks=TASKS_PATH,
-        use_chat_template=use_chat_template,
-        output_dir="",
-        dataset_loading_processes=1,
-        save_details=False,
-        max_samples=10,
-        custom_tasks=CUSTOM_TASKS_PATH,
-    )
-    return results
-
-
-def generate_tests() -> list[ModelInput]:
-    """Generate test parameters for all models and tasks."""
-    tests = []
-    for model_args in MODELS_ARGS:
-        predictions_lite = partial(run_model, model_args["model_name"], model_args["use_chat_template"])
-        tests.append((model_args, predictions_lite))
-    return tests
-
-
-# generates the model predictions parameters at test collection time
-tests: list[ModelInput] = generate_tests()
-ids = [f"{model_input[0]['model_name']}" for model_input in tests]
-
-
-@pytest.mark.parametrize("tests", tests, ids=ids)
-@pytest.mark.skip()
-def test_sglang_model(tests: list[ModelInput]):
-    """Evaluates a SGLang model on a full task - is parametrized using pytest_generate_test"""
-    model_args, get_predictions = tests
-
-    predictions = get_predictions()["results"]
-
-    # Load the reference results
-    with open(model_args["results_file"], "r") as f:
-        reference_results = json.load(f)["results"]
-
-    # Change the key names, replace '|' with ':'
-    reference_results = {k.replace("|", ":"): v for k, v in reference_results.items()}
-
-    # Convert defaultdict values to regular dict for comparison
-    predictions_dict = {k: dict(v) if hasattr(v, "default_factory") else v for k, v in predictions.items()}
-
-    diff = DeepDiff(reference_results, predictions_dict, ignore_numeric_type_changes=True)
-
-    assert diff == {}, f"Differences found: {diff}"

From 6384835ea735882dac12bbede1eeaf05906642f1 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 8 Sep 2025 13:26:14 +0000
Subject: [PATCH 17/26] test simpleqa judge

---
 src/lighteval/metrics/metrics_sample.py       |  2 +-
 .../test_cases/acc_golds_likelihood.json      |  4 ++--
 .../metrics/test_cases/simpleqa_judge.json    |  4 ++--
 tests/unit/metrics/test_metrics_automated.py  | 24 ++++++++++++++-----
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 4569d4e84..a259ccfcd 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1167,7 +1167,7 @@ def __init__(self, k: int | None = None, **kwargs):
         self.k = k
         self.attribute_must_be_set = ["k"]
 
-    def compute(self, model_response: ModelResponse, doc: Doc):
+    def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
         """Computes the metric over a list of golds and predictions for one single sample.
         It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
         then compares it to the gold.
diff --git a/tests/unit/metrics/test_cases/acc_golds_likelihood.json b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
index b41dfd131..a6b4cf5ca 100644
--- a/tests/unit/metrics/test_cases/acc_golds_likelihood.json
+++ b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f486ec84db5c556b13368da3317bd91629eb93f6a25f869c4972cfed61977656
-size 2012
+oid sha256:5fcce7ab58aed69f3f6bbcab853d40ab7867edc75297ce960a0bed80047d1589
+size 1251
diff --git a/tests/unit/metrics/test_cases/simpleqa_judge.json b/tests/unit/metrics/test_cases/simpleqa_judge.json
index 9b565d011..e9b3b9aaa 100644
--- a/tests/unit/metrics/test_cases/simpleqa_judge.json
+++ b/tests/unit/metrics/test_cases/simpleqa_judge.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fd3867c275c1afc6a76bdd7aa1cfc4835d4379f5e1b105167c6738a146854d48
-size 953
+oid sha256:4a64b4778c6c7f8b4a69aaf7eb269b156292eb24fff1a737266dadfb4e04a33a
+size 730
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index 7db477920..c2a937e20 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -118,8 +118,8 @@ class AutomatedMetricTester:
         "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,
         # "faithfulness": Metrics.faithfulness,  # need GPU to run
         # "bert_score": Metrics.bert_score, issue with the scoring function, int too big to convert
+        # "simpleqa_judge": Metrics.simpleqa_judge, # Need to setup for compute costs
         "prediction_perplexity": Metrics.prediction_perplexity,
-        # "simpleqa_judge": Metrics.simpleqa_judge, Batched metrics not supported yet
         "bleu": Metrics.bleu,
         "bleu_1": Metrics.bleu_1,
         "bleu_4": Metrics.bleu_4,
@@ -219,15 +219,27 @@ def run_test_case(self, test_case: MetricTestCase | CorpusLevelMetricTestCase) -
         doc = self.create_doc_from_dict(test_case.doc)
         model_response = self.create_model_response_from_dict(test_case.model_response)
 
-        # Create sample_params for the metric
-        sample_params = {
-            "doc": doc,
-            "model_response": model_response,
-        }
+        # Check if this is a batched metric
+        if hasattr(metric, "batched_compute") and metric.batched_compute:
+            # For batched metrics, we need to pass lists of docs and responses
+            sample_params = {
+                "docs": [doc],
+                "responses": [model_response],
+            }
+        else:
+            # For non-batched metrics, use individual doc and model_response
+            sample_params = {
+                "doc": doc,
+                "model_response": model_response,
+            }
 
         # Run the metric using the Metrics enum value
         actual_output = metric.compute_sample(**sample_params)
 
+        # For batched metrics, extract the first result since we're only testing with one sample
+        if hasattr(metric, "batched_compute") and metric.batched_compute and isinstance(actual_output, list):
+            actual_output = actual_output[0]
+
         # Compare with expected output
         success = self._compare_dict_outputs(actual_output, test_case.expected_output, test_case.tolerance)
         return {

From b5b82a89f04c7ea6e420aeaf8ce67b69c0694646 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Tue, 9 Sep 2025 08:10:39 +0000
Subject: [PATCH 18/26] fix avg at k

---
 src/lighteval/metrics/metrics_sample.py          | 4 ++--
 src/lighteval/models/model_output.py             | 2 +-
 tests/unit/metrics/test_cases/avg_at_k.json      | 4 ++--
 tests/unit/metrics/test_cases/avg_at_k_math.json | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index a259ccfcd..a618900a7 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1181,8 +1181,8 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
             float: Aggregated score over the current sample's items.
         """
         all_scores = []
-        for _ in range(self.k):
-            all_scores.append(self.score_sample(doc, model_response))
+        for i in range(self.k):
+            all_scores.append(self.score_sample(doc, model_response[i]))
 
         avg_score = np.mean(all_scores)
         return avg_score
diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py
index db72cb7df..b10ce7f56 100644
--- a/src/lighteval/models/model_output.py
+++ b/src/lighteval/models/model_output.py
@@ -149,7 +149,7 @@ def __getitem__(self, index: int) -> "ModelResponse":
             input=self.input,
             input_tokens=self.input_tokens,
             text=[self.text[index]],
-            output_tokens=[self.output_tokens[index]],
+            output_tokens=[self.output_tokens[index]] if self.output_tokens else [],
             logprobs=[self.logprobs[index]] if self.logprobs else [],
             argmax_logits_eq_gold=[self.argmax_logits_eq_gold[index]] if self.argmax_logits_eq_gold else [],
             logits=[self.logits[index]] if self.logits else None,
diff --git a/tests/unit/metrics/test_cases/avg_at_k.json b/tests/unit/metrics/test_cases/avg_at_k.json
index 5e315bc51..fe7bdee09 100644
--- a/tests/unit/metrics/test_cases/avg_at_k.json
+++ b/tests/unit/metrics/test_cases/avg_at_k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3e1be6df6efbe74c5bf2c217c81a232e2e154414619e5ffec660ac8a5e0f7aae
-size 1766
+oid sha256:485015fda47e313244e67866e2446e19e8dc837502765cea5200c28646960c9b
+size 1767
diff --git a/tests/unit/metrics/test_cases/avg_at_k_math.json b/tests/unit/metrics/test_cases/avg_at_k_math.json
index 8005cf7d0..c057f7242 100644
--- a/tests/unit/metrics/test_cases/avg_at_k_math.json
+++ b/tests/unit/metrics/test_cases/avg_at_k_math.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7eb34bbc8b34721da79ea6a367160a7f43a16fd5162b5b653f8af67b04c1ca92
+oid sha256:6a7ab341c79ea040f57b5c68dcb53830e6763dfd8006a15bf70e23a5156bd794
 size 1572

From bf740a3dc7fe4a1a4d3f8ce69ed3fb1c58ff6ad9 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 15 Sep 2025 12:57:54 +0000
Subject: [PATCH 19/26] remove test files from git lfs cache

---
 .../unit/metrics/test_cases/acc_golds_likelihood.json  |  3 ---
 tests/unit/metrics/test_cases/avg_at_k.json            |  3 ---
 tests/unit/metrics/test_cases/avg_at_k_math.json       |  3 ---
 tests/unit/metrics/test_cases/bert_score.json          |  3 ---
 tests/unit/metrics/test_cases/bits_per_byte.json       |  3 ---
 tests/unit/metrics/test_cases/bleu.json                |  3 ---
 tests/unit/metrics/test_cases/bleu_1.json              |  3 ---
 tests/unit/metrics/test_cases/bleu_4.json              |  3 ---
 tests/unit/metrics/test_cases/bleurt.json              |  3 ---
 tests/unit/metrics/test_cases/byte_perplexity.json     |  3 ---
 tests/unit/metrics/test_cases/chrf.json                |  3 ---
 tests/unit/metrics/test_cases/chrf_plus.json           |  3 ---
 tests/unit/metrics/test_cases/copyright.json           |  3 ---
 tests/unit/metrics/test_cases/drop.json                |  3 ---
 tests/unit/metrics/test_cases/exact_match.json         |  3 ---
 tests/unit/metrics/test_cases/expr_gold_metric.json    |  3 ---
 tests/unit/metrics/test_cases/extractiveness.json      |  3 ---
 tests/unit/metrics/test_cases/f1_score.json            |  3 ---
 tests/unit/metrics/test_cases/f1_score_macro.json      |  3 ---
 tests/unit/metrics/test_cases/f1_score_micro.json      |  3 ---
 tests/unit/metrics/test_cases/faithfulness.json        |  3 ---
 tests/unit/metrics/test_cases/g_pass_at_k.json         |  3 ---
 tests/unit/metrics/test_cases/g_pass_at_k_latex.json   |  3 ---
 tests/unit/metrics/test_cases/g_pass_at_k_math.json    |  3 ---
 .../unit/metrics/test_cases/gpqa_instruct_metric.json  |  3 ---
 .../metrics/test_cases/gpqa_instruct_pass_at_k.json    |  3 ---
 tests/unit/metrics/test_cases/loglikelihood_acc.json   |  3 ---
 tests/unit/metrics/test_cases/loglikelihood_f1.json    |  3 ---
 tests/unit/metrics/test_cases/maj_at_k.json            |  3 ---
 tests/unit/metrics/test_cases/mcc.json                 |  3 ---
 tests/unit/metrics/test_cases/mrr.json                 |  3 ---
 tests/unit/metrics/test_cases/multi_f1_numeric.json    |  3 ---
 tests/unit/metrics/test_cases/pass_at_k.json           |  3 ---
 tests/unit/metrics/test_cases/pass_at_k_letters.json   |  3 ---
 tests/unit/metrics/test_cases/pass_at_k_math.json      |  3 ---
 .../unit/metrics/test_cases/prediction_perplexity.json |  3 ---
 tests/unit/metrics/test_cases/recall_at_k.json         |  3 ---
 tests/unit/metrics/test_cases/rouge1.json              |  3 ---
 tests/unit/metrics/test_cases/rouge2.json              |  3 ---
 tests/unit/metrics/test_cases/rougeL.json              |  3 ---
 tests/unit/metrics/test_cases/rougeLsum.json           |  3 ---
 tests/unit/metrics/test_cases/rouge_t5.json            |  3 ---
 tests/unit/metrics/test_cases/simpleqa_judge.json      |  3 ---
 tests/unit/metrics/test_cases/target_perplexity.json   |  3 ---
 tests/unit/metrics/test_cases/ter.json                 |  3 ---
 .../unit/metrics/test_cases/truthfulqa_mc_metrics.json |  3 ---
 tests/unit/metrics/test_cases/word_perplexity.json     |  3 ---
 tests/unit/metrics/test_metrics_automated.py           | 10 +++-------
 48 files changed, 3 insertions(+), 148 deletions(-)
 delete mode 100644 tests/unit/metrics/test_cases/acc_golds_likelihood.json
 delete mode 100644 tests/unit/metrics/test_cases/avg_at_k.json
 delete mode 100644 tests/unit/metrics/test_cases/avg_at_k_math.json
 delete mode 100644 tests/unit/metrics/test_cases/bert_score.json
 delete mode 100644 tests/unit/metrics/test_cases/bits_per_byte.json
 delete mode 100644 tests/unit/metrics/test_cases/bleu.json
 delete mode 100644 tests/unit/metrics/test_cases/bleu_1.json
 delete mode 100644 tests/unit/metrics/test_cases/bleu_4.json
 delete mode 100644 tests/unit/metrics/test_cases/bleurt.json
 delete mode 100644 tests/unit/metrics/test_cases/byte_perplexity.json
 delete mode 100644 tests/unit/metrics/test_cases/chrf.json
 delete mode 100644 tests/unit/metrics/test_cases/chrf_plus.json
 delete mode 100644 tests/unit/metrics/test_cases/copyright.json
 delete mode 100644 tests/unit/metrics/test_cases/drop.json
 delete mode 100644 tests/unit/metrics/test_cases/exact_match.json
 delete mode 100644 tests/unit/metrics/test_cases/expr_gold_metric.json
 delete mode 100644 tests/unit/metrics/test_cases/extractiveness.json
 delete mode 100644 tests/unit/metrics/test_cases/f1_score.json
 delete mode 100644 tests/unit/metrics/test_cases/f1_score_macro.json
 delete mode 100644 tests/unit/metrics/test_cases/f1_score_micro.json
 delete mode 100644 tests/unit/metrics/test_cases/faithfulness.json
 delete mode 100644 tests/unit/metrics/test_cases/g_pass_at_k.json
 delete mode 100644 tests/unit/metrics/test_cases/g_pass_at_k_latex.json
 delete mode 100644 tests/unit/metrics/test_cases/g_pass_at_k_math.json
 delete mode 100644 tests/unit/metrics/test_cases/gpqa_instruct_metric.json
 delete mode 100644 tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
 delete mode 100644 tests/unit/metrics/test_cases/loglikelihood_acc.json
 delete mode 100644 tests/unit/metrics/test_cases/loglikelihood_f1.json
 delete mode 100644 tests/unit/metrics/test_cases/maj_at_k.json
 delete mode 100644 tests/unit/metrics/test_cases/mcc.json
 delete mode 100644 tests/unit/metrics/test_cases/mrr.json
 delete mode 100644 tests/unit/metrics/test_cases/multi_f1_numeric.json
 delete mode 100644 tests/unit/metrics/test_cases/pass_at_k.json
 delete mode 100644 tests/unit/metrics/test_cases/pass_at_k_letters.json
 delete mode 100644 tests/unit/metrics/test_cases/pass_at_k_math.json
 delete mode 100644 tests/unit/metrics/test_cases/prediction_perplexity.json
 delete mode 100644 tests/unit/metrics/test_cases/recall_at_k.json
 delete mode 100644 tests/unit/metrics/test_cases/rouge1.json
 delete mode 100644 tests/unit/metrics/test_cases/rouge2.json
 delete mode 100644 tests/unit/metrics/test_cases/rougeL.json
 delete mode 100644 tests/unit/metrics/test_cases/rougeLsum.json
 delete mode 100644 tests/unit/metrics/test_cases/rouge_t5.json
 delete mode 100644 tests/unit/metrics/test_cases/simpleqa_judge.json
 delete mode 100644 tests/unit/metrics/test_cases/target_perplexity.json
 delete mode 100644 tests/unit/metrics/test_cases/ter.json
 delete mode 100644 tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
 delete mode 100644 tests/unit/metrics/test_cases/word_perplexity.json

diff --git a/tests/unit/metrics/test_cases/acc_golds_likelihood.json b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
deleted file mode 100644
index a6b4cf5ca..000000000
--- a/tests/unit/metrics/test_cases/acc_golds_likelihood.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5fcce7ab58aed69f3f6bbcab853d40ab7867edc75297ce960a0bed80047d1589
-size 1251
diff --git a/tests/unit/metrics/test_cases/avg_at_k.json b/tests/unit/metrics/test_cases/avg_at_k.json
deleted file mode 100644
index fe7bdee09..000000000
--- a/tests/unit/metrics/test_cases/avg_at_k.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:485015fda47e313244e67866e2446e19e8dc837502765cea5200c28646960c9b
-size 1767
diff --git a/tests/unit/metrics/test_cases/avg_at_k_math.json b/tests/unit/metrics/test_cases/avg_at_k_math.json
deleted file mode 100644
index c057f7242..000000000
--- a/tests/unit/metrics/test_cases/avg_at_k_math.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6a7ab341c79ea040f57b5c68dcb53830e6763dfd8006a15bf70e23a5156bd794
-size 1572
diff --git a/tests/unit/metrics/test_cases/bert_score.json b/tests/unit/metrics/test_cases/bert_score.json
deleted file mode 100644
index fd9b329e7..000000000
--- a/tests/unit/metrics/test_cases/bert_score.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1f32c2eae678b162629ee1a17cb11c85e29ed774b19a0e769feb3761266a09a2
-size 929
diff --git a/tests/unit/metrics/test_cases/bits_per_byte.json b/tests/unit/metrics/test_cases/bits_per_byte.json
deleted file mode 100644
index 8aa7007e8..000000000
--- a/tests/unit/metrics/test_cases/bits_per_byte.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ba7c2f558287c1cbed6ec62ce42eee3e3864ce3d59fcf20d20b22b21e94e5a17
-size 954
diff --git a/tests/unit/metrics/test_cases/bleu.json b/tests/unit/metrics/test_cases/bleu.json
deleted file mode 100644
index 444fb8bab..000000000
--- a/tests/unit/metrics/test_cases/bleu.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a828db1108f217aeece39ca279745ac933d706dcd8bd940269b767f40c3c4fe7
-size 4453
diff --git a/tests/unit/metrics/test_cases/bleu_1.json b/tests/unit/metrics/test_cases/bleu_1.json
deleted file mode 100644
index 645689001..000000000
--- a/tests/unit/metrics/test_cases/bleu_1.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e4b245d309e6a9f6d6bf080b44646153eefe4d56aceab565dcd832fab46cc3a3
-size 2805
diff --git a/tests/unit/metrics/test_cases/bleu_4.json b/tests/unit/metrics/test_cases/bleu_4.json
deleted file mode 100644
index 37cdb4c70..000000000
--- a/tests/unit/metrics/test_cases/bleu_4.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4e2a2b2381d1d3c0184c11c22c97028313c178a2f94dd58059866695b77c7eac
-size 3432
diff --git a/tests/unit/metrics/test_cases/bleurt.json b/tests/unit/metrics/test_cases/bleurt.json
deleted file mode 100644
index 8774db6bf..000000000
--- a/tests/unit/metrics/test_cases/bleurt.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:408bb775a6c12744227254d3f1a7511aee9cbfe2160acd23d79dfeca094d1856
-size 1864
diff --git a/tests/unit/metrics/test_cases/byte_perplexity.json b/tests/unit/metrics/test_cases/byte_perplexity.json
deleted file mode 100644
index 88419852d..000000000
--- a/tests/unit/metrics/test_cases/byte_perplexity.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4116e450910250997b6a24b4e51149a88cd0f29da2c6a160d9a4e3a05de8b830
-size 968
diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json
deleted file mode 100644
index d250f2f2b..000000000
--- a/tests/unit/metrics/test_cases/chrf.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:14e677f08edfb5075319e10a70756ee1da9a9d6a850fdfb36798aaeb641077c4
-size 5653
diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json
deleted file mode 100644
index caa14fb1d..000000000
--- a/tests/unit/metrics/test_cases/chrf_plus.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e1abfc1c9a2c74215af46cedce6183e9cf519347121f435c9a6706bac70d9d3d
-size 4564
diff --git a/tests/unit/metrics/test_cases/copyright.json b/tests/unit/metrics/test_cases/copyright.json
deleted file mode 100644
index 6459816c6..000000000
--- a/tests/unit/metrics/test_cases/copyright.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:286a7519ab83375e6d8ccf2264fbc55266260d08c7cb88dfca897b598f74b22d
-size 1994
diff --git a/tests/unit/metrics/test_cases/drop.json b/tests/unit/metrics/test_cases/drop.json
deleted file mode 100644
index e87bf89b0..000000000
--- a/tests/unit/metrics/test_cases/drop.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:675c6cc4313bb41e8a8d27253dcffde62a25fe659ef8e7b762e26ca667c58851
-size 1714
diff --git a/tests/unit/metrics/test_cases/exact_match.json b/tests/unit/metrics/test_cases/exact_match.json
deleted file mode 100644
index 8f028902b..000000000
--- a/tests/unit/metrics/test_cases/exact_match.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:710acbfe499fbe88f152b50efaef99c091813fb529b67dcd602007ea277c3060
-size 1223
diff --git a/tests/unit/metrics/test_cases/expr_gold_metric.json b/tests/unit/metrics/test_cases/expr_gold_metric.json
deleted file mode 100644
index 5e360ad51..000000000
--- a/tests/unit/metrics/test_cases/expr_gold_metric.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ae16455625d67590bdf24fdb28b91684f732952db8110d53145b16295d5883fd
-size 975
diff --git a/tests/unit/metrics/test_cases/extractiveness.json b/tests/unit/metrics/test_cases/extractiveness.json
deleted file mode 100644
index da6232b39..000000000
--- a/tests/unit/metrics/test_cases/extractiveness.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c7357863b5a005819fff204ae0a67287635c2598d2c3948cece0a41c23a1066d
-size 2451
diff --git a/tests/unit/metrics/test_cases/f1_score.json b/tests/unit/metrics/test_cases/f1_score.json
deleted file mode 100644
index 2f1a78e15..000000000
--- a/tests/unit/metrics/test_cases/f1_score.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a141b848bb169c28764742219f077aea9fc60bc6a209ee9b043b8c2614add34b
-size 4358
diff --git a/tests/unit/metrics/test_cases/f1_score_macro.json b/tests/unit/metrics/test_cases/f1_score_macro.json
deleted file mode 100644
index 3bfe7b48d..000000000
--- a/tests/unit/metrics/test_cases/f1_score_macro.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:16afb1546b7c1d3a45f4e14aea9c537b1249fa6b9281f4550d0e1d858a41eae2
-size 4433
diff --git a/tests/unit/metrics/test_cases/f1_score_micro.json b/tests/unit/metrics/test_cases/f1_score_micro.json
deleted file mode 100644
index 0816a25a0..000000000
--- a/tests/unit/metrics/test_cases/f1_score_micro.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8c7f8820db3a770299e494ebc051c4892eadcc17c97ffe7e2947299611b1eea2
-size 4435
diff --git a/tests/unit/metrics/test_cases/faithfulness.json b/tests/unit/metrics/test_cases/faithfulness.json
deleted file mode 100644
index a86f256e7..000000000
--- a/tests/unit/metrics/test_cases/faithfulness.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2e98307b93588bce80ac28f1614f432e31a1417abc72d169838b8818650d4f30
-size 2848
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k.json b/tests/unit/metrics/test_cases/g_pass_at_k.json
deleted file mode 100644
index d8f3870be..000000000
--- a/tests/unit/metrics/test_cases/g_pass_at_k.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3fba8477eaa1cb5efb54d0afb1f5cddb528a1086c15cac79dc6f16fea0012abc
-size 9368
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_latex.json b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
deleted file mode 100644
index 2491e9e3e..000000000
--- a/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:687a25df0c903d98d3fabb433552d69c30630dc634f8f9f1582e641eacf60faa
-size 6911
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_math.json b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
deleted file mode 100644
index 97f9aca37..000000000
--- a/tests/unit/metrics/test_cases/g_pass_at_k_math.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:33f317039e4adf1ac7a44ac2a94b7e8f37095161ab496c51732e9521bfcd551c
-size 9907
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
deleted file mode 100644
index d70b9dd59..000000000
--- a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b574a7e5f16a3291f0154f71f929b0f59d896e9d0747f210885ac18d6febb464
-size 19623
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
deleted file mode 100644
index 27de62abc..000000000
--- a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9b82a383f67eb0d6ef1fe0c35c3d9e17acf1956efe03590015d9882283372ae6
-size 8648
diff --git a/tests/unit/metrics/test_cases/loglikelihood_acc.json b/tests/unit/metrics/test_cases/loglikelihood_acc.json
deleted file mode 100644
index eaa8fb6e2..000000000
--- a/tests/unit/metrics/test_cases/loglikelihood_acc.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a00ac480425c5b37efb69b5a01d87542dfa96fffeb82d01fda8a7006a66603fb
-size 8133
diff --git a/tests/unit/metrics/test_cases/loglikelihood_f1.json b/tests/unit/metrics/test_cases/loglikelihood_f1.json
deleted file mode 100644
index 2ccd76b0f..000000000
--- a/tests/unit/metrics/test_cases/loglikelihood_f1.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:44675eaa9844cac9e4f71b8b825f114626649d56c46ed14e77f253ab426ef5d1
-size 8828
diff --git a/tests/unit/metrics/test_cases/maj_at_k.json b/tests/unit/metrics/test_cases/maj_at_k.json
deleted file mode 100644
index 9f8cae279..000000000
--- a/tests/unit/metrics/test_cases/maj_at_k.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4f18b15293b933ded1d24cf5aac842eab03c3604d00b0bb45ed96956a83355c1
-size 2227
diff --git a/tests/unit/metrics/test_cases/mcc.json b/tests/unit/metrics/test_cases/mcc.json
deleted file mode 100644
index d3e983260..000000000
--- a/tests/unit/metrics/test_cases/mcc.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8a788e8bdaed81f8fe63081297b60986ad101b4bd2c6681cef850da64b532a17
-size 1227
diff --git a/tests/unit/metrics/test_cases/mrr.json b/tests/unit/metrics/test_cases/mrr.json
deleted file mode 100644
index 3c5ffd306..000000000
--- a/tests/unit/metrics/test_cases/mrr.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a79c93f65e5c6e419125efaceea598b3e500fb01e7cfa0b57f09f0831f1e140f
-size 2386
diff --git a/tests/unit/metrics/test_cases/multi_f1_numeric.json b/tests/unit/metrics/test_cases/multi_f1_numeric.json
deleted file mode 100644
index 596f700f8..000000000
--- a/tests/unit/metrics/test_cases/multi_f1_numeric.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f3c67192247f89487d12384b15c95bd4a64ec2cbcf882ad00339c99754b3b794
-size 4955
diff --git a/tests/unit/metrics/test_cases/pass_at_k.json b/tests/unit/metrics/test_cases/pass_at_k.json
deleted file mode 100644
index 1b67789ca..000000000
--- a/tests/unit/metrics/test_cases/pass_at_k.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a9110dc53c847bc95648b270d3c5622967884ae9cd398c0e75268424fc2d26eb
-size 1905
diff --git a/tests/unit/metrics/test_cases/pass_at_k_letters.json b/tests/unit/metrics/test_cases/pass_at_k_letters.json
deleted file mode 100644
index 50e4ed073..000000000
--- a/tests/unit/metrics/test_cases/pass_at_k_letters.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d7f9b2aefb62a7b04440759a21323605df76ed30eff9cc99a62f9dc5f667bacc
-size 1878
diff --git a/tests/unit/metrics/test_cases/pass_at_k_math.json b/tests/unit/metrics/test_cases/pass_at_k_math.json
deleted file mode 100644
index 91db182a6..000000000
--- a/tests/unit/metrics/test_cases/pass_at_k_math.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:330bb04632ce82da1bbfcf57bbb9ff5d36bfe0dc1c0d298706a8a0a24786c420
-size 1633
diff --git a/tests/unit/metrics/test_cases/prediction_perplexity.json b/tests/unit/metrics/test_cases/prediction_perplexity.json
deleted file mode 100644
index 3afd599e2..000000000
--- a/tests/unit/metrics/test_cases/prediction_perplexity.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6772f57e5e8e144a4c24049441c127fce4daded47081327ae064c6613f94779e
-size 992
diff --git a/tests/unit/metrics/test_cases/recall_at_k.json b/tests/unit/metrics/test_cases/recall_at_k.json
deleted file mode 100644
index b41ef29ba..000000000
--- a/tests/unit/metrics/test_cases/recall_at_k.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8a786b6a64057501d3d65bb251709595fd1c982e1f533ed12ac968da8c61522e
-size 1977
diff --git a/tests/unit/metrics/test_cases/rouge1.json b/tests/unit/metrics/test_cases/rouge1.json
deleted file mode 100644
index 92d7f945d..000000000
--- a/tests/unit/metrics/test_cases/rouge1.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:201cc4f2c59de282b3cc9ccac2dfbb080cb17ccda6c89fa497d4d1e7a1e44052
-size 689
diff --git a/tests/unit/metrics/test_cases/rouge2.json b/tests/unit/metrics/test_cases/rouge2.json
deleted file mode 100644
index a53038b33..000000000
--- a/tests/unit/metrics/test_cases/rouge2.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:553b4de4f3568fe3907dd067d19c8bbce0004972da9841e010ecf2c05db67fc7
-size 1881
diff --git a/tests/unit/metrics/test_cases/rougeL.json b/tests/unit/metrics/test_cases/rougeL.json
deleted file mode 100644
index b3c3e8883..000000000
--- a/tests/unit/metrics/test_cases/rougeL.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b2b219b759e1d3aae2da9c885edb11a55e5e55e38589865894d2498aca4534dd
-size 1877
diff --git a/tests/unit/metrics/test_cases/rougeLsum.json b/tests/unit/metrics/test_cases/rougeLsum.json
deleted file mode 100644
index 8b7f00302..000000000
--- a/tests/unit/metrics/test_cases/rougeLsum.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:32f6d4f7261fee58c3da493b6156bf001afa6d501bdfdcf8fcb33169542f8aa8
-size 1958
diff --git a/tests/unit/metrics/test_cases/rouge_t5.json b/tests/unit/metrics/test_cases/rouge_t5.json
deleted file mode 100644
index 49d2aa56c..000000000
--- a/tests/unit/metrics/test_cases/rouge_t5.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9792b0ef28716f36663975024a84cfb15284a17e2f5a6648363a6284697e0ad3
-size 2208
diff --git a/tests/unit/metrics/test_cases/simpleqa_judge.json b/tests/unit/metrics/test_cases/simpleqa_judge.json
deleted file mode 100644
index e9b3b9aaa..000000000
--- a/tests/unit/metrics/test_cases/simpleqa_judge.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4a64b4778c6c7f8b4a69aaf7eb269b156292eb24fff1a737266dadfb4e04a33a
-size 730
diff --git a/tests/unit/metrics/test_cases/target_perplexity.json b/tests/unit/metrics/test_cases/target_perplexity.json
deleted file mode 100644
index f4c859650..000000000
--- a/tests/unit/metrics/test_cases/target_perplexity.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d4176078edb4639416286ca6f12d0b2903f3f232f8d1b7374becbe1da88a52ce
-size 2913
diff --git a/tests/unit/metrics/test_cases/ter.json b/tests/unit/metrics/test_cases/ter.json
deleted file mode 100644
index 724103bfa..000000000
--- a/tests/unit/metrics/test_cases/ter.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb94c167efc2fa8da3c58ae0552cbfb87b4cced5bb7474e1d1b7965680fc4d3d
-size 4733
diff --git a/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
deleted file mode 100644
index 78507add7..000000000
--- a/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f91a5be1cd5cb437c35632184a8152f8c44e95001c364b27477e3c6015b949e7
-size 2424
diff --git a/tests/unit/metrics/test_cases/word_perplexity.json b/tests/unit/metrics/test_cases/word_perplexity.json
deleted file mode 100644
index 4aa518a0b..000000000
--- a/tests/unit/metrics/test_cases/word_perplexity.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c6c97b916e429463d07d9e8680e392ee757b409c614e758047599b133119bd1c
-size 3421
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index c2a937e20..c705e672a 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -31,11 +31,11 @@
 import copy
 import json
 import logging
-import math
 from dataclasses import field
 from pathlib import Path
 from typing import Any
 
+import pytest
 from pydantic import BaseModel
 
 from lighteval.metrics.metrics import Metrics
@@ -254,12 +254,8 @@ def run_test_case(self, test_case: MetricTestCase | CorpusLevelMetricTestCase) -
     def _compare_scalar_outputs(self, actual: Any, expected: Any, tolerance: float) -> bool:
         """Compare scalar outputs with tolerance."""
         if isinstance(actual, (int, float)) and isinstance(expected, (int, float)):
-            # For small values, use absolute tolerance only to avoid relative tolerance issues
-            # For values >= 1.0, we can use relative tolerance
-            if abs(expected) < 1.0:
-                return math.isclose(actual, expected, abs_tol=tolerance)
-            else:
-                return math.isclose(actual, expected, rel_tol=tolerance, abs_tol=tolerance)
+            # Use pytest.approx for float comparison
+            return actual == pytest.approx(expected, abs=tolerance)
         return actual == expected
 
     def _compare_dict_outputs(self, actual: Any, expected: Any, tolerance: float) -> bool:

From ef216dcbf1455950f32d321209c6ef7f5b30953f Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 15 Sep 2025 12:58:39 +0000
Subject: [PATCH 20/26] re-add test-files to actual repo

---
 .gitattributes                                |   1 +
 .../test_cases/acc_golds_likelihood.json      |  44 ++
 tests/unit/metrics/test_cases/avg_at_k.json   |  63 +++
 .../metrics/test_cases/avg_at_k_math.json     |  63 +++
 tests/unit/metrics/test_cases/bert_score.json |  47 ++
 .../metrics/test_cases/bits_per_byte.json     |  47 ++
 tests/unit/metrics/test_cases/bleu.json       | 167 +++++++
 tests/unit/metrics/test_cases/bleu_1.json     | 101 ++++
 tests/unit/metrics/test_cases/bleu_4.json     | 120 +++++
 tests/unit/metrics/test_cases/bleurt.json     |  69 +++
 .../metrics/test_cases/byte_perplexity.json   |  47 ++
 tests/unit/metrics/test_cases/chrf.json       | 207 ++++++++
 tests/unit/metrics/test_cases/chrf_plus.json  | 167 +++++++
 tests/unit/metrics/test_cases/copyright.json  |  69 +++
 tests/unit/metrics/test_cases/drop.json       |  75 +++
 .../unit/metrics/test_cases/exact_match.json  |  48 ++
 .../metrics/test_cases/expr_gold_metric.json  |  47 ++
 .../metrics/test_cases/extractiveness.json    |  78 +++
 tests/unit/metrics/test_cases/f1_score.json   | 153 ++++++
 .../metrics/test_cases/f1_score_macro.json    | 167 +++++++
 .../metrics/test_cases/f1_score_micro.json    | 167 +++++++
 .../unit/metrics/test_cases/faithfulness.json |  90 ++++
 .../unit/metrics/test_cases/g_pass_at_k.json  | 316 +++++++++++++
 .../metrics/test_cases/g_pass_at_k_latex.json | 223 +++++++++
 .../metrics/test_cases/g_pass_at_k_math.json  | 347 ++++++++++++++
 .../test_cases/gpqa_instruct_metric.json      | 447 ++++++++++++++++++
 .../test_cases/gpqa_instruct_pass_at_k.json   | 281 +++++++++++
 .../metrics/test_cases/loglikelihood_acc.json | 266 +++++++++++
 .../metrics/test_cases/loglikelihood_f1.json  | 286 +++++++++++
 tests/unit/metrics/test_cases/maj_at_k.json   |  82 ++++
 tests/unit/metrics/test_cases/mcc.json        |  47 ++
 tests/unit/metrics/test_cases/mrr.json        |  90 ++++
 .../metrics/test_cases/multi_f1_numeric.json  | 167 +++++++
 tests/unit/metrics/test_cases/pass_at_k.json  |  69 +++
 .../metrics/test_cases/pass_at_k_letters.json |  69 +++
 .../metrics/test_cases/pass_at_k_math.json    |  63 +++
 .../test_cases/prediction_perplexity.json     |  47 ++
 .../unit/metrics/test_cases/recall_at_k.json  |  69 +++
 tests/unit/metrics/test_cases/rouge1.json     |  28 ++
 tests/unit/metrics/test_cases/rouge2.json     |  69 +++
 tests/unit/metrics/test_cases/rougeL.json     |  69 +++
 tests/unit/metrics/test_cases/rougeLsum.json  |  69 +++
 tests/unit/metrics/test_cases/rouge_t5.json   |  78 +++
 .../metrics/test_cases/simpleqa_judge.json    |  31 ++
 .../metrics/test_cases/target_perplexity.json | 101 ++++
 tests/unit/metrics/test_cases/ter.json        | 167 +++++++
 .../test_cases/truthfulqa_mc_metrics.json     |  81 ++++
 .../metrics/test_cases/word_perplexity.json   | 127 +++++
 48 files changed, 5726 insertions(+)
 create mode 100644 tests/unit/metrics/test_cases/acc_golds_likelihood.json
 create mode 100644 tests/unit/metrics/test_cases/avg_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/avg_at_k_math.json
 create mode 100644 tests/unit/metrics/test_cases/bert_score.json
 create mode 100644 tests/unit/metrics/test_cases/bits_per_byte.json
 create mode 100644 tests/unit/metrics/test_cases/bleu.json
 create mode 100644 tests/unit/metrics/test_cases/bleu_1.json
 create mode 100644 tests/unit/metrics/test_cases/bleu_4.json
 create mode 100644 tests/unit/metrics/test_cases/bleurt.json
 create mode 100644 tests/unit/metrics/test_cases/byte_perplexity.json
 create mode 100644 tests/unit/metrics/test_cases/chrf.json
 create mode 100644 tests/unit/metrics/test_cases/chrf_plus.json
 create mode 100644 tests/unit/metrics/test_cases/copyright.json
 create mode 100644 tests/unit/metrics/test_cases/drop.json
 create mode 100644 tests/unit/metrics/test_cases/exact_match.json
 create mode 100644 tests/unit/metrics/test_cases/expr_gold_metric.json
 create mode 100644 tests/unit/metrics/test_cases/extractiveness.json
 create mode 100644 tests/unit/metrics/test_cases/f1_score.json
 create mode 100644 tests/unit/metrics/test_cases/f1_score_macro.json
 create mode 100644 tests/unit/metrics/test_cases/f1_score_micro.json
 create mode 100644 tests/unit/metrics/test_cases/faithfulness.json
 create mode 100644 tests/unit/metrics/test_cases/g_pass_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/g_pass_at_k_latex.json
 create mode 100644 tests/unit/metrics/test_cases/g_pass_at_k_math.json
 create mode 100644 tests/unit/metrics/test_cases/gpqa_instruct_metric.json
 create mode 100644 tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/loglikelihood_acc.json
 create mode 100644 tests/unit/metrics/test_cases/loglikelihood_f1.json
 create mode 100644 tests/unit/metrics/test_cases/maj_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/mcc.json
 create mode 100644 tests/unit/metrics/test_cases/mrr.json
 create mode 100644 tests/unit/metrics/test_cases/multi_f1_numeric.json
 create mode 100644 tests/unit/metrics/test_cases/pass_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/pass_at_k_letters.json
 create mode 100644 tests/unit/metrics/test_cases/pass_at_k_math.json
 create mode 100644 tests/unit/metrics/test_cases/prediction_perplexity.json
 create mode 100644 tests/unit/metrics/test_cases/recall_at_k.json
 create mode 100644 tests/unit/metrics/test_cases/rouge1.json
 create mode 100644 tests/unit/metrics/test_cases/rouge2.json
 create mode 100644 tests/unit/metrics/test_cases/rougeL.json
 create mode 100644 tests/unit/metrics/test_cases/rougeLsum.json
 create mode 100644 tests/unit/metrics/test_cases/rouge_t5.json
 create mode 100644 tests/unit/metrics/test_cases/simpleqa_judge.json
 create mode 100644 tests/unit/metrics/test_cases/target_perplexity.json
 create mode 100644 tests/unit/metrics/test_cases/ter.json
 create mode 100644 tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
 create mode 100644 tests/unit/metrics/test_cases/word_perplexity.json

diff --git a/.gitattributes b/.gitattributes
index 7fe70d7f0..0e12e71de 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 *.json filter=lfs diff=lfs merge=lfs -text
+tests/unit/metrics/test_cases/*.json -filter -diff -merge text
diff --git a/tests/unit/metrics/test_cases/acc_golds_likelihood.json b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
new file mode 100644
index 000000000..90a37d8cf
--- /dev/null
+++ b/tests/unit/metrics/test_cases/acc_golds_likelihood.json
@@ -0,0 +1,44 @@
+{
+  "name": "Acc Golds Likelihood Test Suite",
+  "description": "Test cases for acc_golds_likelihood metric",
+  "test_cases": [
+    {
+      "name": "Acc Golds Likelihood - Correct Likelihood",
+      "metric_class": "acc_golds_likelihood",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["Paris", "London", "Berlin"],
+        "gold_index": 0,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "argmax_logits_eq_gold": [1, 0, 0]
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test acc golds likelihood with correct likelihood"
+    },
+    {
+      "name": "Acc Golds Likelihood - Incorrect Likelihood",
+      "metric_class": "acc_golds_likelihood",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["Paris", "London", "Berlin"],
+        "gold_index": 0,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "argmax_logits_eq_gold": [0, 0, 0]
+      },
+      "expected_output": {
+        "acc": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test acc golds likelihood with incorrect likelihood"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/avg_at_k.json b/tests/unit/metrics/test_cases/avg_at_k.json
new file mode 100644
index 000000000..882a6fa4d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/avg_at_k.json
@@ -0,0 +1,63 @@
+{
+  "name": "Avg At K Test Suite",
+  "description": "Test cases for avg_at_k metric",
+  "test_cases": [
+    {
+      "name": "Avg at K - Correct in Top K",
+      "metric_class": "avg_at_k",
+      "metric_params": {"k": 2},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "London", "Berlin"]
+      },
+      "expected_output": {
+        "avg@k_with_k": 0.5
+      },
+      "tolerance": 0.01,
+      "description": "Test avg at k with correct answer in top k"
+    },
+    {
+      "name": "Avg at K - Not in Top K",
+      "metric_class": "avg_at_k",
+      "metric_params": {"k": 1},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London", "Berlin", "Paris"]
+      },
+      "expected_output": {
+        "avg@k_with_k": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test avg at k with correct answer not in top k"
+    },
+    {
+      "name": "Avg at K - Multiple Correct",
+      "metric_class": "avg_at_k",
+      "metric_params": {"k": 3},
+      "doc": {
+        "query": "Which are European capitals?",
+        "choices": ["London", "Paris", "Tokyo", "Berlin"],
+        "gold_index": [0, 1, 3],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "London", "Berlin", "Tokyo"]
+      },
+      "expected_output": {
+        "avg@k_with_k": 0.33
+      },
+      "tolerance": 0.01,
+      "description": "Test avg at k with multiple correct answers"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/avg_at_k_math.json b/tests/unit/metrics/test_cases/avg_at_k_math.json
new file mode 100644
index 000000000..0dd2e4dd3
--- /dev/null
+++ b/tests/unit/metrics/test_cases/avg_at_k_math.json
@@ -0,0 +1,63 @@
+{
+  "name": "Avg At K Math Test Suite",
+  "description": "Test cases for avg_at_k_math metric",
+  "test_cases": [
+    {
+      "name": "Avg at K Math - Correct Math",
+      "metric_class": "avg_at_k_math",
+      "metric_params": {"k": 1},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "choices": ["4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["4"]
+      },
+      "expected_output": {
+        "avg@k_with_k": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test avg at k math with correct math answer"
+    },
+    {
+      "name": "Avg at K Math - Wrong Math",
+      "metric_class": "avg_at_k_math",
+      "metric_params": {"k": 1},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "choices": ["4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["5"]
+      },
+      "expected_output": {
+        "avg@k_with_k": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test avg at k math with wrong math answer"
+    },
+    {
+      "name": "Avg at K Math - Multiple Attempts",
+      "metric_class": "avg_at_k_math",
+      "metric_params": {"k": 2},
+      "doc": {
+        "query": "What is 3 * 4?",
+        "choices": ["12"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["12", "15"]
+      },
+      "expected_output": {
+        "avg@k_with_k": 0.5
+      },
+      "tolerance": 0.01,
+      "description": "Test avg at k math with multiple attempts"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/bert_score.json b/tests/unit/metrics/test_cases/bert_score.json
new file mode 100644
index 000000000..13cda7625
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bert_score.json
@@ -0,0 +1,47 @@
+{
+  "name": "Bert Score Test Suite",
+  "description": "Test cases for bert_score metric",
+  "test_cases": [
+    {
+      "name": "Bert Score - Basic Test",
+      "metric_class": "bert_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Test query for bert_score",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ],
+        "logprobs": [
+          0.5,
+          0.3,
+          0.2
+        ],
+        "output_tokens": [
+          [
+            1
+          ],
+          [
+            2
+          ],
+          [
+            3
+          ]
+        ]
+      },
+      "expected_output": {
+        "result": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for bert_score metric"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/bits_per_byte.json b/tests/unit/metrics/test_cases/bits_per_byte.json
new file mode 100644
index 000000000..8470678fa
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bits_per_byte.json
@@ -0,0 +1,47 @@
+{
+  "name": "Bits Per Byte Test Suite",
+  "description": "Test cases for bits_per_byte metric",
+  "test_cases": [
+    {
+      "name": "Bits Per Byte - Basic Test",
+      "metric_class": "bits_per_byte",
+      "metric_params": {},
+      "doc": {
+        "query": "Test query for bits_per_byte",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ],
+        "logprobs": [
+          0.5,
+          0.3,
+          0.2
+        ],
+        "output_tokens": [
+          [
+            1
+          ],
+          [
+            2
+          ],
+          [
+            3
+          ]
+        ]
+      },
+      "expected_output": {
+        "bits_per_byte": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for bits_per_byte metric"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/bleu.json b/tests/unit/metrics/test_cases/bleu.json
new file mode 100644
index 000000000..7171fba7a
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleu.json
@@ -0,0 +1,167 @@
+{
+  "name": "BLEU Test Suite",
+  "description": "Test cases for bleu metric (corpus-level BLEU)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "BLEU - Perfect Translations",
+      "metric_class": "bleu",
+      "metric_params": {},
+      "metric_name": "bleu",
+      "docs": [
+        {
+          "query": "Translate to French: Hello world",
+          "choices": ["Bonjour le monde"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Good morning",
+          "choices": ["Buenos días"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Thank you",
+          "choices": ["Danke schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Bonjour le monde"]
+        },
+        {
+          "text": ["Buenos días"]
+        },
+        {
+          "text": ["Danke schön"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.01,
+      "description": "Perfect translations - exact word overlap (BLEU = 100.0)"
+    },
+    {
+      "name": "BLEU - High Similarity",
+      "metric_class": "bleu",
+      "metric_params": {},
+      "metric_name": "bleu",
+      "docs": [
+        {
+          "query": "Translate to French: The cat is sleeping",
+          "choices": ["Le chat dort"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: I like pizza",
+          "choices": ["Me gusta la pizza"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The weather is nice",
+          "choices": ["Das Wetter ist schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le chat dort"]
+        },
+        {
+          "text": ["Me gusta pizza"]
+        },
+        {
+          "text": ["Das Wetter ist schön"]
+        }
+      ],
+      "expected_output": 85.0,
+      "tolerance": 5.0,
+      "description": "High similarity - minor word differences (BLEU ≈ 85.0)"
+    },
+    {
+      "name": "BLEU - Moderate Similarity",
+      "metric_class": "bleu",
+      "metric_params": {},
+      "metric_name": "bleu",
+      "docs": [
+        {
+          "query": "Translate to French: The quick brown fox",
+          "choices": ["Le renard brun rapide"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Artificial intelligence",
+          "choices": ["La inteligencia artificial"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Machine learning",
+          "choices": ["Maschinelles Lernen"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le renard rapide"]
+        },
+        {
+          "text": ["La IA"]
+        },
+        {
+          "text": ["ML"]
+        }
+      ],
+      "expected_output": 45.0,
+      "tolerance": 10.0,
+      "description": "Moderate similarity - significant word omissions (BLEU ≈ 45.0)"
+    },
+    {
+      "name": "BLEU - Low Similarity",
+      "metric_class": "bleu",
+      "metric_params": {},
+      "metric_name": "bleu",
+      "docs": [
+        {
+          "query": "Translate to French: The sun is bright",
+          "choices": ["Le soleil est brillant"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: The moon is full",
+          "choices": ["La luna está llena"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The stars are beautiful",
+          "choices": ["Die Sterne sind wunderschön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Il pleut"]
+        },
+        {
+          "text": ["Hace frío"]
+        },
+        {
+          "text": ["Es heiß"]
+        }
+      ],
+      "expected_output": 15.0,
+      "tolerance": 10.0,
+      "description": "Low similarity - minimal word overlap (BLEU ≈ 15.0)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/bleu_1.json b/tests/unit/metrics/test_cases/bleu_1.json
new file mode 100644
index 000000000..05dd676af
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleu_1.json
@@ -0,0 +1,101 @@
+{
+  "name": "BLEU-1 Test Suite",
+  "description": "Test cases for bleu_1 metric (sample-level BLEU-1 with 1-gram overlap)",
+  "test_cases": [
+    {
+      "name": "BLEU-1 - Perfect Match",
+      "metric_class": "bleu_1",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: Hello world",
+        "choices": ["Bonjour le monde"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Bonjour le monde"]
+      },
+      "expected_output": {
+        "bleu_1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Perfect match - exact 1-gram overlap (BLEU-1 = 1.0)"
+    },
+    {
+      "name": "BLEU-1 - High Similarity",
+      "metric_class": "bleu_1",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The cat is sleeping",
+        "choices": ["Le chat dort"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le chat dort"]
+      },
+      "expected_output": {
+        "bleu_1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "High similarity - exact 1-gram match (BLEU-1 = 1.0)"
+    },
+    {
+      "name": "BLEU-1 - Partial Match",
+      "metric_class": "bleu_1",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The quick brown fox",
+        "choices": ["Le renard brun rapide"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le renard rapide"]
+      },
+      "expected_output": {
+        "bleu_1": 0.75
+      },
+      "tolerance": 0.1,
+      "description": "Partial match - 3 out of 4 words match (BLEU-1 = 0.75)"
+    },
+    {
+      "name": "BLEU-1 - Low Similarity",
+      "metric_class": "bleu_1",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The sun is bright",
+        "choices": ["Le soleil est brillant"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Il pleut"]
+      },
+      "expected_output": {
+        "bleu_1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Low similarity - no 1-gram overlap (BLEU-1 = 0.0)"
+    },
+    {
+      "name": "BLEU-1 - Word Order Change",
+      "metric_class": "bleu_1",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The weather is nice",
+        "choices": ["Le temps est agréable"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le agréable temps est"]
+      },
+      "expected_output": {
+        "bleu_1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Word order change - same 1-grams, different order (BLEU-1 = 1.0)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/bleu_4.json b/tests/unit/metrics/test_cases/bleu_4.json
new file mode 100644
index 000000000..e6e8d2814
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleu_4.json
@@ -0,0 +1,120 @@
+{
+  "name": "BLEU-4 Test Suite",
+  "description": "Test cases for bleu_4 metric (sample-level BLEU-4 with 4-gram overlap)",
+  "test_cases": [
+    {
+      "name": "BLEU-4 - Perfect Match",
+      "metric_class": "bleu_4",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The quick brown fox jumps",
+        "choices": ["Le renard brun rapide saute"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le renard brun rapide saute"]
+      },
+      "expected_output": {
+        "bleu_4": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Perfect match - exact 4-gram overlap (BLEU-4 = 1.0)"
+    },
+    {
+      "name": "BLEU-4 - High Similarity",
+      "metric_class": "bleu_4",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The cat is sleeping now",
+        "choices": ["Le chat dort maintenant"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le chat dort maintenant"]
+      },
+      "expected_output": {
+        "bleu_4": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "High similarity - exact 4-gram match (BLEU-4 = 1.0)"
+    },
+    {
+      "name": "BLEU-4 - Partial Match",
+      "metric_class": "bleu_4",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The weather is very nice",
+        "choices": ["Le temps est très agréable"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le temps est agréable"]
+      },
+      "expected_output": {
+        "bleu_4": 0.0
+      },
+      "tolerance": 0.1,
+      "description": "Partial match - some 4-grams match (BLEU-4 = 0.6)"
+    },
+    {
+      "name": "BLEU-4 - Low Similarity",
+      "metric_class": "bleu_4",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The sun is bright today",
+        "choices": ["Le soleil est brillant aujourd'hui"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Il pleut beaucoup"]
+      },
+      "expected_output": {
+        "bleu_4": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Low similarity - no 4-gram overlap (BLEU-4 = 0.0)"
+    },
+    {
+      "name": "BLEU-4 - Word Order Change",
+      "metric_class": "bleu_4",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: The weather is nice today",
+        "choices": ["Le temps est agréable aujourd'hui"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Le agréable temps est aujourd'hui"]
+      },
+      "expected_output": {
+        "bleu_4": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Word order change - no 4-gram matches (BLEU-4 = 0.0)"
+    },
+    {
+      "name": "BLEU-4 - Short Text",
+      "metric_class": "bleu_4",
+      "metric_params": {},
+      "doc": {
+        "query": "Translate to French: Hello",
+        "choices": ["Bonjour"],
+        "gold_index": 0,
+        "task_name": "translation"
+      },
+      "model_response": {
+        "text": ["Bonjour"]
+      },
+      "expected_output": {
+        "bleu_4": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Short text - single word, BLEU-4 defaults to BLEU-1 (BLEU-4 = 1.0)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/bleurt.json b/tests/unit/metrics/test_cases/bleurt.json
new file mode 100644
index 000000000..7891b2aec
--- /dev/null
+++ b/tests/unit/metrics/test_cases/bleurt.json
@@ -0,0 +1,69 @@
+{
+  "name": "Bleurt Test Suite",
+  "description": "Test cases for bleurt metric",
+  "test_cases": [
+    {
+      "name": "BLEURT - Perfect Match",
+      "metric_class": "bleurt",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "bleurt": 0.82
+      },
+      "tolerance": 0.1,
+      "description": "Test BLEURT with perfect match"
+    },
+    {
+      "name": "BLEURT - Partial Match",
+      "metric_class": "bleurt",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "bleurt": -0.14
+      },
+      "tolerance": 0.2,
+      "description": "Test BLEURT with partial match"
+    },
+    {
+      "name": "BLEURT - Different Content",
+      "metric_class": "bleurt",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["A cat sleeps on the mat"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "bleurt": -1.11
+      },
+      "tolerance": 0.2,
+      "description": "Test BLEURT with completely different content"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/byte_perplexity.json b/tests/unit/metrics/test_cases/byte_perplexity.json
new file mode 100644
index 000000000..ef76f6bb7
--- /dev/null
+++ b/tests/unit/metrics/test_cases/byte_perplexity.json
@@ -0,0 +1,47 @@
+{
+  "name": "Byte Perplexity Test Suite",
+  "description": "Test cases for byte_perplexity metric",
+  "test_cases": [
+    {
+      "name": "Byte Perplexity - Basic Test",
+      "metric_class": "byte_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "Test query for byte_perplexity",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ],
+        "logprobs": [
+          0.5,
+          0.3,
+          0.2
+        ],
+        "output_tokens": [
+          [
+            1
+          ],
+          [
+            2
+          ],
+          [
+            3
+          ]
+        ]
+      },
+      "expected_output": {
+        "byte_perplexity": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for byte_perplexity metric"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json
new file mode 100644
index 000000000..15f7b8c15
--- /dev/null
+++ b/tests/unit/metrics/test_cases/chrf.json
@@ -0,0 +1,207 @@
+{
+  "name": "CHRF Test Suite",
+  "description": "Test cases for chrf metric (corpus-level CHRF without word order)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "CHRF - Perfect Matches",
+      "metric_class": "chrf",
+      "metric_params": {},
+      "metric_name": "chrf",
+      "docs": [
+        {
+          "query": "Translate to French: Hello world",
+          "choices": ["Bonjour le monde"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Good morning",
+          "choices": ["Buenos días"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Thank you",
+          "choices": ["Danke schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Bonjour le monde"]
+        },
+        {
+          "text": ["Buenos días"]
+        },
+        {
+          "text": ["Danke schön"]
+        }
+      ],
+      "expected_output": 100.0,
+      "tolerance": 0.01,
+      "description": "Perfect matches - exact character overlap (CHRF = 100.0)"
+    },
+    {
+      "name": "CHRF - High Similarity",
+      "metric_class": "chrf",
+      "metric_params": {},
+      "metric_name": "chrf",
+      "docs": [
+        {
+          "query": "Translate to French: The cat is sleeping",
+          "choices": ["Le chat dort"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: I like pizza",
+          "choices": ["Me gusta la pizza"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The weather is nice",
+          "choices": ["Das Wetter ist schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le chat dort"]
+        },
+        {
+          "text": ["Me gusta pizza"]
+        },
+        {
+          "text": ["Das Wetter ist schön"]
+        }
+      ],
+      "expected_output": 88.0,
+      "tolerance": 5.0,
+      "description": "High similarity - minor character differences (CHRF ≈ 88.0)"
+    },
+    {
+      "name": "CHRF - Word Order Changes",
+      "metric_class": "chrf",
+      "metric_params": {},
+      "metric_name": "chrf",
+      "docs": [
+        {
+          "query": "Translate to French: The quick brown fox",
+          "choices": ["Le renard brun rapide"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Artificial intelligence",
+          "choices": ["La inteligencia artificial"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Machine learning",
+          "choices": ["Maschinelles Lernen"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le rapide renard brun"]
+        },
+        {
+          "text": ["La artificial inteligencia"]
+        },
+        {
+          "text": ["Lernen Maschinelles"]
+        }
+      ],
+      "expected_output": 75.0,
+      "tolerance": 10.0,
+      "description": "Word order changes - same characters, different order (CHRF ≈ 75.0)"
+    },
+    {
+      "name": "CHRF - Moderate Similarity",
+      "metric_class": "chrf",
+      "metric_params": {},
+      "metric_name": "chrf",
+      "docs": [
+        {
+          "query": "Translate to French: The sun is bright",
+          "choices": ["Le soleil est brillant"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: The moon is full",
+          "choices": ["La luna está llena"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The stars are beautiful",
+          "choices": ["Die Sterne sind wunderschön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le soleil"]
+        },
+        {
+          "text": ["La luna"]
+        },
+        {
+          "text": ["Die Sterne"]
+        }
+      ],
+      "expected_output": 50.0,
+      "tolerance": 10.0,
+      "description": "Moderate similarity - partial character overlap (CHRF ≈ 50.0)"
+    },
+    {
+      "name": "CHRF - Low Similarity",
+      "metric_class": "chrf",
+      "metric_params": {},
+      "metric_name": "chrf",
+      "docs": [
+        {
+          "query": "Translate to French: The weather is nice",
+          "choices": ["Le temps est agréable"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: The food is delicious",
+          "choices": ["La comida está deliciosa"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The music is beautiful",
+          "choices": ["Die Musik ist wunderschön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Il pleut beaucoup"]
+        },
+        {
+          "text": ["Hace mucho frío"]
+        },
+        {
+          "text": ["Es sehr heiß"]
+        }
+      ],
+      "expected_output": 20.0,
+      "tolerance": 10.0,
+      "description": "Low similarity - minimal character overlap (CHRF ≈ 20.0)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json
new file mode 100644
index 000000000..80023078e
--- /dev/null
+++ b/tests/unit/metrics/test_cases/chrf_plus.json
@@ -0,0 +1,167 @@
+{
+  "name": "CHRF Plus Test Suite",
+  "description": "Test cases for chrf_plus metric (corpus-level CHRF++ with word order)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "CHRF Plus - Perfect Matches",
+      "metric_class": "chrf_plus",
+      "metric_params": {},
+      "metric_name": "chrf++",
+      "docs": [
+        {
+          "query": "Translate to French: Hello world",
+          "choices": ["Bonjour le monde"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Good morning",
+          "choices": ["Buenos días"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Thank you",
+          "choices": ["Danke schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Bonjour le monde"]
+        },
+        {
+          "text": ["Buenos días"]
+        },
+        {
+          "text": ["Danke schön"]
+        }
+      ],
+      "expected_output": 100.0,
+      "tolerance": 0.01,
+      "description": "Perfect matches - exact character and word order overlap (CHRF++ = 100.0)"
+    },
+    {
+      "name": "CHRF Plus - High Similarity",
+      "metric_class": "chrf_plus",
+      "metric_params": {},
+      "metric_name": "chrf++",
+      "docs": [
+        {
+          "query": "Translate to French: The cat is sleeping",
+          "choices": ["Le chat dort"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: I like pizza",
+          "choices": ["Me gusta la pizza"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The weather is nice",
+          "choices": ["Das Wetter ist schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le chat dort"]
+        },
+        {
+          "text": ["Me gusta pizza"]
+        },
+        {
+          "text": ["Das Wetter ist schön"]
+        }
+      ],
+      "expected_output": 85.0,
+      "tolerance": 5.0,
+      "description": "High similarity - minor character differences (CHRF++ ≈ 85.0)"
+    },
+    {
+      "name": "CHRF Plus - Moderate Similarity",
+      "metric_class": "chrf_plus",
+      "metric_params": {},
+      "metric_name": "chrf++",
+      "docs": [
+        {
+          "query": "Translate to French: The quick brown fox",
+          "choices": ["Le renard brun rapide"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Artificial intelligence",
+          "choices": ["La inteligencia artificial"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Machine learning",
+          "choices": ["Maschinelles Lernen"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le renard rapide"]
+        },
+        {
+          "text": ["La IA"]
+        },
+        {
+          "text": ["ML"]
+        }
+      ],
+      "expected_output": 45.0,
+      "tolerance": 10.0,
+      "description": "Moderate similarity - significant character omissions (CHRF++ ≈ 45.0)"
+    },
+    {
+      "name": "CHRF Plus - Low Similarity",
+      "metric_class": "chrf_plus",
+      "metric_params": {},
+      "metric_name": "chrf++",
+      "docs": [
+        {
+          "query": "Translate to French: The sun is bright",
+          "choices": ["Le soleil est brillant"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: The moon is full",
+          "choices": ["La luna está llena"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The stars are beautiful",
+          "choices": ["Die Sterne sind wunderschön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Il pleut"]
+        },
+        {
+          "text": ["Hace frío"]
+        },
+        {
+          "text": ["Es heiß"]
+        }
+      ],
+      "expected_output": 15.0,
+      "tolerance": 10.0,
+      "description": "Low similarity - minimal character overlap (CHRF++ ≈ 15.0)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/copyright.json b/tests/unit/metrics/test_cases/copyright.json
new file mode 100644
index 000000000..fb5b434f4
--- /dev/null
+++ b/tests/unit/metrics/test_cases/copyright.json
@@ -0,0 +1,69 @@
+{
+  "name": "Copyright Test Suite",
+  "description": "Test cases for copyright metric",
+  "test_cases": [
+    {
+      "name": "Copyright - No Copyright",
+      "metric_class": "copyright",
+      "metric_params": {},
+      "doc": {
+        "query": "Write a story",
+        "choices": ["Once upon a time"],
+        "gold_index": 0,
+        "task_name": "storytelling"
+      },
+      "model_response": {
+        "text": ["Once upon a time"]
+      },
+      "expected_output": {
+        "longest_common_prefix_length": 4.0,
+        "edit_distance": 0.0,
+        "edit_similarity": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test copyright with no copyright violation"
+    },
+    {
+      "name": "Copyright - Partial Match",
+      "metric_class": "copyright",
+      "metric_params": {},
+      "doc": {
+        "query": "Write a story",
+        "choices": ["Once upon a time there was a princess"],
+        "gold_index": 0,
+        "task_name": "storytelling"
+      },
+      "model_response": {
+        "text": ["Once upon a time there was a dragon"]
+      },
+      "expected_output": {
+        "longest_common_prefix_length": 7.0,
+        "edit_distance": 1.0,
+        "edit_similarity": 0.875
+      },
+      "tolerance": 0.1,
+      "description": "Test copyright with partial match"
+    },
+    {
+      "name": "Copyright - High Similarity",
+      "metric_class": "copyright",
+      "metric_params": {},
+      "doc": {
+        "query": "Write a story",
+        "choices": ["Once upon a time there was a beautiful princess who lived in a castle"],
+        "gold_index": 0,
+        "task_name": "storytelling"
+      },
+      "model_response": {
+        "text": ["Once upon a time there was a beautiful princess who lived in a palace"]
+      },
+      "expected_output": {
+        "longest_common_prefix_length": 13.0,
+        "edit_distance": 1.0,
+        "edit_similarity": 0.923
+      },
+      "tolerance": 0.1,
+      "description": "Test copyright with high similarity"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/drop.json b/tests/unit/metrics/test_cases/drop.json
new file mode 100644
index 000000000..49984c291
--- /dev/null
+++ b/tests/unit/metrics/test_cases/drop.json
@@ -0,0 +1,75 @@
+{
+  "name": "Drop Test Suite",
+  "description": "Test cases for drop metric",
+  "test_cases": [
+    {
+      "name": "DROP - Correct Answer",
+      "metric_class": "drop",
+      "metric_params": {},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "specific": {
+          "golds_no_preprocessing": ["4"]
+        },
+        "choices": ["4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["4"]
+      },
+      "expected_output": {
+        "em": 1.0,
+        "f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test DROP with correct answer"
+    },
+    {
+      "name": "DROP - Wrong Answer",
+      "metric_class": "drop",
+      "metric_params": {},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "specific": {
+          "golds_no_preprocessing": ["4"]
+        },
+        "choices": ["4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["5"]
+      },
+      "expected_output": {
+        "em": 0.0,
+        "f1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test DROP with wrong answer"
+    },
+    {
+      "name": "DROP - Partial Match",
+      "metric_class": "drop",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the sum of 2 and 2?",
+        "specific": {
+          "golds_no_preprocessing": ["4", "four"]
+        },
+        "choices": ["4", "four"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["4"]
+      },
+      "expected_output": {
+        "em": 1.0,
+        "f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test DROP with partial match"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/exact_match.json b/tests/unit/metrics/test_cases/exact_match.json
new file mode 100644
index 000000000..f19b5b2e0
--- /dev/null
+++ b/tests/unit/metrics/test_cases/exact_match.json
@@ -0,0 +1,48 @@
+{
+  "name": "Exact Match Test Suite",
+  "description": "Test cases for exact match metric",
+  "test_cases": [
+    {
+      "name": "Exact Match - Perfect Match",
+      "metric_class": "exact_match",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["Paris", "London", "Berlin"],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": ["Paris"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "em": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test exact match with perfect prediction"
+    },
+    {
+      "name": "Exact Match - No Match",
+      "metric_class": "exact_match",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["Paris", "London", "Berlin"],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": ["London"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "em": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test exact match with wrong prediction"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/expr_gold_metric.json b/tests/unit/metrics/test_cases/expr_gold_metric.json
new file mode 100644
index 000000000..c58c1e900
--- /dev/null
+++ b/tests/unit/metrics/test_cases/expr_gold_metric.json
@@ -0,0 +1,47 @@
+{
+  "name": "Expr Gold Metric Test Suite",
+  "description": "Test cases for expr_gold_metric metric",
+  "test_cases": [
+    {
+      "name": "Expr Gold Metric - Basic Test",
+      "metric_class": "expr_gold_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Test query for expr_gold_metric",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ],
+        "logprobs": [
+          0.5,
+          0.3,
+          0.2
+        ],
+        "output_tokens": [
+          [
+            1
+          ],
+          [
+            2
+          ],
+          [
+            3
+          ]
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for expr_gold_metric metric"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/extractiveness.json b/tests/unit/metrics/test_cases/extractiveness.json
new file mode 100644
index 000000000..1b8178239
--- /dev/null
+++ b/tests/unit/metrics/test_cases/extractiveness.json
@@ -0,0 +1,78 @@
+{
+  "name": "Extractiveness Test Suite",
+  "description": "Test cases for extractiveness metric",
+  "test_cases": [
+    {
+      "name": "Extractiveness - High Extractiveness",
+      "metric_class": "extractiveness",
+      "metric_params": {},
+      "doc": {
+        "specific": {
+          "text": "The quick brown fox jumps over the lazy dog. The fox is very fast and agile."
+        },
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"]
+      },
+      "expected_output": {
+        "summarization_coverage": 1.0,
+        "summarization_density": 9.0,
+        "summarization_compression": 2.0
+      },
+      "tolerance": 0.1,
+      "description": "Test extractiveness with partial extraction"
+    },
+    {
+      "name": "Extractiveness - Low Extractiveness",
+      "metric_class": "extractiveness",
+      "metric_params": {},
+      "doc": {
+        "specific": {
+          "text": "The quick brown fox jumps over the lazy dog"
+        },
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["A fox jumps"]
+      },
+      "expected_output": {
+        "summarization_coverage": 0.6666666666666666,
+        "summarization_density": 1.3333333333333333,
+        "summarization_compression": 3.0
+      },
+      "tolerance": 0.1,
+      "description": "Test extractiveness with low extraction"
+    },
+    {
+      "name": "Extractiveness - Perfect Extraction",
+      "metric_class": "extractiveness",
+      "metric_params": {},
+      "doc": {
+        "specific": {
+          "text": "The quick brown fox jumps over the lazy dog"
+        },
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"]
+      },
+      "expected_output": {
+        "summarization_coverage": 1.0,
+        "summarization_density": 9.0,
+        "summarization_compression": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test extractiveness with perfect extraction"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/f1_score.json b/tests/unit/metrics/test_cases/f1_score.json
new file mode 100644
index 000000000..e62ff8fb2
--- /dev/null
+++ b/tests/unit/metrics/test_cases/f1_score.json
@@ -0,0 +1,153 @@
+{
+  "name": "F1 Score Test Suite",
+  "description": "Test cases for F1 score metric",
+  "test_cases": [
+    {
+      "name": "F1 Score - Perfect Match",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test F1 score with perfect match"
+    },
+    {
+      "name": "F1 Score - Partial Match",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 0.6153846153846154
+      },
+      "tolerance": 0.1,
+      "description": "Test F1 score with partial match"
+    },
+    {
+      "name": "F1 Score - No Match",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["A cat sleeps on the mat"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 0.13333333333333333
+      },
+      "tolerance": 0.01,
+      "description": "Test F1 score with no match"
+    },
+    {
+      "name": "F1 Score - Different Word Order",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The brown quick fox jumps over the dog lazy"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test F1 score with different word order (bag of words)"
+    },
+    {
+      "name": "F1 Score - Extra Words",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog and runs fast"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 0.8
+      },
+      "tolerance": 0.1,
+      "description": "Test F1 score with extra words in prediction"
+    },
+    {
+      "name": "F1 Score - Missing Words",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The fox jumps over the dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 0.8
+      },
+      "tolerance": 0.1,
+      "description": "Test F1 score with missing words in prediction"
+    },
+    {
+      "name": "F1 Score - Multiple Gold References",
+      "metric_class": "f1_score",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog", "A fox jumps over a dog"],
+        "gold_index": [0, 1],
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test F1 score with multiple gold references"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/f1_score_macro.json b/tests/unit/metrics/test_cases/f1_score_macro.json
new file mode 100644
index 000000000..5a7f32eac
--- /dev/null
+++ b/tests/unit/metrics/test_cases/f1_score_macro.json
@@ -0,0 +1,167 @@
+{
+  "name": "F1 Score Macro Test Suite",
+  "description": "Test cases for f1_score_macro metric (corpus-level macro F1 score)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "F1 Score Macro - Perfect Predictions",
+      "metric_class": "f1_score_macro",
+      "metric_params": {},
+      "metric_name": "f1",
+      "docs": [
+        {
+          "query": "What is the capital of France?",
+          "choices": ["Paris", "London", "Berlin"],
+          "gold_index": 0,
+          "task_name": "geography"
+        },
+        {
+          "query": "What is 2 + 2?",
+          "choices": ["3", "4", "5"],
+          "gold_index": 1,
+          "task_name": "math"
+        },
+        {
+          "query": "What color is the sky?",
+          "choices": ["Red", "Blue", "Green"],
+          "gold_index": 1,
+          "task_name": "science"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Paris"]
+        },
+        {
+          "text": ["4"]
+        },
+        {
+          "text": ["Blue"]
+        }
+      ],
+      "expected_output": 1.0,
+      "tolerance": 0.01,
+      "description": "Perfect predictions - all model outputs exactly match the gold choices"
+    },
+    {
+      "name": "F1 Score Macro - Balanced Performance",
+      "metric_class": "f1_score_macro",
+      "metric_params": {},
+      "metric_name": "f1",
+      "docs": [
+        {
+          "query": "Summarize: The quick brown fox jumps over the lazy dog",
+          "choices": ["The quick brown fox jumps over the lazy dog"],
+          "gold_index": 0,
+          "task_name": "summarization"
+        },
+        {
+          "query": "What is the weather like?",
+          "choices": ["It is sunny and warm today"],
+          "gold_index": 0,
+          "task_name": "weather"
+        },
+        {
+          "query": "Describe a cat",
+          "choices": ["A cat is a small furry animal"],
+          "gold_index": 0,
+          "task_name": "description"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["The quick brown fox"]
+        },
+        {
+          "text": ["It is sunny today"]
+        },
+        {
+          "text": ["A cat is furry"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.1,
+      "description": "Balanced partial matches - all samples have similar word overlap levels"
+    },
+    {
+      "name": "F1 Score Macro - Mixed Performance",
+      "metric_class": "f1_score_macro",
+      "metric_params": {},
+      "metric_name": "f1",
+      "docs": [
+        {
+          "query": "What is the capital of Japan?",
+          "choices": ["Tokyo"],
+          "gold_index": 0,
+          "task_name": "geography"
+        },
+        {
+          "query": "What is 5 x 5?",
+          "choices": ["25"],
+          "gold_index": 0,
+          "task_name": "math"
+        },
+        {
+          "query": "What is the largest planet?",
+          "choices": ["Jupiter"],
+          "gold_index": 0,
+          "task_name": "science"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Tokyo"]
+        },
+        {
+          "text": ["30"]
+        },
+        {
+          "text": ["Jupiter"]
+        }
+      ],
+      "expected_output": 0.5,
+      "tolerance": 0.1,
+      "description": "Mixed performance - 2 perfect matches, 1 no match (macro average of individual F1s)"
+    },
+    {
+      "name": "F1 Score Macro - No Matches",
+      "metric_class": "f1_score_macro",
+      "metric_params": {},
+      "metric_name": "f1",
+      "docs": [
+        {
+          "query": "What is the main ingredient in pizza?",
+          "choices": ["Cheese is the main ingredient in pizza"],
+          "gold_index": 0,
+          "task_name": "cooking"
+        },
+        {
+          "query": "What is the opposite of hot?",
+          "choices": ["Cold"],
+          "gold_index": 0,
+          "task_name": "vocabulary"
+        },
+        {
+          "query": "What is the largest ocean?",
+          "choices": ["The Pacific Ocean is the largest"],
+          "gold_index": 0,
+          "task_name": "geography"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Tomato sauce"]
+        },
+        {
+          "text": ["Warm"]
+        },
+        {
+          "text": ["Atlantic Ocean"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.01,
+      "description": "No matches - all model outputs have zero word overlap with gold choices"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/f1_score_micro.json b/tests/unit/metrics/test_cases/f1_score_micro.json
new file mode 100644
index 000000000..fec84f793
--- /dev/null
+++ b/tests/unit/metrics/test_cases/f1_score_micro.json
@@ -0,0 +1,167 @@
+{
+  "name": "F1 Score Micro Test Suite",
+  "description": "Test cases for f1_score_micro metric (corpus-level micro F1 score)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "F1 Score Micro - Perfect Predictions",
+      "metric_class": "f1_score_micro",
+      "metric_name": "f1",
+      "metric_params": {},
+      "docs": [
+        {
+          "query": "What is the capital of France?",
+          "choices": ["Paris", "London", "Berlin"],
+          "gold_index": 0,
+          "task_name": "geography"
+        },
+        {
+          "query": "What is 2 + 2?",
+          "choices": ["3", "4", "5"],
+          "gold_index": 1,
+          "task_name": "math"
+        },
+        {
+          "query": "What color is the sky?",
+          "choices": ["Red", "Blue", "Green"],
+          "gold_index": 1,
+          "task_name": "science"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Paris"]
+        },
+        {
+          "text": ["4"]
+        },
+        {
+          "text": ["Blue"]
+        }
+      ],
+      "expected_output": 1.0,
+      "tolerance": 0.01,
+      "description": "Perfect predictions - all model outputs exactly match the gold choices"
+    },
+    {
+      "name": "F1 Score Micro - Partial Matches",
+      "metric_class": "f1_score_micro",
+      "metric_name": "f1",
+      "metric_params": {},
+      "docs": [
+        {
+          "query": "Summarize: The quick brown fox jumps over the lazy dog",
+          "choices": ["The quick brown fox jumps over the lazy dog"],
+          "gold_index": 0,
+          "task_name": "summarization"
+        },
+        {
+          "query": "What is the weather like?",
+          "choices": ["It is sunny and warm today"],
+          "gold_index": 0,
+          "task_name": "weather"
+        },
+        {
+          "query": "Describe a cat",
+          "choices": ["A cat is a small furry animal"],
+          "gold_index": 0,
+          "task_name": "description"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["The quick brown fox"]
+        },
+        {
+          "text": ["It is sunny today"]
+        },
+        {
+          "text": ["A cat is furry"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.1,
+      "description": "Partial matches - model outputs contain some but not all words from gold choices"
+    },
+    {
+      "name": "F1 Score Micro - No Matches",
+      "metric_class": "f1_score_micro",
+      "metric_name": "f1",
+      "metric_params": {},
+      "docs": [
+        {
+          "query": "What is the capital of Japan?",
+          "choices": ["Tokyo"],
+          "gold_index": 0,
+          "task_name": "geography"
+        },
+        {
+          "query": "What is 5 x 5?",
+          "choices": ["25"],
+          "gold_index": 0,
+          "task_name": "math"
+        },
+        {
+          "query": "What is the largest planet?",
+          "choices": ["Jupiter"],
+          "gold_index": 0,
+          "task_name": "science"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["London"]
+        },
+        {
+          "text": ["30"]
+        },
+        {
+          "text": ["Mars"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.01,
+      "description": "No matches - model outputs have no word overlap with gold choices"
+    },
+    {
+      "name": "F1 Score Micro - Mixed Performance",
+      "metric_class": "f1_score_micro",
+      "metric_params": {},
+      "metric_name": "f1",
+      "docs": [
+        {
+          "query": "What is the main ingredient in pizza?",
+          "choices": ["Cheese is the main ingredient in pizza"],
+          "gold_index": 0,
+          "task_name": "cooking"
+        },
+        {
+          "query": "What is the opposite of hot?",
+          "choices": ["Cold"],
+          "gold_index": 0,
+          "task_name": "vocabulary"
+        },
+        {
+          "query": "What is the largest ocean?",
+          "choices": ["The Pacific Ocean is the largest"],
+          "gold_index": 0,
+          "task_name": "geography"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Cheese is the main ingredient"]
+        },
+        {
+          "text": ["Hot"]
+        },
+        {
+          "text": ["The Pacific Ocean"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.1,
+      "description": "Mixed performance - one perfect match, one no match, one partial match"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/faithfulness.json b/tests/unit/metrics/test_cases/faithfulness.json
new file mode 100644
index 000000000..24827b7e4
--- /dev/null
+++ b/tests/unit/metrics/test_cases/faithfulness.json
@@ -0,0 +1,90 @@
+{
+  "name": "Faithfulness Test Suite",
+  "description": "Test cases for faithfulness metric",
+  "test_cases": [
+    {
+      "name": "Faithfulness - Basic Test",
+      "metric_class": "faithfulness",
+      "metric_params": {},
+      "doc": {
+        "specific": {
+          "text": "Test query for faithfulness"
+        },
+        "query": "Test query for faithfulness",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ]
+      },
+      "expected_output": {
+        "summac": -0.516
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for faithfulness metric"
+    },
+    {
+      "name": "Faithfulness - High Faithfulness Test",
+      "metric_class": "faithfulness",
+      "metric_params": {},
+      "doc": {
+        "specific": {
+          "text": "The quick brown fox jumps over the lazy dog. This sentence contains all the letters of the English alphabet. It is commonly used for testing typing skills and font displays."
+        },
+        "query": "Summarize the text about the fox and dog",
+        "choices": [
+          "A fox jumps over a dog",
+          "The quick brown fox jumps over the lazy dog",
+          "A sentence with all alphabet letters"
+        ],
+        "gold_index": 1,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": [
+          "The quick brown fox jumps over the lazy dog. This sentence contains all the letters of the English alphabet."
+        ]
+      },
+      "expected_output": {
+        "summac": 0.20
+      },
+      "tolerance": 0.01,
+      "description": "Test case with high faithfulness - model output closely matches source text"
+    },
+    {
+      "name": "Faithfulness - Low Faithfulness Test",
+      "metric_class": "faithfulness",
+      "metric_params": {},
+      "doc": {
+        "specific": {
+          "text": "The weather today is sunny with clear skies. Temperature is expected to reach 25 degrees Celsius. There is no chance of rain according to the forecast."
+        },
+        "query": "What's the weather like?",
+        "choices": [
+          "It's sunny and warm",
+          "It's raining heavily",
+          "The weather is unclear"
+        ],
+        "gold_index": 0,
+        "task_name": "weather_qa"
+      },
+      "model_response": {
+        "text": [
+          "It's raining heavily with thunderstorms expected throughout the day. The temperature will drop to 10 degrees and there's a 90% chance of precipitation."
+        ]
+      },
+      "expected_output": {
+        "summac": -0.997
+      },
+      "tolerance": 0.01,
+      "description": "Test case with low faithfulness - model output contradicts source text"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k.json b/tests/unit/metrics/test_cases/g_pass_at_k.json
new file mode 100644
index 000000000..8f016c8fb
--- /dev/null
+++ b/tests/unit/metrics/test_cases/g_pass_at_k.json
@@ -0,0 +1,316 @@
+{
+  "name": "G Pass At K Test Suite",
+  "description": "Comprehensive test cases for g_pass_at_k metric covering various scenarios including multiple samples, different k values, thresholds, and general text content",
+  "test_cases": [
+    {
+      "name": "G Pass At K - Basic Single Sample Correct",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["Paris", "London", "Berlin"],
+        "gold_index": 0,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@1_0.0": 1.0,
+        "g-pass@1_0.25": 1.0,
+        "g-pass@1_0.5": 1.0,
+        "g-pass@1_0.75": 1.0,
+        "g-pass@1_1.0": 1.0,
+        "mg-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case with single correct sample"
+    },
+    {
+      "name": "G Pass At K - Multiple Samples All Correct",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 2,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the largest planet in our solar system?",
+        "choices": ["Jupiter"],
+        "gold_index": 0,
+        "task_name": "astronomy"
+      },
+      "model_response": {
+        "text": ["Jupiter", "Jupiter", "Jupiter"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@2_0.0": 1.0,
+        "g-pass@2_0.25": 1.0,
+        "g-pass@2_0.5": 1.0,
+        "g-pass@2_0.75": 1.0,
+        "g-pass@2_1.0": 1.0,
+        "mg-pass@2": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with multiple samples all correct"
+    },
+    {
+      "name": "G Pass At K - Mixed Correct and Incorrect",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "Who wrote Romeo and Juliet?",
+        "choices": ["William Shakespeare"],
+        "gold_index": 0,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "text": ["William Shakespeare", "Shakespeare", "William Shakespeare", "Charles Dickens"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@2_0.0": 0.8333333333333334,
+        "g-pass@2_0.25": 0.8333333333333334,
+        "g-pass@2_0.5": 0.8333333333333334,
+        "g-pass@2_0.75": 0.16666666666666666,
+        "g-pass@2_1.0": 0.16666666666666666,
+        "mg-pass@2": 0.16666666666666666
+      },
+      "tolerance": 0.01,
+      "description": "Test case with mixed correct and incorrect samples"
+    },
+    {
+      "name": "G Pass At K - Case Sensitivity",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the chemical symbol for gold?",
+        "choices": ["Au"],
+        "gold_index": 0,
+        "task_name": "chemistry"
+      },
+      "model_response": {
+        "text": ["Au", "au"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@1_0.0": 0.5,
+        "g-pass@1_0.25": 0.5,
+        "g-pass@1_0.5": 0.5,
+        "g-pass@1_0.75": 0.5,
+        "g-pass@1_1.0": 0.5,
+        "mg-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with case sensitivity (strip_strings should handle this)"
+    },
+    {
+      "name": "G Pass At K - All Incorrect Samples",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What year did World War II end?",
+        "choices": ["1945"],
+        "gold_index": 0,
+        "task_name": "history"
+      },
+      "model_response": {
+        "text": ["1944", "1946", "1939"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@1_0.0": 0.0,
+        "g-pass@1_0.25": 0.0,
+        "g-pass@1_0.5": 0.0,
+        "g-pass@1_0.75": 0.0,
+        "g-pass@1_1.0": 0.0,
+        "mg-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with all incorrect samples"
+    },
+    {
+      "name": "G Pass At K - High K Value",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 5,
+        "n": 8,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the speed of light in vacuum?",
+        "choices": ["299,792,458 meters per second"],
+        "gold_index": 0,
+        "task_name": "physics"
+      },
+      "model_response": {
+        "text": ["299,792,458 meters per second", "3x10^8 m/s", "299,792,458 meters per second", "300,000 km/s", "299,792,458 meters per second", "c", "299,792,458 meters per second", "186,282 miles per second"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@5_0.0": 1.0,
+        "g-pass@5_0.25": 0.9285714285714286,
+        "g-pass@5_0.5": 0.5,
+        "g-pass@5_0.75": 0.07142857142857142,
+        "g-pass@5_1.0": 0.0,
+        "mg-pass@5": 0.02857142857142857
+      },
+      "tolerance": 0.01,
+      "description": "Test case with high k value and multiple correct samples"
+    },
+    {
+      "name": "G Pass At K - Long Text Answer",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the main theme of George Orwell's 1984?",
+        "choices": ["Totalitarianism and surveillance"],
+        "gold_index": 0,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "text": ["Totalitarianism and surveillance", "Dystopian society"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@1_0.0": 0.5,
+        "g-pass@1_0.25": 0.5,
+        "g-pass@1_0.5": 0.5,
+        "g-pass@1_0.75": 0.5,
+        "g-pass@1_1.0": 0.5,
+        "mg-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with longer text answers"
+    },
+    {
+      "name": "G Pass At K - Numeric Answer",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "How many sides does a hexagon have?",
+        "choices": ["6"],
+        "gold_index": 0,
+        "task_name": "geometry"
+      },
+      "model_response": {
+        "text": ["6", "six", "Six"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@1_0.0": 0.3333333333333333,
+        "g-pass@1_0.25": 0.3333333333333333,
+        "g-pass@1_0.5": 0.3333333333333333,
+        "g-pass@1_0.75": 0.3333333333333333,
+        "g-pass@1_1.0": 0.3333333333333333,
+        "mg-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with numeric answers"
+    },
+    {
+      "name": "G Pass At K - Partial Match",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the full name of the author of Pride and Prejudice?",
+        "choices": ["Jane Austen"],
+        "gold_index": 0,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "text": ["Jane Austen", "Austen", "Jane Austen", "J. Austen"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@2_0.0": 0.8333333333333334,
+        "g-pass@2_0.25": 0.8333333333333334,
+        "g-pass@2_0.5": 0.8333333333333334,
+        "g-pass@2_0.75": 0.16666666666666666,
+        "g-pass@2_1.0": 0.16666666666666666,
+        "mg-pass@2": 0.16666666666666666
+      },
+      "tolerance": 0.01,
+      "description": "Test case with partial matches (exact string matching)"
+    },
+    {
+      "name": "G Pass At K - Edge Case Empty String",
+      "metric_class": "g_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the answer to this question?",
+        "choices": [""],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [""],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "g-pass@1_0.0": 1.0,
+        "g-pass@1_0.25": 1.0,
+        "g-pass@1_0.5": 1.0,
+        "g-pass@1_0.75": 1.0,
+        "g-pass@1_1.0": 1.0,
+        "mg-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Edge case with empty string"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_latex.json b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
new file mode 100644
index 000000000..afd7580de
--- /dev/null
+++ b/tests/unit/metrics/test_cases/g_pass_at_k_latex.json
@@ -0,0 +1,223 @@
+{
+  "name": "G Pass At K Latex Test Suite",
+  "description": "Comprehensive test cases for g_pass_at_k_latex metric covering various scenarios including multiple samples, different k values, thresholds, and mathematical content",
+  "test_cases": [
+    {
+      "name": "G Pass At K Latex - Basic Single Sample Correct",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is 2+2?",
+        "choices": ["$\\frac{1}{2}$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$\\frac{1}{2}$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@1_0.0": 1.0,
+        "latex_g-pass@1_0.25": 1.0,
+        "latex_g-pass@1_0.5": 1.0,
+        "latex_g-pass@1_0.75": 1.0,
+        "latex_g-pass@1_1.0": 1.0,
+        "mlatex_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case with single correct sample"
+    },
+    {
+      "name": "G Pass At K Latex - Multiple Samples All Correct",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 2,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the derivative of x^2?",
+        "choices": ["$2x$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$2x$", "$2x$", "$2x$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@2_0.0": 1.0,
+        "latex_g-pass@2_0.25": 1.0,
+        "latex_g-pass@2_0.5": 1.0,
+        "latex_g-pass@2_0.75": 1.0,
+        "latex_g-pass@2_1.0": 1.0,
+        "mlatex_g-pass@2": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with multiple samples all correct"
+    },
+    {
+      "name": "G Pass At K Latex - Mixed Correct and Incorrect",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the integral of x?",
+        "choices": ["$\\frac{x^2}{2}$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$\\frac{x^2}{2}$", "$x$", "$\\frac{x^2}{2}$", "$x^2$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@2_0.0": 0.8333333333333334,
+        "latex_g-pass@2_0.25": 0.8333333333333334,
+        "latex_g-pass@2_0.5": 0.8333333333333334,
+        "latex_g-pass@2_0.75": 0.16666666666666666,
+        "latex_g-pass@2_1.0": 0.16666666666666666,
+        "mlatex_g-pass@2": 0.16666666666666666
+      },
+      "tolerance": 0.01,
+      "description": "Test case with mixed correct and incorrect samples"
+    },
+    {
+      "name": "G Pass At K Latex - Complex LaTeX Expression",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the quadratic formula?",
+        "choices": ["$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$", "$x = \\frac{-b + \\sqrt{b^2 - 4ac}}{2a}$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@1_0.0": 0.5,
+        "latex_g-pass@1_0.25": 0.5,
+        "latex_g-pass@1_0.5": 0.5,
+        "latex_g-pass@1_0.75": 0.5,
+        "latex_g-pass@1_1.0": 0.5,
+        "mlatex_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with complex LaTeX expression"
+    },
+    {
+      "name": "G Pass At K Latex - All Incorrect Samples",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 1,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the limit of 1/x as x approaches infinity?",
+        "choices": ["$0$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$1$", "$\\infty$", "$\\text{undefined}$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@1_0.0": 0.0,
+        "latex_g-pass@1_0.25": 0.0,
+        "latex_g-pass@1_0.5": 0.0,
+        "latex_g-pass@1_0.75": 0.0,
+        "latex_g-pass@1_1.0": 0.0,
+        "mlatex_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with all incorrect samples"
+    },
+    {
+      "name": "G Pass At K Latex - High K Value",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 5,
+        "n": 8,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the sum of the first n natural numbers?",
+        "choices": ["$\\frac{n(n+1)}{2}$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$\\frac{n(n+1)}{2}$", "$n(n+1)/2$", "$\\frac{n(n+1)}{2}$", "$n^2/2$", "$\\frac{n(n+1)}{2}$", "$n+1$", "$\\frac{n(n+1)}{2}$", "$n$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@5_0.0": 1.0,
+        "latex_g-pass@5_0.25": 1.0,
+        "latex_g-pass@5_0.5": 0.8214285714285715,
+        "latex_g-pass@5_0.75": 0.28571428571428564,
+        "latex_g-pass@5_1.0": 0.017857142857142853,
+        "mlatex_g-pass@5": 0.1214285714285714
+      },
+      "tolerance": 0.01,
+      "description": "Test case with high k value and multiple correct samples"
+    },
+    {
+      "name": "G Pass At K Latex - Edge Case Single Sample",
+      "metric_class": "g_pass_at_k_latex",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the value of pi?",
+        "choices": ["$\\pi$"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["$3.14159$"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "latex_g-pass@1_0.0": 0.0,
+        "latex_g-pass@1_0.25": 0.0,
+        "latex_g-pass@1_0.5": 0.0,
+        "latex_g-pass@1_0.75": 0.0,
+        "latex_g-pass@1_1.0": 0.0,
+        "mlatex_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Edge case with single incorrect sample"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/g_pass_at_k_math.json b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
new file mode 100644
index 000000000..0bd2f20e3
--- /dev/null
+++ b/tests/unit/metrics/test_cases/g_pass_at_k_math.json
@@ -0,0 +1,347 @@
+{
+  "name": "G Pass At K Math Test Suite",
+  "description": "Comprehensive test cases for g_pass_at_k_math metric covering various scenarios including multiple samples, different k values, thresholds, and mathematical content",
+  "test_cases": [
+    {
+      "name": "G Pass At K Math - Basic Single Sample Correct",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is 2+2?",
+        "choices": ["4", "5", "6"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["4"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 1.0,
+        "math_g-pass@1_0.25": 1.0,
+        "math_g-pass@1_0.5": 1.0,
+        "math_g-pass@1_0.75": 1.0,
+        "math_g-pass@1_1.0": 1.0,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case with single correct sample"
+    },
+    {
+      "name": "G Pass At K Math - Multiple Samples All Correct",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 2,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the derivative of x^2?",
+        "choices": ["2x"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["2x", "2x", "2x"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@2_0.0": 1.0,
+        "math_g-pass@2_0.25": 1.0,
+        "math_g-pass@2_0.5": 1.0,
+        "math_g-pass@2_0.75": 1.0,
+        "math_g-pass@2_1.0": 1.0,
+        "mmath_g-pass@2": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with multiple samples all correct"
+    },
+    {
+      "name": "G Pass At K Math - Mixed Correct and Incorrect",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the integral of x?",
+        "choices": ["x^2/2"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["x^2/2", "x", "x^2/2", "x^2"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@2_0.0": 0.0,
+        "math_g-pass@2_0.25": 0.0,
+        "math_g-pass@2_0.5": 0.0,
+        "math_g-pass@2_0.75": 0.0,
+        "math_g-pass@2_1.0": 0.0,
+        "mmath_g-pass@2": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with mixed correct and incorrect samples"
+    },
+    {
+      "name": "G Pass At K Math - Decimal Numbers",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is pi to 2 decimal places?",
+        "choices": ["3.14"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["3.14", "3.14159"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 0.5,
+        "math_g-pass@1_0.25": 0.5,
+        "math_g-pass@1_0.5": 0.5,
+        "math_g-pass@1_0.75": 0.5,
+        "math_g-pass@1_1.0": 0.5,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with decimal numbers"
+    },
+    {
+      "name": "G Pass At K Math - Fractions",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is 1/2 + 1/4?",
+        "choices": ["3/4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["3/4", "0.75", "1/2"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 0.6666666666666667,
+        "math_g-pass@1_0.25": 0.6666666666666667,
+        "math_g-pass@1_0.5": 0.6666666666666667,
+        "math_g-pass@1_0.75": 0.6666666666666667,
+        "math_g-pass@1_1.0": 0.6666666666666667,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with fractions"
+    },
+    {
+      "name": "G Pass At K Math - All Incorrect Samples",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 3,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the limit of 1/x as x approaches infinity?",
+        "choices": ["0"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["1", "infinity", "undefined"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 0.0,
+        "math_g-pass@1_0.25": 0.0,
+        "math_g-pass@1_0.5": 0.0,
+        "math_g-pass@1_0.75": 0.0,
+        "math_g-pass@1_1.0": 0.0,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with all incorrect samples"
+    },
+    {
+      "name": "G Pass At K Math - High K Value",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 5,
+        "n": 8,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the sum of the first n natural numbers?",
+        "choices": ["n(n+1)/2"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["n(n+1)/2", "n*(n+1)/2", "n(n+1)/2", "n^2/2", "n(n+1)/2", "n+1", "n(n+1)/2", "n"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@5_0.0": 0.0,
+        "math_g-pass@5_0.25": 0.0,
+        "math_g-pass@5_0.5": 0.0,
+        "math_g-pass@5_0.75": 0.0,
+        "math_g-pass@5_1.0": 0.0,
+        "mmath_g-pass@5": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with high k value and multiple correct samples"
+    },
+    {
+      "name": "G Pass At K Math - Negative Numbers",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is -5 + 3?",
+        "choices": ["-2"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["-2", "2"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 0.5,
+        "math_g-pass@1_0.25": 0.5,
+        "math_g-pass@1_0.5": 0.5,
+        "math_g-pass@1_0.75": 0.5,
+        "math_g-pass@1_1.0": 0.5,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with negative numbers"
+    },
+    {
+      "name": "G Pass At K Math - Complex Expression",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is (2+3)*4?",
+        "choices": ["20"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["20", "24", "20", "14"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@2_0.0": 0.8333333333333334,
+        "math_g-pass@2_0.25": 0.8333333333333334,
+        "math_g-pass@2_0.5": 0.8333333333333334,
+        "math_g-pass@2_0.75": 0.16666666666666666,
+        "math_g-pass@2_1.0": 0.16666666666666666,
+        "mmath_g-pass@2": 0.16666666666666666
+      },
+      "tolerance": 0.01,
+      "description": "Test case with complex mathematical expression"
+    },
+    {
+      "name": "G Pass At K Math - Percentage",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is 25% of 80?",
+        "choices": ["20"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["20", "25"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 0.5,
+        "math_g-pass@1_0.25": 0.5,
+        "math_g-pass@1_0.5": 0.5,
+        "math_g-pass@1_0.75": 0.5,
+        "math_g-pass@1_1.0": 0.5,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with percentage calculation"
+    },
+    {
+      "name": "G Pass At K Math - Edge Case Zero",
+      "metric_class": "g_pass_at_k_math",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "thresholds": [0.0, 0.25, 0.5, 0.75, 1.0],
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is 5 - 5?",
+        "choices": ["0"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["0"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "math_g-pass@1_0.0": 1.0,
+        "math_g-pass@1_0.25": 1.0,
+        "math_g-pass@1_0.5": 1.0,
+        "math_g-pass@1_0.75": 1.0,
+        "math_g-pass@1_1.0": 1.0,
+        "mmath_g-pass@1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Edge case with zero result"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_metric.json b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
new file mode 100644
index 000000000..af68ba3e5
--- /dev/null
+++ b/tests/unit/metrics/test_cases/gpqa_instruct_metric.json
@@ -0,0 +1,447 @@
+{
+  "name": "Gpqa Instruct Metric Test Suite",
+  "description": "Test cases for gpqa_instruct_metric metric - tests multiple choice answer extraction (A, B, C, D)",
+  "test_cases": [
+    {
+      "name": "Basic Answer Extraction - Direct Answer",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the capital of France?\n\nA) London\nB) Paris\nC) Berlin\nD) Madrid",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Let me think about this step by step. France is a country in Europe, and its capital city is Paris. This is a well-known fact in geography.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case with direct answer format"
+    },
+    {
+      "name": "Answer with Reasoning - Correct Format",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhich planet is closest to the Sun?\n\nA) Earth\nB) Venus\nC) Mercury\nD) Mars",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Let me think about this step by step. The planets in order from the Sun are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune. So Mercury is the closest planet to the Sun.\n\nAnswer: C"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with reasoning but correct final format"
+    },
+    {
+      "name": "Answer Embedded in Reasoning",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is 2 + 2?\n\nA) 3\nB) 4\nC) 5\nD) 6",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Let me think about this step by step. 2 + 2 = 4, so the answer is B. This is basic arithmetic.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer mentioned in reasoning and correctly formatted at end"
+    },
+    {
+      "name": "Wrong Answer Format",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat color is the sky?\n\nA) Red\nB) Green\nC) Blue\nD) Yellow",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The sky appears blue due to Rayleigh scattering of sunlight. The answer is C.\n\nAnswer: A"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Correct reasoning but wrong final answer"
+    },
+    {
+      "name": "No Answer Format",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the largest ocean?\n\nA) Atlantic\nB) Indian\nC) Pacific\nD) Arctic",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The Pacific Ocean is the largest ocean on Earth, covering about one-third of the Earth's surface. It's much larger than the Atlantic, Indian, and Arctic oceans."
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Correct reasoning but no answer format"
+    },
+    {
+      "name": "Multiple Answer Mentions",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhich is a primary color?\n\nA) Orange\nB) Purple\nC) Red\nD) Green",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The primary colors are red, blue, and yellow. So the answer is C (Red). Red is a primary color because it cannot be created by mixing other colors.\n\nAnswer: C"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer mentioned multiple times but correct final format"
+    },
+    {
+      "name": "Answer with Parentheses",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the chemical symbol for gold?\n\nA) Ag\nB) Au\nC) Fe\nD) Cu",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The chemical symbol for gold is Au, which comes from the Latin word 'aurum'. So the answer is (B) Au.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with parentheses but correct final format"
+    },
+    {
+      "name": "Answer with Period",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nHow many sides does a triangle have?\n\nA) 2\nB) 3\nC) 4\nD) 5",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "A triangle is a polygon with three sides. So the answer is B.\n\nAnswer: B."
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with period but still extractable"
+    },
+    {
+      "name": "Answer with Extra Text",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the opposite of hot?\n\nA) Warm\nB) Cold\nC) Lukewarm\nD) Tepid",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The opposite of hot is cold. So the answer is B.\n\nAnswer: B, which is cold."
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with extra text but letter still extractable"
+    },
+    {
+      "name": "Answer in Different Case",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the largest planet?\n\nA) Earth\nB) Mars\nC) Jupiter\nD) Saturn",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Jupiter is the largest planet in our solar system. So the answer is c.\n\nAnswer: c"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer in lowercase but still extractable"
+    },
+    {
+      "name": "Answer with Reasoning Only",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the square root of 16?\n\nA) 2\nB) 4\nC) 8\nD) 16",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The square root of 16 is 4, because 4 × 4 = 16. Therefore, the answer is B."
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer embedded in reasoning without formal format"
+    },
+    {
+      "name": "Answer with Quotes",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the capital of Japan?\n\nA) Beijing\nB) Seoul\nC) Tokyo\nD) Bangkok",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The capital of Japan is Tokyo. So the answer is \"C\".\n\nAnswer: \"C\""
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with quotes but still extractable"
+    },
+    {
+      "name": "Answer with Multiple Letters",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhich of these are mammals?\n\nA) Fish\nB) Birds\nC) Dogs\nD) Reptiles",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Dogs are mammals, while fish, birds, and reptiles are not. So the answer is C.\n\nAnswer: C"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Multiple letters mentioned but correct single answer"
+    },
+    {
+      "name": "Answer with Explanation",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the chemical formula for water?\n\nA) CO2\nB) H2O\nC) O2\nD) NaCl",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Water is composed of two hydrogen atoms and one oxygen atom, so its chemical formula is H2O. Therefore, the answer is B.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with detailed explanation"
+    },
+    {
+      "name": "Answer with Confidence",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the speed of light?\n\nA) 186,000 mph\nB) 186,000 m/s\nC) 186,000 km/s\nD) 186,000 km/h",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The speed of light in a vacuum is approximately 186,000 miles per second, which is about 300,000 kilometers per second. So the answer is definitely C.\n\nAnswer: C"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with confidence indicator"
+    },
+    {
+      "name": "Answer with Step-by-Step",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is 15 × 3?\n\nA) 35\nB) 40\nC) 45\nD) 50",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Let me solve this step by step:\n1. 15 × 3 = 45\n2. This matches option C\n\nAnswer: C"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with numbered steps"
+    },
+    {
+      "name": "Answer with Elimination",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhich is not a fruit?\n\nA) Apple\nB) Carrot\nC) Banana\nD) Orange",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Let me eliminate the wrong answers:\n- Apple is a fruit ✓\n- Carrot is a vegetable ✗\n- Banana is a fruit ✓\n- Orange is a fruit ✓\n\nSo the answer is B.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer using elimination method"
+    },
+    {
+      "name": "Answer with Definition",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is a synonym for 'happy'?\n\nA) Sad\nB) Joyful\nC) Angry\nD) Tired",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "A synonym is a word that means the same or nearly the same as another word. 'Happy' means feeling or showing pleasure or contentment. 'Joyful' means feeling, expressing, or causing great pleasure and happiness. Therefore, 'joyful' is a synonym for 'happy', so the answer is B.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with definition and explanation"
+    },
+    {
+      "name": "Answer with Comparison",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhich is larger?\n\nA) 1/2\nB) 1/3\nC) 1/4\nD) 1/5",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 0,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "Let me compare these fractions:\n1/2 = 0.5\n1/3 ≈ 0.333\n1/4 = 0.25\n1/5 = 0.2\n\n1/2 is the largest, so the answer is A.\n\nAnswer: A"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with numerical comparison"
+    },
+    {
+      "name": "Answer with Context",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nIn what year did World War II end?\n\nA) 1943\nB) 1944\nC) 1945\nD) 1946",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "World War II ended in 1945. Germany surrendered in May 1945, and Japan surrendered in September 1945 after the atomic bombings of Hiroshima and Nagasaki. So the answer is C.\n\nAnswer: C"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with historical context"
+    },
+    {
+      "name": "Answer with Formula",
+      "metric_class": "gpqa_instruct_metric",
+      "metric_params": {},
+      "doc": {
+        "query": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nWhat is the area of a circle with radius 3?\n\nA) 6π\nB) 9π\nC) 12π\nD) 18π",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "gpqa_instruct"
+      },
+      "model_response": {
+        "text": [
+          "The area of a circle is given by the formula A = πr².\nFor radius r = 3:\nA = π(3)² = π(9) = 9π\n\nSo the answer is B.\n\nAnswer: B"
+        ]
+      },
+      "expected_output": {
+        "extractive_match": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Answer with mathematical formula"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
new file mode 100644
index 000000000..c3a0c6f25
--- /dev/null
+++ b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json
@@ -0,0 +1,281 @@
+{
+  "name": "Gpqa Instruct Pass At K Test Suite",
+  "description": "Comprehensive test cases for gpqa_instruct_pass_at_k metric covering various scenarios including multiple samples, different k values, and multiple choice letter indices (A, B, C, D, etc.)",
+  "test_cases": [
+    {
+      "name": "Gpqa Instruct Pass At K - Basic Single Sample Correct",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the capital of France?\nA. London\nB. Paris\nC. Berlin\nD. Madrid",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["B"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case with single correct sample"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Multiple Samples All Correct",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 2,
+        "n": 3,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the largest planet in our solar system?\nA. Earth\nB. Jupiter\nC. Saturn\nD. Mars",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "astronomy"
+      },
+      "model_response": {
+        "text": ["B", "B", "B"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with multiple samples all correct"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Mixed Correct and Incorrect",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "Who wrote Romeo and Juliet?\nA. Charles Dickens\nB. William Shakespeare\nC. Jane Austen\nD. Mark Twain",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "text": ["B", "A", "B", "C"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 0.8333333333333333
+      },
+      "tolerance": 0.01,
+      "description": "Test case with mixed correct and incorrect samples"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Case Sensitivity",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the chemical symbol for gold?\nA. Ag\nB. Au\nC. Fe\nD. Cu",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "chemistry"
+      },
+      "model_response": {
+        "text": ["B", "b"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 0.5
+      },
+      "tolerance": 0.01,
+      "description": "Test case with case sensitivity (strip_strings should handle this)"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - All Incorrect Samples",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 3,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What year did World War II end?\nA. 1943\nB. 1944\nC. 1945\nD. 1946",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "history"
+      },
+      "model_response": {
+        "text": ["A", "B", "D"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with all incorrect samples"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - High K Value",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 5,
+        "n": 8,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the speed of light in vacuum?\nA. 299,792,458 m/s\nB. 300,000 km/s\nC. 186,282 miles/s\nD. 3x10^8 m/s",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 0,
+        "task_name": "physics"
+      },
+      "model_response": {
+        "text": ["A", "B", "A", "C", "A", "D", "A", "B"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with high k value and multiple correct samples"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Parentheses Format",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the main theme of George Orwell's 1984?\nA. Love and romance\nB. Totalitarianism and surveillance\nC. War and peace\nD. Economic inequality",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 1,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "text": ["(B)", "B"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with parentheses format"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Reasoning with Answer",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "How many sides does a hexagon have?\nA. 4\nB. 5\nC. 6\nD. 7",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "geometry"
+      },
+      "model_response": {
+        "text": ["A hexagon has 6 sides, so the answer is C", "C"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with reasoning and answer extraction"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Final Answer Format",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 2,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "What is the largest ocean on Earth?\nA. Atlantic Ocean\nB. Indian Ocean\nC. Pacific Ocean\nD. Arctic Ocean",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 2,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["The largest ocean is the Pacific Ocean. Final answer is C", "C"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test case with 'final answer' format"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Edge Case Single Choice",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 1,
+        "n": 1,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "Is the Earth round?\nA. Yes",
+        "choices": ["A"],
+        "gold_index": 0,
+        "task_name": "science"
+      },
+      "model_response": {
+        "text": ["A"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Edge case with single choice"
+    },
+    {
+      "name": "Gpqa Instruct Pass At K - Multiple Correct Answers",
+      "metric_class": "gpqa_instruct_pass_at_k",
+      "metric_params": {
+        "k": 2,
+        "n": 4,
+        "strip_strings": true
+      },
+      "doc": {
+        "query": "Which of the following are primary colors?\nA. Red\nB. Blue\nC. Green\nD. Yellow",
+        "choices": ["A", "B", "C", "D"],
+        "gold_index": 0,
+        "task_name": "art"
+      },
+      "model_response": {
+        "text": ["A", "B", "A", "C"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "gpqa_pass@k_with_k&n&strip_strings": 0.8333333333333333
+      },
+      "tolerance": 0.01,
+      "description": "Test case with multiple correct answers (first correct answer)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/loglikelihood_acc.json b/tests/unit/metrics/test_cases/loglikelihood_acc.json
new file mode 100644
index 000000000..c877566e0
--- /dev/null
+++ b/tests/unit/metrics/test_cases/loglikelihood_acc.json
@@ -0,0 +1,266 @@
+{
+  "name": "Loglikelihood Accuracy Test Suite",
+  "description": "Comprehensive test cases for loglikelihood accuracy metric covering various scenarios including different logprob distributions, correct/incorrect predictions, and edge cases",
+  "test_cases": [
+    {
+      "name": "Loglikelihood Accuracy - Correct Choice",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [0.1, 0.8, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with correct choice having highest logprob"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Incorrect Choice",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the largest planet in our solar system?",
+        "choices": ["Earth", "Jupiter", "Saturn"],
+        "gold_index": 1,
+        "task_name": "astronomy"
+      },
+      "model_response": {
+        "logprobs": [0.1, 0.3, 0.6],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with incorrect choice having highest logprob"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Close Probabilities",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "Who wrote Romeo and Juliet?",
+        "choices": ["Charles Dickens", "William Shakespeare", "Jane Austen"],
+        "gold_index": 1,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "logprobs": [0.2, 0.35, 0.45],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with close probabilities but wrong choice highest"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Very Confident Correct",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the chemical symbol for gold?",
+        "choices": ["Ag", "Au", "Fe"],
+        "gold_index": 1,
+        "task_name": "chemistry"
+      },
+      "model_response": {
+        "logprobs": [0.01, 0.98, 0.01],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with very confident correct prediction"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Very Confident Incorrect",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What year did World War II end?",
+        "choices": ["1943", "1944", "1945"],
+        "gold_index": 2,
+        "task_name": "history"
+      },
+      "model_response": {
+        "logprobs": [0.95, 0.03, 0.02],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with very confident incorrect prediction"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Equal Probabilities",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the speed of light?",
+        "choices": ["299,792,458 m/s", "300,000 km/s", "186,282 miles/s"],
+        "gold_index": 0,
+        "task_name": "physics"
+      },
+      "model_response": {
+        "logprobs": [0.33, 0.33, 0.34],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with nearly equal probabilities"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Negative Logprobs",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "How many sides does a hexagon have?",
+        "choices": ["4", "5", "6"],
+        "gold_index": 2,
+        "task_name": "geometry"
+      },
+      "model_response": {
+        "logprobs": [-2.0, -1.5, -0.5],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with negative logprobs (correct choice highest)"
+    },
+    {
+      "name": "Loglikelihood Accuracy - All Negative Logprobs",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the main theme of 1984?",
+        "choices": ["Love", "Totalitarianism", "War"],
+        "gold_index": 1,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "logprobs": [-5.0, -2.0, -4.0],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with all negative logprobs (correct choice highest)"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Single Choice",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "Is the Earth round?",
+        "choices": ["Yes"],
+        "gold_index": 0,
+        "task_name": "science"
+      },
+      "model_response": {
+        "logprobs": [0.9],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with single choice (trivial case)"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Multiple Gold Indices",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "Which are primary colors?",
+        "choices": ["Red", "Blue", "Green", "Yellow"],
+        "gold_index": [0, 1],
+        "task_name": "art"
+      },
+      "model_response": {
+        "logprobs": [0.4, 0.3, 0.2, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with multiple correct answers (first correct answer highest)"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Multiple Gold Indices Wrong",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "Which are even numbers?",
+        "choices": ["2", "3", "4", "5"],
+        "gold_index": [0, 2],
+        "task_name": "math"
+      },
+      "model_response": {
+        "logprobs": [0.2, 0.5, 0.2, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with multiple correct answers but wrong choice highest"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Zero Probabilities",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of Japan?",
+        "choices": ["Tokyo", "Kyoto", "Osaka"],
+        "gold_index": 0,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [0.0, 0.0, 0.0],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with zero probabilities (first choice wins by default)"
+    },
+    {
+      "name": "Loglikelihood Accuracy - Very Small Differences",
+      "metric_class": "loglikelihood_acc",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the largest ocean?",
+        "choices": ["Atlantic", "Pacific", "Indian"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [0.333, 0.334, 0.333],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "acc": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood accuracy with very small differences in probabilities"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/loglikelihood_f1.json b/tests/unit/metrics/test_cases/loglikelihood_f1.json
new file mode 100644
index 000000000..81a0f26cd
--- /dev/null
+++ b/tests/unit/metrics/test_cases/loglikelihood_f1.json
@@ -0,0 +1,286 @@
+{
+  "name": "Loglikelihood F1 Test Suite",
+  "description": "Comprehensive test cases for loglikelihood_f1 metric covering various scenarios including different logprob distributions, correct/incorrect predictions, and edge cases. This is a corpus-level F1 score metric.",
+  "test_cases": [
+    {
+      "name": "Loglikelihood F1 - Perfect Predictions",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [0.1, 0.8, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with perfect predictions across corpus"
+    },
+    {
+      "name": "Loglikelihood F1 - All Incorrect Predictions",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the largest planet in our solar system?",
+        "choices": ["Earth", "Jupiter", "Saturn"],
+        "gold_index": 1,
+        "task_name": "astronomy"
+      },
+      "model_response": {
+        "logprobs": [0.1, 0.3, 0.6],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with all incorrect predictions across corpus"
+    },
+    {
+      "name": "Loglikelihood F1 - Mixed Predictions",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "Who wrote Romeo and Juliet?",
+        "choices": ["Charles Dickens", "William Shakespeare", "Jane Austen"],
+        "gold_index": 1,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "logprobs": [0.2, 0.35, 0.45],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with mixed predictions (some correct, some incorrect)"
+    },
+    {
+      "name": "Loglikelihood F1 - Very Confident Correct",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the chemical symbol for gold?",
+        "choices": ["Ag", "Au", "Fe"],
+        "gold_index": 1,
+        "task_name": "chemistry"
+      },
+      "model_response": {
+        "logprobs": [0.01, 0.98, 0.01],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with very confident correct prediction"
+    },
+    {
+      "name": "Loglikelihood F1 - Very Confident Incorrect",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What year did World War II end?",
+        "choices": ["1943", "1944", "1945"],
+        "gold_index": 2,
+        "task_name": "history"
+      },
+      "model_response": {
+        "logprobs": [0.95, 0.03, 0.02],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with very confident incorrect prediction"
+    },
+    {
+      "name": "Loglikelihood F1 - Close Probabilities",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the speed of light?",
+        "choices": ["299,792,458 m/s", "300,000 km/s", "186,282 miles/s"],
+        "gold_index": 0,
+        "task_name": "physics"
+      },
+      "model_response": {
+        "logprobs": [0.33, 0.33, 0.34],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with close probabilities but wrong choice highest"
+    },
+    {
+      "name": "Loglikelihood F1 - Negative Logprobs",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "How many sides does a hexagon have?",
+        "choices": ["4", "5", "6"],
+        "gold_index": 2,
+        "task_name": "geometry"
+      },
+      "model_response": {
+        "logprobs": [-2.0, -1.5, -0.5],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with negative logprobs (correct choice highest)"
+    },
+    {
+      "name": "Loglikelihood F1 - All Negative Logprobs",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the main theme of 1984?",
+        "choices": ["Love", "Totalitarianism", "War"],
+        "gold_index": 1,
+        "task_name": "literature"
+      },
+      "model_response": {
+        "logprobs": [-5.0, -2.0, -4.0],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with all negative logprobs (correct choice highest)"
+    },
+    {
+      "name": "Loglikelihood F1 - Single Choice",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "Is the Earth round?",
+        "choices": ["Yes"],
+        "gold_index": 0,
+        "task_name": "science"
+      },
+      "model_response": {
+        "logprobs": [0.9],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with single choice (trivial case)"
+    },
+    {
+      "name": "Loglikelihood F1 - Multiple Gold Indices",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "Which are primary colors?",
+        "choices": ["Red", "Blue", "Green", "Yellow"],
+        "gold_index": [0, 1],
+        "task_name": "art"
+      },
+      "model_response": {
+        "logprobs": [0.4, 0.3, 0.2, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with multiple correct answers (first correct answer highest)"
+    },
+    {
+      "name": "Loglikelihood F1 - Multiple Gold Indices Wrong",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "Which are even numbers?",
+        "choices": ["2", "3", "4", "5"],
+        "gold_index": [0, 2],
+        "task_name": "math"
+      },
+      "model_response": {
+        "logprobs": [0.2, 0.5, 0.2, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with multiple correct answers but wrong choice highest"
+    },
+    {
+      "name": "Loglikelihood F1 - Zero Probabilities",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of Japan?",
+        "choices": ["Tokyo", "Kyoto", "Osaka"],
+        "gold_index": 0,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [0.0, 0.0, 0.0],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with zero probabilities (first choice wins by default)"
+    },
+    {
+      "name": "Loglikelihood F1 - Very Small Differences",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the largest ocean?",
+        "choices": ["Atlantic", "Pacific", "Indian"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [0.333, 0.334, 0.333],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with very small differences in probabilities"
+    },
+    {
+      "name": "Loglikelihood F1 - Balanced Predictions",
+      "metric_class": "loglikelihood_f1",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the square root of 16?",
+        "choices": ["2", "4", "8"],
+        "gold_index": 1,
+        "task_name": "math"
+      },
+      "model_response": {
+        "logprobs": [0.25, 0.5, 0.25],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "loglikelihood_f1": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test loglikelihood F1 with balanced predictions (correct choice has highest probability)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/maj_at_k.json b/tests/unit/metrics/test_cases/maj_at_k.json
new file mode 100644
index 000000000..aa83871b2
--- /dev/null
+++ b/tests/unit/metrics/test_cases/maj_at_k.json
@@ -0,0 +1,82 @@
+{
+  "name": "Maj At K Test Suite",
+  "description": "Test cases for maj_at_k metric",
+  "test_cases": [
+    {
+      "name": "Maj at K - Majority Correct",
+      "metric_class": "maj_at_k",
+      "metric_params": {"k": 3},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": [1],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "Paris", "London"]
+      },
+      "expected_output": {
+        "maj@k_with_k": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test maj at k with majority correct"
+    },
+    {
+      "name": "Maj at K - No Majority",
+      "metric_class": "maj_at_k",
+      "metric_params": {"k": 3},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": [1],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "London", "Berlin"]
+      },
+      "expected_output": {
+        "maj@k_with_k": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test maj at k with no majority"
+    },
+    {
+      "name": "Maj at K - All Correct",
+      "metric_class": "maj_at_k",
+      "metric_params": {"k": 3},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": [1],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "Paris", "Paris"]
+      },
+      "expected_output": {
+        "maj@k_with_k": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test maj at k with all correct"
+    },
+    {
+      "name": "Maj at K - Wrong Answer",
+      "metric_class": "maj_at_k",
+      "metric_params": {"k": 3},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": [1],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London", "London", "London"]
+      },
+      "expected_output": {
+        "maj@k_with_k": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test maj at k with wrong answer"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/mcc.json b/tests/unit/metrics/test_cases/mcc.json
new file mode 100644
index 000000000..b0cbaa219
--- /dev/null
+++ b/tests/unit/metrics/test_cases/mcc.json
@@ -0,0 +1,47 @@
+{
+  "name": "MCC Test Suite",
+  "description": "Test cases for MCC (Matthews Correlation Coefficient) metric",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "MCC - Corpus Level Test with 3 Samples",
+      "metric_class": "mcc",
+      "metric_name": "mcc",
+      "metric_params": {},
+      "docs": [
+        {
+          "query": "What is the capital of France?",
+          "choices": ["Paris", "London", "Berlin"],
+          "gold_index": 0,
+          "task_name": "geography"
+        },
+        {
+          "query": "What is 2 + 2?",
+          "choices": ["3", "4", "5"],
+          "gold_index": 1,
+          "task_name": "math"
+        },
+        {
+          "query": "What color is the sky?",
+          "choices": ["Red", "Blue", "Green"],
+          "gold_index": 1,
+          "task_name": "science"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-0.2, -0.8, -1.5]
+        },
+        {
+          "logprobs": [-1.2, -0.3, -0.9]
+        },
+        {
+          "logprobs": [-0.7, -0.4, -1.1]
+        }
+      ],
+      "expected_output": 1.0,
+      "tolerance": 0.01,
+      "description": "Corpus level test case for MCC metric with 3 samples - all predictions correct"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/mrr.json b/tests/unit/metrics/test_cases/mrr.json
new file mode 100644
index 000000000..0fe43dca4
--- /dev/null
+++ b/tests/unit/metrics/test_cases/mrr.json
@@ -0,0 +1,90 @@
+{
+  "name": "Mrr Test Suite",
+  "description": "Test cases for mrr metric",
+  "test_cases": [
+    {
+      "name": "MRR - Correct First",
+      "metric_class": "mrr",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris"],
+        "logprobs": [0.1, 0.8, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "mrr": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test MRR with correct choice ranked first"
+    },
+    {
+      "name": "MRR - Correct Second",
+      "metric_class": "mrr",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London"],
+        "logprobs": [0.8, 0.1, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "mrr": 0.5
+      },
+      "tolerance": 0.01,
+      "description": "Test MRR with correct choice ranked second"
+    },
+    {
+      "name": "MRR - Correct Third",
+      "metric_class": "mrr",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Berlin", "Paris"],
+        "gold_index": 2,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London"],
+        "logprobs": [0.8, 0.15, 0.05],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "mrr": 0.3333333333333333
+      },
+      "tolerance": 0.01,
+      "description": "Test MRR with correct choice ranked third"
+    },
+    {
+      "name": "MRR - Multiple Gold Indices",
+      "metric_class": "mrr",
+      "metric_params": {},
+      "doc": {
+        "query": "Which are European capitals?",
+        "choices": ["London", "Paris", "Tokyo", "Berlin"],
+        "gold_index": [0, 1, 3],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris"],
+        "logprobs": [0.2, 0.6, 0.1, 0.1],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "mrr": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test MRR with multiple gold indices"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/multi_f1_numeric.json b/tests/unit/metrics/test_cases/multi_f1_numeric.json
new file mode 100644
index 000000000..ccc0ac536
--- /dev/null
+++ b/tests/unit/metrics/test_cases/multi_f1_numeric.json
@@ -0,0 +1,167 @@
+{
+  "name": "Multi F1 Numeric Test Suite",
+  "description": "Test cases for multi_f1_numeric metric (corpus-level multi-class F1 score with 3 classes)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "Multi F1 Numeric - Perfect Predictions",
+      "metric_class": "multi_f1_numeric",
+      "metric_params": {},
+      "metric_name": "mf1",
+      "docs": [
+        {
+          "query": "Classify the sentiment: I love this movie!",
+          "choices": ["negative", "neutral", "positive"],
+          "gold_index": 2,
+          "task_name": "sentiment_classification"
+        },
+        {
+          "query": "Classify the topic: 2 + 2 = 4",
+          "choices": ["history", "science", "math"],
+          "gold_index": 2,
+          "task_name": "topic_classification"
+        },
+        {
+          "query": "Classify the emotion: I am so happy today!",
+          "choices": ["sad", "angry", "happy"],
+          "gold_index": 2,
+          "task_name": "emotion_classification"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-2.0, -1.5, -0.1]
+        },
+        {
+          "logprobs": [-1.8, -2.1, -0.2]
+        },
+        {
+          "logprobs": [-2.2, -1.9, -0.1]
+        }
+      ],
+      "expected_output": 1.0,
+      "tolerance": 0.01,
+      "description": "Perfect predictions - all classes correctly predicted (F1 = 1.0 for each class)"
+    },
+    {
+      "name": "Multi F1 Numeric - Balanced Performance",
+      "metric_class": "multi_f1_numeric",
+      "metric_params": {},
+      "metric_name": "mf1",
+      "docs": [
+        {
+          "query": "Classify the sentiment: The weather is okay",
+          "choices": ["negative", "neutral", "positive"],
+          "gold_index": 1,
+          "task_name": "sentiment_classification"
+        },
+        {
+          "query": "Classify the topic: The French Revolution",
+          "choices": ["history", "science", "math"],
+          "gold_index": 0,
+          "task_name": "topic_classification"
+        },
+        {
+          "query": "Classify the emotion: I feel nothing special",
+          "choices": ["sad", "angry", "happy"],
+          "gold_index": 0,
+          "task_name": "emotion_classification"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-1.0, -0.2, -1.5]
+        },
+        {
+          "logprobs": [-0.1, -1.8, -2.0]
+        },
+        {
+          "logprobs": [-0.2, -1.5, -1.8]
+        }
+      ],
+      "expected_output": 1.0,
+      "tolerance": 0.01,
+      "description": "Balanced performance - 2 correct, 1 incorrect (F1 varies by class)"
+    },
+    {
+      "name": "Multi F1 Numeric - Poor Performance",
+      "metric_class": "multi_f1_numeric",
+      "metric_params": {},
+      "metric_name": "mf1",
+      "docs": [
+        {
+          "query": "Classify the sentiment: This is terrible",
+          "choices": ["negative", "neutral", "positive"],
+          "gold_index": 0,
+          "task_name": "sentiment_classification"
+        },
+        {
+          "query": "Classify the topic: Photosynthesis",
+          "choices": ["history", "science", "math"],
+          "gold_index": 1,
+          "task_name": "topic_classification"
+        },
+        {
+          "query": "Classify the emotion: I am furious",
+          "choices": ["sad", "angry", "happy"],
+          "gold_index": 1,
+          "task_name": "emotion_classification"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-1.5, -0.1, -0.8]
+        },
+        {
+          "logprobs": [-0.2, -1.8, -0.3]
+        },
+        {
+          "logprobs": [-0.1, -1.9, -0.2]
+        }
+      ],
+      "expected_output": 0.33,
+      "tolerance": 0.01,
+      "description": "Poor performance - 1 correct, 2 incorrect (low F1 across classes)"
+    },
+    {
+      "name": "Multi F1 Numeric - Random Performance",
+      "metric_class": "multi_f1_numeric",
+      "metric_params": {},
+      "metric_name": "mf1",
+      "docs": [
+        {
+          "query": "Classify the sentiment: I don't know",
+          "choices": ["negative", "neutral", "positive"],
+          "gold_index": 1,
+          "task_name": "sentiment_classification"
+        },
+        {
+          "query": "Classify the topic: Calculus",
+          "choices": ["history", "science", "math"],
+          "gold_index": 2,
+          "task_name": "topic_classification"
+        },
+        {
+          "query": "Classify the emotion: I am confused",
+          "choices": ["sad", "angry", "happy"],
+          "gold_index": 0,
+          "task_name": "emotion_classification"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-0.5, -0.5, -0.5]
+        },
+        {
+          "logprobs": [-0.5, -0.5, -0.5]
+        },
+        {
+          "logprobs": [-0.5, -0.5, -0.5]
+        }
+      ],
+      "expected_output": 0.55,
+      "tolerance": 0.1,
+      "description": "Random performance - equal logprobs lead to random predictions (F1 ≈ 0.0)"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/pass_at_k.json b/tests/unit/metrics/test_cases/pass_at_k.json
new file mode 100644
index 000000000..1e552cb96
--- /dev/null
+++ b/tests/unit/metrics/test_cases/pass_at_k.json
@@ -0,0 +1,69 @@
+{
+  "name": "Pass At K Test Suite",
+  "description": "Test cases for pass_at_k metric",
+  "test_cases": [
+    {
+      "name": "Pass at K - Correct in K",
+      "metric_class": "pass_at_k",
+      "metric_params": {"k": 1, "n": 2},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "London"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.5
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k with correct answer in k"
+    },
+    {
+      "name": "Pass at K - Not in K",
+      "metric_class": "pass_at_k",
+      "metric_params": {"k": 1, "n": 2},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London", "Berlin"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k with correct answer not in k"
+    },
+    {
+      "name": "Pass at K - Multiple Attempts",
+      "metric_class": "pass_at_k",
+      "metric_params": {"k": 2, "n": 3},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London", "Paris", "Berlin"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.66
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k with multiple attempts"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/pass_at_k_letters.json b/tests/unit/metrics/test_cases/pass_at_k_letters.json
new file mode 100644
index 000000000..5156b8e36
--- /dev/null
+++ b/tests/unit/metrics/test_cases/pass_at_k_letters.json
@@ -0,0 +1,69 @@
+{
+  "name": "Pass At K Letters Test Suite",
+  "description": "Test cases for pass_at_k_letters metric",
+  "test_cases": [
+    {
+      "name": "Pass at K Letters - Correct Letters",
+      "metric_class": "pass_at_k_letters",
+      "metric_params": {"k": 1, "n": 2},
+      "doc": {
+        "query": "What letter comes after A?",
+        "choices": ["B"],
+        "gold_index": 0,
+        "task_name": "alphabet"
+      },
+      "model_response": {
+        "text": ["B", "C"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k letters with correct letter answer"
+    },
+    {
+      "name": "Pass at K Letters - Wrong Letters",
+      "metric_class": "pass_at_k_letters",
+      "metric_params": {"k": 1, "n": 2},
+      "doc": {
+        "query": "What letter comes after A?",
+        "choices": ["B"],
+        "gold_index": 0,
+        "task_name": "alphabet"
+      },
+      "model_response": {
+        "text": ["C", "D"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k letters with wrong letter answer"
+    },
+    {
+      "name": "Pass at K Letters - Multiple Attempts",
+      "metric_class": "pass_at_k_letters",
+      "metric_params": {"k": 2, "n": 3},
+      "doc": {
+        "query": "What letter comes after B?",
+        "choices": ["C"],
+        "gold_index": 0,
+        "task_name": "alphabet"
+      },
+      "model_response": {
+        "text": ["D", "C", "E"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k letters with multiple attempts"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/pass_at_k_math.json b/tests/unit/metrics/test_cases/pass_at_k_math.json
new file mode 100644
index 000000000..0ebd6436a
--- /dev/null
+++ b/tests/unit/metrics/test_cases/pass_at_k_math.json
@@ -0,0 +1,63 @@
+{
+  "name": "Pass At K Math Test Suite",
+  "description": "Test cases for pass_at_k_math metric",
+  "test_cases": [
+    {
+      "name": "Pass at K Math - Correct Math",
+      "metric_class": "pass_at_k_math",
+      "metric_params": {"k": 1, "n": 2},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "choices": ["4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["4", "5"]
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.5
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k math with correct math answer"
+    },
+    {
+      "name": "Pass at K Math - Wrong Math",
+      "metric_class": "pass_at_k_math",
+      "metric_params": {"k": 1, "n": 2},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "choices": ["4"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["5", "6"]
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k math with wrong math answer"
+    },
+    {
+      "name": "Pass at K Math - Multiple Attempts",
+      "metric_class": "pass_at_k_math",
+      "metric_params": {"k": 2, "n": 3},
+      "doc": {
+        "query": "What is 3 * 4?",
+        "choices": ["12"],
+        "gold_index": 0,
+        "task_name": "math"
+      },
+      "model_response": {
+        "text": ["10", "12", "15"]
+      },
+      "expected_output": {
+        "pass@k_with_k&n": 0.66
+      },
+      "tolerance": 0.01,
+      "description": "Test pass at k math with multiple attempts"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/prediction_perplexity.json b/tests/unit/metrics/test_cases/prediction_perplexity.json
new file mode 100644
index 000000000..26468edcc
--- /dev/null
+++ b/tests/unit/metrics/test_cases/prediction_perplexity.json
@@ -0,0 +1,47 @@
+{
+  "name": "Prediction Perplexity Test Suite",
+  "description": "Test cases for prediction_perplexity metric",
+  "test_cases": [
+    {
+      "name": "Prediction Perplexity - Basic Test",
+      "metric_class": "prediction_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "Test query for prediction_perplexity",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ],
+        "logprobs": [
+          0.5,
+          0.3,
+          0.2
+        ],
+        "output_tokens": [
+          [
+            1
+          ],
+          [
+            2
+          ],
+          [
+            3
+          ]
+        ]
+      },
+      "expected_output": {
+        "ppl": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for prediction_perplexity metric"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/recall_at_k.json b/tests/unit/metrics/test_cases/recall_at_k.json
new file mode 100644
index 000000000..8259a0ced
--- /dev/null
+++ b/tests/unit/metrics/test_cases/recall_at_k.json
@@ -0,0 +1,69 @@
+{
+  "name": "Recall At K Test Suite",
+  "description": "Test cases for recall_at_k metric",
+  "test_cases": [
+    {
+      "name": "Recall At K - Correct in Top K",
+      "metric_class": "recall_at_k",
+      "metric_params": {"k": 2},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin", "Madrid"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris"],
+        "logprobs": [0.1, 0.8, 0.05, 0.05],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "recall_with_k": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test recall at k with correct choice in top k"
+    },
+    {
+      "name": "Recall At K - Not in Top K",
+      "metric_class": "recall_at_k",
+      "metric_params": {"k": 1},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["London", "Paris", "Berlin", "Madrid"],
+        "gold_index": 1,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["London"],
+        "logprobs": [0.8, 0.1, 0.05, 0.05],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "recall_with_k": 0
+      },
+      "tolerance": 0.01,
+      "description": "Test recall at k with correct choice not in top k"
+    },
+    {
+      "name": "Recall At K - Multiple Gold Indices",
+      "metric_class": "recall_at_k",
+      "metric_params": {"k": 2},
+      "doc": {
+        "query": "Which are European capitals?",
+        "choices": ["London", "Paris", "Tokyo", "Berlin"],
+        "gold_index": [0, 1, 3],
+        "task_name": "geography"
+      },
+      "model_response": {
+        "text": ["Paris", "London"],
+        "logprobs": [0.3, 0.4, 0.1, 0.2],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "recall_with_k": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test recall at k with multiple gold indices"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/rouge1.json b/tests/unit/metrics/test_cases/rouge1.json
new file mode 100644
index 000000000..f937a4de5
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rouge1.json
@@ -0,0 +1,28 @@
+{
+  "name": "ROUGE1 Test Suite",
+  "description": "Test cases for ROUGE1 metric",
+  "test_cases": [
+    {
+      "name": "ROUGE Score",
+      "metric_class": "rouge1",
+      "metric_params": {
+      },
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge1": 1
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGE score with perfect match"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/rouge2.json b/tests/unit/metrics/test_cases/rouge2.json
new file mode 100644
index 000000000..f18e1ca3a
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rouge2.json
@@ -0,0 +1,69 @@
+{
+  "name": "Rouge2 Test Suite",
+  "description": "Test cases for rouge2 metric",
+  "test_cases": [
+    {
+      "name": "ROUGE2 - Perfect Match",
+      "metric_class": "rouge2",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge2": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGE2 with perfect match"
+    },
+    {
+      "name": "ROUGE2 - Partial Match",
+      "metric_class": "rouge2",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge2": 0.5454
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGE2 with partial match (no bigram overlap)"
+    },
+    {
+      "name": "ROUGE2 - Some Bigram Overlap",
+      "metric_class": "rouge2",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge2": 0.666
+      },
+      "tolerance": 0.1,
+      "description": "Test ROUGE2 with some bigram overlap"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/rougeL.json b/tests/unit/metrics/test_cases/rougeL.json
new file mode 100644
index 000000000..81635aa05
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rougeL.json
@@ -0,0 +1,69 @@
+{
+  "name": "Rougel Test Suite",
+  "description": "Test cases for rougeL metric",
+  "test_cases": [
+    {
+      "name": "ROUGEL - Perfect Match",
+      "metric_class": "rougeL",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rougeL": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGEL with perfect match"
+    },
+    {
+      "name": "ROUGEL - Partial Match",
+      "metric_class": "rougeL",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rougeL": 0.615
+      },
+      "tolerance": 0.1,
+      "description": "Test ROUGEL with partial match"
+    },
+    {
+      "name": "ROUGEL - Different Word Order",
+      "metric_class": "rougeL",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The brown quick fox jumps over the dog lazy"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rougeL": 0.8
+      },
+      "tolerance": 0.1,
+      "description": "Test ROUGEL with different word order"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/rougeLsum.json b/tests/unit/metrics/test_cases/rougeLsum.json
new file mode 100644
index 000000000..8a5faf3a3
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rougeLsum.json
@@ -0,0 +1,69 @@
+{
+  "name": "Rougelsum Test Suite",
+  "description": "Test cases for rougeLsum metric",
+  "test_cases": [
+    {
+      "name": "ROUGELsum - Perfect Match",
+      "metric_class": "rougeLsum",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rougeLsum": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGELsum with perfect match"
+    },
+    {
+      "name": "ROUGELsum - Partial Match",
+      "metric_class": "rougeLsum",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rougeLsum": 0.61
+      },
+      "tolerance": 0.1,
+      "description": "Test ROUGELsum with partial match"
+    },
+    {
+      "name": "ROUGELsum - Multi-sentence",
+      "metric_class": "rougeLsum",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog. The fox is very fast."],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog. The fox is very fast."],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rougeLsum": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGELsum with multi-sentence text"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/rouge_t5.json b/tests/unit/metrics/test_cases/rouge_t5.json
new file mode 100644
index 000000000..df2f81777
--- /dev/null
+++ b/tests/unit/metrics/test_cases/rouge_t5.json
@@ -0,0 +1,78 @@
+{
+  "name": "Rouge T5 Test Suite",
+  "description": "Test cases for rouge_t5 metric",
+  "test_cases": [
+    {
+      "name": "ROUGE T5 - Perfect Match",
+      "metric_class": "rouge_t5",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox jumps over the lazy dog"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge1": 100.0,
+        "rouge2": 100.0,
+        "rougeL": 100.0,
+        "rougeLsum": 100.0
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGE T5 with perfect match"
+    },
+    {
+      "name": "ROUGE T5 - Partial Match",
+      "metric_class": "rouge_t5",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["The quick brown fox"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge1": 61.53846153846153,
+        "rouge2": 54.54545454545454,
+        "rougeL": 61.53846153846153,
+        "rougeLsum": 61.53846153846153
+      },
+      "tolerance": 0.1,
+      "description": "Test ROUGE T5 with partial match"
+    },
+    {
+      "name": "ROUGE T5 - Different Content",
+      "metric_class": "rouge_t5",
+      "metric_params": {},
+      "doc": {
+        "query": "Summarize the text",
+        "choices": ["The quick brown fox jumps over the lazy dog"],
+        "gold_index": 0,
+        "task_name": "summarization"
+      },
+      "model_response": {
+        "text": ["A cat sleeps on the mat"],
+        "logprobs": [],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "rouge1": 13.333333333333334,
+        "rouge2": 0.0,
+        "rougeL": 13.333333333333334,
+        "rougeLsum": 13.333333333333334
+      },
+      "tolerance": 0.01,
+      "description": "Test ROUGE T5 with completely different content"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/simpleqa_judge.json b/tests/unit/metrics/test_cases/simpleqa_judge.json
new file mode 100644
index 000000000..485bf4b3d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/simpleqa_judge.json
@@ -0,0 +1,31 @@
+{
+  "name": "Simpleqa Judge Test Suite",
+  "description": "Test cases for simpleqa_judge metric",
+  "test_cases": [
+    {
+      "name": "Simpleqa Judge - Basic Test",
+      "metric_class": "simpleqa_judge",
+      "metric_params": {},
+      "doc": {
+        "query": "Test query for simpleqa_judge",
+        "choices": [
+          "Test choice 1",
+          "Test choice 2",
+          "Test choice 3"
+        ],
+        "gold_index": 0,
+        "task_name": "test"
+      },
+      "model_response": {
+        "text": [
+          "Test choice 1"
+        ]
+      },
+      "expected_output": {
+        "simpleqa_judge": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Basic test case for simpleqa_judge metric"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/target_perplexity.json b/tests/unit/metrics/test_cases/target_perplexity.json
new file mode 100644
index 000000000..5654613c2
--- /dev/null
+++ b/tests/unit/metrics/test_cases/target_perplexity.json
@@ -0,0 +1,101 @@
+{
+  "name": "Target Perplexity Test Suite",
+  "description": "Test cases for target_perplexity metric (sample-level perplexity of target text)",
+  "test_cases": [
+    {
+      "name": "Target Perplexity - Low Perplexity",
+      "metric_class": "target_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the capital of France?",
+        "choices": ["Paris", "London", "Berlin"],
+        "gold_index": 0,
+        "task_name": "geography"
+      },
+      "model_response": {
+        "logprobs": [-0.1, -0.2, -0.3]
+      },
+      "expected_output": {
+        "ppl": 1.5
+      },
+      "tolerance": 0.01,
+      "description": "Low perplexity - model has high confidence in target text"
+    },
+    {
+      "name": "Target Perplexity - Moderate Perplexity",
+      "metric_class": "target_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "What is 2 + 2?",
+        "choices": ["3", "4", "5"],
+        "gold_index": 1,
+        "task_name": "math"
+      },
+      "model_response": {
+        "logprobs": [-0.8, -0.3, -1.2]
+      },
+      "expected_output": {
+        "ppl": 2.0
+      },
+      "tolerance": 0.01,
+      "description": "Moderate perplexity - model has moderate confidence in target text"
+    },
+    {
+      "name": "Target Perplexity - High Perplexity",
+      "metric_class": "target_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "What color is the sky?",
+        "choices": ["Red", "Blue", "Green"],
+        "gold_index": 1,
+        "task_name": "science"
+      },
+      "model_response": {
+        "logprobs": [-1.5, -0.1, -1.8]
+      },
+      "expected_output": {
+        "ppl": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "High perplexity - model has low confidence in target text"
+    },
+    {
+      "name": "Target Perplexity - Very High Perplexity",
+      "metric_class": "target_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the largest planet?",
+        "choices": ["Mars", "Jupiter", "Saturn"],
+        "gold_index": 1,
+        "task_name": "astronomy"
+      },
+      "model_response": {
+        "logprobs": [-2.1, -0.2, -2.5]
+      },
+      "expected_output": {
+        "ppl": 8.2
+      },
+      "tolerance": 0.8,
+      "description": "Very high perplexity - model has very low confidence in target text"
+    },
+    {
+      "name": "Target Perplexity - Mixed Confidence",
+      "metric_class": "target_perplexity",
+      "metric_params": {},
+      "doc": {
+        "query": "What is the weather like?",
+        "choices": ["Sunny", "Rainy", "Cloudy"],
+        "gold_index": 0,
+        "task_name": "weather"
+      },
+      "model_response": {
+        "logprobs": [-0.2, -1.8, -1.5]
+      },
+      "expected_output": {
+        "ppl": 1.2
+      },
+      "tolerance": 0.2,
+      "description": "Mixed confidence - high confidence in correct choice, low in others"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/ter.json b/tests/unit/metrics/test_cases/ter.json
new file mode 100644
index 000000000..39b671b0f
--- /dev/null
+++ b/tests/unit/metrics/test_cases/ter.json
@@ -0,0 +1,167 @@
+{
+  "name": "TER Test Suite",
+  "description": "Test cases for ter metric (Translation Edit Rate - corpus-level)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "TER - Perfect Translations",
+      "metric_class": "ter",
+      "metric_params": {},
+      "metric_name": "ter",
+      "docs": [
+        {
+          "query": "Translate to French: Hello world",
+          "choices": ["Bonjour le monde"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Good morning",
+          "choices": ["Buenos días"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Thank you",
+          "choices": ["Danke schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Bonjour le monde"]
+        },
+        {
+          "text": ["Buenos días"]
+        },
+        {
+          "text": ["Danke schön"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.01,
+      "description": "Perfect translations - no edits needed (TER = 0.0)"
+    },
+    {
+      "name": "TER - Minor Edits",
+      "metric_class": "ter",
+      "metric_params": {},
+      "metric_name": "ter",
+      "docs": [
+        {
+          "query": "Translate to French: The cat is sleeping",
+          "choices": ["Le chat dort"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: I like pizza",
+          "choices": ["Me gusta la pizza"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The weather is nice",
+          "choices": ["Das Wetter ist schön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le chat dort"]
+        },
+        {
+          "text": ["Me gusta pizza"]
+        },
+        {
+          "text": ["Das Wetter ist schön"]
+        }
+      ],
+      "expected_output": 0.0,
+      "tolerance": 0.05,
+      "description": "Minor edits - small word differences"
+    },
+    {
+      "name": "TER - Major Edits",
+      "metric_class": "ter",
+      "metric_params": {},
+      "metric_name": "ter",
+      "docs": [
+        {
+          "query": "Translate to French: The quick brown fox jumps over the lazy dog",
+          "choices": ["Le renard brun rapide saute par-dessus le chien paresseux"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: Artificial intelligence is transforming the world",
+          "choices": ["La inteligencia artificial está transformando el mundo"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: Machine learning algorithms are becoming more sophisticated",
+          "choices": ["Maschinelle Lernalgorithmen werden immer ausgefeilter"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Le renard saute le chien"]
+        },
+        {
+          "text": ["La IA cambia el mundo"]
+        },
+        {
+          "text": ["ML Algorithmen werden besser"]
+        }
+      ],
+      "expected_output": 57.14285714285714,
+      "tolerance": 0.01,
+      "description": "Major edits - significant word omissions and changes"
+    },
+    {
+      "name": "TER - Completely Different",
+      "metric_class": "ter",
+      "metric_params": {},
+      "metric_name": "ter",
+      "docs": [
+        {
+          "query": "Translate to French: The sun is bright today",
+          "choices": ["Le soleil est brillant aujourd'hui"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to Spanish: The moon is full tonight",
+          "choices": ["La luna está llena esta noche"],
+          "gold_index": 0,
+          "task_name": "translation"
+        },
+        {
+          "query": "Translate to German: The stars are beautiful",
+          "choices": ["Die Sterne sind wunderschön"],
+          "gold_index": 0,
+          "task_name": "translation"
+        }
+      ],
+      "model_responses": [
+        {
+          "text": ["Il pleut beaucoup"]
+        },
+        {
+          "text": ["Hace mucho frío"]
+        },
+        {
+          "text": ["Es ist sehr heiß"]
+        }
+      ],
+      "expected_output": 80.0,
+      "tolerance": 0.1,
+      "description": "Completely different translations - maximum edit distance"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
new file mode 100644
index 000000000..29a009b0d
--- /dev/null
+++ b/tests/unit/metrics/test_cases/truthfulqa_mc_metrics.json
@@ -0,0 +1,81 @@
+{
+  "name": "Truthfulqa Mc Metrics Test Suite",
+  "description": "Test cases for truthfulqa_mc_metrics metric",
+  "test_cases": [
+    {
+      "name": "TruthfulQA MC Metrics - Correct Answer",
+      "metric_class": "truthfulqa_mc_metrics",
+      "metric_params": {},
+      "doc": {
+        "query": "Q: What is the capital of France?\nA:",
+        "choices": [" Paris", " London", " Berlin", " Madrid", " Rome"],
+        "gold_index": [0, 3, 4],
+        "specific": {
+          "len_mc1": 3
+        },
+        "task_name": "truthfulqa"
+      },
+      "model_response": {
+        "text": [" Paris"],
+        "logprobs": [0.8, 0.1, 0.1, 0.0, 0.0],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "truthfulqa_mc1": 1.0,
+        "truthfulqa_mc2": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test TruthfulQA MC metrics with correct answer"
+    },
+    {
+      "name": "TruthfulQA MC Metrics - Wrong Answer",
+      "metric_class": "truthfulqa_mc_metrics",
+      "metric_params": {},
+      "doc": {
+        "query": "Q: What is the capital of France?\nA:",
+        "choices": [" Paris", " London", " Berlin", " Madrid", " Rome"],
+        "gold_index": [0, 3, 4],
+        "specific": {
+          "len_mc1": 3
+        },
+        "task_name": "truthfulqa"
+      },
+      "model_response": {
+        "text": [" London"],
+        "logprobs": [0.1, 0.8, 0.1, 0.0, 0.0],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "truthfulqa_mc1": false,
+        "truthfulqa_mc2": 1.0
+      },
+      "tolerance": 0.01,
+      "description": "Test TruthfulQA MC metrics with wrong answer"
+    },
+    {
+      "name": "TruthfulQA MC Metrics - Multiple Gold Indices",
+      "metric_class": "truthfulqa_mc_metrics",
+      "metric_params": {},
+      "doc": {
+        "query": "Q: Which are European capitals?\nA:",
+        "choices": [" London", " Paris", " Tokyo", " Berlin", " Madrid", " Rome"],
+        "gold_index": [0, 1, 3, 4, 5],
+        "specific": {
+          "len_mc1": 4
+        },
+        "task_name": "truthfulqa"
+      },
+      "model_response": {
+        "text": [" Paris"],
+        "logprobs": [0.1, 0.6, 0.1, 0.1, 0.05, 0.05],
+        "output_tokens": []
+      },
+      "expected_output": {
+        "truthfulqa_mc1": false,
+        "truthfulqa_mc2": 0.0
+      },
+      "tolerance": 0.01,
+      "description": "Test TruthfulQA MC metrics with multiple gold indices"
+    }
+  ]
+}
diff --git a/tests/unit/metrics/test_cases/word_perplexity.json b/tests/unit/metrics/test_cases/word_perplexity.json
new file mode 100644
index 000000000..4f4640e67
--- /dev/null
+++ b/tests/unit/metrics/test_cases/word_perplexity.json
@@ -0,0 +1,127 @@
+{
+  "name": "Word Perplexity Test Suite",
+  "description": "Test cases for word_perplexity metric (corpus-level weighted perplexity)",
+  "corpus_level": true,
+  "test_cases": [
+    {
+      "name": "Word Perplexity - Low Perplexity",
+      "metric_class": "word_perplexity",
+      "metric_params": {},
+      "metric_name": "word_perplexity",
+      "docs": [
+        {
+          "query": "The quick brown fox",
+          "choices": ["jumps over the lazy dog"],
+          "gold_index": 0,
+          "task_name": "completion"
+        },
+        {
+          "query": "It is a beautiful day",
+          "choices": ["in the neighborhood"],
+          "gold_index": 0,
+          "task_name": "completion"
+        },
+        {
+          "query": "Hello world",
+          "choices": ["how are you"],
+          "gold_index": 0,
+          "task_name": "completion"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-0.1, -0.2, -0.1, -0.3]
+        },
+        {
+          "logprobs": [-0.2, -0.1, -0.2, -0.1]
+        },
+        {
+          "logprobs": [-0.1, -0.1, -0.2]
+        }
+      ],
+      "expected_output": 1.1671273280939887,
+      "tolerance": 0.01,
+      "description": "Low perplexity - model has high confidence in predictions"
+    },
+    {
+      "name": "Word Perplexity - High Perplexity",
+      "metric_class": "word_perplexity",
+      "metric_params": {},
+      "metric_name": "word_perplexity",
+      "docs": [
+        {
+          "query": "The weather is",
+          "choices": ["unpredictable today"],
+          "gold_index": 0,
+          "task_name": "completion"
+        },
+        {
+          "query": "Mathematics is",
+          "choices": ["a complex subject"],
+          "gold_index": 0,
+          "task_name": "completion"
+        },
+        {
+          "query": "Artificial intelligence",
+          "choices": ["continues to evolve"],
+          "gold_index": 0,
+          "task_name": "completion"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-2.0, -1.8, -2.2, -1.9]
+        },
+        {
+          "logprobs": [-2.1, -1.7, -2.3, -1.8]
+        },
+        {
+          "logprobs": [-2.2, -1.9, -2.1, -1.6]
+        }
+      ],
+      "expected_output": 29.120097496837726,
+      "tolerance": 0.01,
+      "description": "High perplexity - model has low confidence in predictions"
+    },
+    {
+      "name": "Word Perplexity - Mixed Confidence",
+      "metric_class": "word_perplexity",
+      "metric_params": {},
+      "metric_name": "word_perplexity",
+      "docs": [
+        {
+          "query": "The sun rises",
+          "choices": ["in the east"],
+          "gold_index": 0,
+          "task_name": "completion"
+        },
+        {
+          "query": "Quantum physics",
+          "choices": ["is very complex"],
+          "gold_index": 0,
+          "task_name": "completion"
+        },
+        {
+          "query": "Birds can",
+          "choices": ["fly in the sky"],
+          "gold_index": 0,
+          "task_name": "completion"
+        }
+      ],
+      "model_responses": [
+        {
+          "logprobs": [-0.3, -0.2]
+        },
+        {
+          "logprobs": [-1.8, -1.9, -1.7]
+        },
+        {
+          "logprobs": [-0.4, -0.3, -0.2, -0.3]
+        }
+      ],
+      "expected_output": 2.7573931272726773,
+      "tolerance": 0.01,
+      "description": "Mixed confidence - combination of high and low confidence predictions"
+    }
+  ]
+}

From f903ee0b3d5184778ffd223c67d8ec828247a2bf Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Mon, 15 Sep 2025 13:08:26 +0000
Subject: [PATCH 21/26] use SKIPPED_METRIC list instead of hardcoding all
 metric names

---
 tests/unit/metrics/test_metrics_automated.py | 59 +++-----------------
 1 file changed, 8 insertions(+), 51 deletions(-)

diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index c705e672a..87b61a9b4 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -82,60 +82,17 @@ class MetricTestSuite(BaseModel):
     description: str | None = None
 
 
+SKIPPED_METRICS = [
+    "faithfulness",  # Need GPU to run
+    "bert_score",  # Issue with the scoring function, int too big to convert
+    "simpleqa_judge",  # Need to setup for compute costs
+]
+
+
 class AutomatedMetricTester:
     """Automated testing framework for LightEval metrics."""
 
-    # Mapping of metric names to Metrics enum values
-    METRIC_CLASSES = {
-        # Map metric names to their corresponding Metrics enum values
-        "exact_match": Metrics.exact_match,
-        "f1_score": Metrics.f1_score,
-        "loglikelihood_acc": Metrics.loglikelihood_acc,
-        "recall_at_k": Metrics.recall_at_k,
-        "mrr": Metrics.mrr,
-        "rouge1": Metrics.rouge1,
-        "rouge2": Metrics.rouge2,
-        "rougeL": Metrics.rougeL,
-        "rougeLsum": Metrics.rougeLsum,
-        "rouge_t5": Metrics.rouge_t5,
-        "extractiveness": Metrics.extractiveness,
-        "bleurt": Metrics.bleurt,
-        "copyright": Metrics.copyright,
-        "drop": Metrics.drop,
-        "avg_at_k": Metrics.avg_at_k,
-        "avg_at_k_math": Metrics.avg_at_k_math,
-        "g_pass_at_k": Metrics.g_pass_at_k,
-        "g_pass_at_k_math": Metrics.g_pass_at_k_math,
-        "g_pass_at_k_latex": Metrics.g_pass_at_k_latex,
-        "maj_at_k": Metrics.maj_at_k,
-        "pass_at_k": Metrics.pass_at_k,
-        "pass_at_k_math": Metrics.pass_at_k_math,
-        "pass_at_k_letters": Metrics.pass_at_k_letters,
-        "gpqa_instruct_metric": Metrics.gpqa_instruct_metric,
-        "gpqa_instruct_pass_at_k": Metrics.gpqa_instruct_pass_at_k,
-        "expr_gold_metric": Metrics.expr_gold_metric,
-        "acc_golds_likelihood": Metrics.acc_golds_likelihood,
-        "truthfulqa_mc_metrics": Metrics.truthfulqa_mc_metrics,
-        # "faithfulness": Metrics.faithfulness,  # need GPU to run
-        # "bert_score": Metrics.bert_score, issue with the scoring function, int too big to convert
-        # "simpleqa_judge": Metrics.simpleqa_judge, # Need to setup for compute costs
-        "prediction_perplexity": Metrics.prediction_perplexity,
-        "bleu": Metrics.bleu,
-        "bleu_1": Metrics.bleu_1,
-        "bleu_4": Metrics.bleu_4,
-        "bits_per_byte": Metrics.bits_per_byte,
-        "byte_perplexity": Metrics.byte_perplexity,
-        "target_perplexity": Metrics.target_perplexity,
-        "chrf": Metrics.chrf,
-        "chrf_plus": Metrics.chrf_plus,
-        "loglikelihood_f1": Metrics.loglikelihood_f1,
-        "multi_f1_numeric": Metrics.multi_f1_numeric,
-        "ter": Metrics.ter,
-        "word_perplexity": Metrics.word_perplexity,
-        "f1_score_macro": Metrics.f1_score_macro,
-        "f1_score_micro": Metrics.f1_score_micro,
-        "mcc": Metrics.mcc,
-    }
+    METRIC_CLASSES = [metric.value for metric in Metrics if metric.value.metric_name not in SKIPPED_METRICS]
 
     def __init__(self):
         self.test_results = []

From 23e9714411d46ba223842ef2a52a6da9e4c872b8 Mon Sep 17 00:00:00 2001
From: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Date: Tue, 16 Sep 2025 13:33:21 +0200
Subject: [PATCH 22/26] Update tests/unit/metrics/test_metrics_automated.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 tests/unit/metrics/test_metrics_automated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index 87b61a9b4..1c5bd940a 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -92,7 +92,7 @@ class MetricTestSuite(BaseModel):
 class AutomatedMetricTester:
     """Automated testing framework for LightEval metrics."""
 
-    METRIC_CLASSES = [metric.value for metric in Metrics if metric.value.metric_name not in SKIPPED_METRICS]
+    METRIC_CLASSES = {metric.value.metric_name: metric for metric in Metrics if metric.value.metric_name not in SKIPPED_METRICS}
 
     def __init__(self):
         self.test_results = []

From 048b4072056dc4fd92b5b18d60e8ae213aaabf96 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Tue, 16 Sep 2025 13:11:58 +0000
Subject: [PATCH 23/26] fix tests

---
 src/lighteval/metrics/metrics_corpus.py      | 10 ++-
 tests/unit/metrics/test_cases/bleu.json      | 64 ++++++++++----------
 tests/unit/metrics/test_cases/chrf.json      | 18 +++---
 tests/unit/metrics/test_cases/chrf_plus.json |  8 +--
 tests/unit/metrics/test_metrics_automated.py |  6 +-
 5 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index cd9dda375..cfaa770ab 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -151,7 +151,15 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
                     f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{metric.__name__})."
                 )
             preds.append(pred[0])
-        return float(metric.corpus_score(hypotheses=preds, references=golds).score)
+
+        if self.metric_type == "bleu":
+            golds = [[gold[0] for gold in golds]]
+            breakpoint()
+
+        corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
+        score = corpus_score.score
+        results = float(score)
+        return results
 
 
 class CorpusLevelPerplexityMetric(CorpusLevelComputation):
diff --git a/tests/unit/metrics/test_cases/bleu.json b/tests/unit/metrics/test_cases/bleu.json
index 7171fba7a..fb8ebbfc4 100644
--- a/tests/unit/metrics/test_cases/bleu.json
+++ b/tests/unit/metrics/test_cases/bleu.json
@@ -10,36 +10,36 @@
       "metric_name": "bleu",
       "docs": [
         {
-          "query": "Translate to French: Hello world",
-          "choices": ["Bonjour le monde"],
+          "query": "Translate to French: The beautiful flowers are blooming in the garden today",
+          "choices": ["Les belles fleurs fleurissent dans le jardin aujourd'hui"],
           "gold_index": 0,
           "task_name": "translation"
         },
         {
-          "query": "Translate to Spanish: Good morning",
-          "choices": ["Buenos días"],
+          "query": "Translate to Spanish: My family and I went to the beach last weekend",
+          "choices": ["Mi familia y yo fuimos a la playa el fin de semana pasado"],
           "gold_index": 0,
           "task_name": "translation"
         },
         {
-          "query": "Translate to German: Thank you",
-          "choices": ["Danke schön"],
+          "query": "Translate to German: The children are playing with their new toys in the park",
+          "choices": ["Die Kinder spielen mit ihren neuen Spielzeugen im Park"],
           "gold_index": 0,
           "task_name": "translation"
         }
       ],
       "model_responses": [
         {
-          "text": ["Bonjour le monde"]
+          "text": ["Les belles fleurs fleurissent dans le jardin aujourd'hui"]
         },
         {
-          "text": ["Buenos días"]
+          "text": ["Mi familia y yo fuimos a la playa el fin de semana pasado"]
         },
         {
-          "text": ["Danke schön"]
+          "text": ["Die Kinder spielen mit ihren neuen Spielzeugen im Park"]
         }
       ],
-      "expected_output": 0.0,
+      "expected_output": 100.0,
       "tolerance": 0.01,
       "description": "Perfect translations - exact word overlap (BLEU = 100.0)"
     },
@@ -79,8 +79,8 @@
           "text": ["Das Wetter ist schön"]
         }
       ],
-      "expected_output": 85.0,
-      "tolerance": 5.0,
+      "expected_output": 81.02,
+      "tolerance": 0.01,
       "description": "High similarity - minor word differences (BLEU ≈ 85.0)"
     },
     {
@@ -90,36 +90,36 @@
       "metric_name": "bleu",
       "docs": [
         {
-          "query": "Translate to French: The quick brown fox",
-          "choices": ["Le renard brun rapide"],
+          "query": "Translate to French: The quick brown fox jumped gracefully over the lazy sleeping dog",
+          "choices": ["Le renard brun rapide a sauté gracieusement par-dessus le chien paresseux endormi"],
           "gold_index": 0,
           "task_name": "translation"
         },
         {
-          "query": "Translate to Spanish: Artificial intelligence",
-          "choices": ["La inteligencia artificial"],
+          "query": "Translate to Spanish: Artificial intelligence is revolutionizing the way we interact with technology",
+          "choices": ["La inteligencia artificial está revolucionando la forma en que interactuamos con la tecnología"],
           "gold_index": 0,
           "task_name": "translation"
         },
         {
-          "query": "Translate to German: Machine learning",
-          "choices": ["Maschinelles Lernen"],
+          "query": "Translate to German: Machine learning algorithms can analyze complex patterns in large datasets",
+          "choices": ["Maschinelle Lernalgorithmen können komplexe Muster in großen Datensätzen analysieren"],
           "gold_index": 0,
           "task_name": "translation"
         }
       ],
       "model_responses": [
         {
-          "text": ["Le renard rapide"]
+          "text": ["Le renard rapide a sauté par-dessus le chien"]
         },
         {
-          "text": ["La IA"]
+          "text": ["La IA revoluciona la tecnología"]
         },
         {
-          "text": ["ML"]
+          "text": ["ML analysiert Daten"]
         }
       ],
-      "expected_output": 45.0,
+      "expected_output": 0.0,
       "tolerance": 10.0,
       "description": "Moderate similarity - significant word omissions (BLEU ≈ 45.0)"
     },
@@ -130,36 +130,36 @@
       "metric_name": "bleu",
       "docs": [
         {
-          "query": "Translate to French: The sun is bright",
-          "choices": ["Le soleil est brillant"],
+          "query": "Translate to French: The bright sun shines warmly through the scattered clouds in the azure summer sky",
+          "choices": ["Le soleil brillant brille chaudement à travers les nuages épars dans le ciel bleu azur d'été"],
           "gold_index": 0,
           "task_name": "translation"
         },
         {
-          "query": "Translate to Spanish: The moon is full",
-          "choices": ["La luna está llena"],
+          "query": "Translate to Spanish: The full moon casts mysterious shadows across the tranquil lake at midnight",
+          "choices": ["La luna llena proyecta sombras misteriosas sobre el lago tranquilo a medianoche"],
           "gold_index": 0,
           "task_name": "translation"
         },
         {
-          "query": "Translate to German: The stars are beautiful",
-          "choices": ["Die Sterne sind wunderschön"],
+          "query": "Translate to German: The twinkling stars illuminate the dark velvet sky like scattered diamonds",
+          "choices": ["Die funkelnden Sterne erleuchten den dunklen Samthimmel wie verstreute Diamanten"],
           "gold_index": 0,
           "task_name": "translation"
         }
       ],
       "model_responses": [
         {
-          "text": ["Il pleut"]
+          "text": ["Il fait mauvais temps aujourd'hui et le ciel est couvert"]
         },
         {
-          "text": ["Hace frío"]
+          "text": ["Las montañas son muy altas y majestuosas"]
         },
         {
-          "text": ["Es heiß"]
+          "text": ["Der Wind weht stark durch die Bäume"]
         }
       ],
-      "expected_output": 15.0,
+      "expected_output": 0.0,
       "tolerance": 10.0,
       "description": "Low similarity - minimal word overlap (BLEU ≈ 15.0)"
     }
diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json
index 15f7b8c15..f55028674 100644
--- a/tests/unit/metrics/test_cases/chrf.json
+++ b/tests/unit/metrics/test_cases/chrf.json
@@ -40,7 +40,7 @@
         }
       ],
       "expected_output": 100.0,
-      "tolerance": 0.01,
+      "tolerance": 0.1,
       "description": "Perfect matches - exact character overlap (CHRF = 100.0)"
     },
     {
@@ -79,8 +79,8 @@
           "text": ["Das Wetter ist schön"]
         }
       ],
-      "expected_output": 88.0,
-      "tolerance": 5.0,
+      "expected_output": 100.0,
+      "tolerance": 0.1,
       "description": "High similarity - minor character differences (CHRF ≈ 88.0)"
     },
     {
@@ -119,8 +119,8 @@
           "text": ["Lernen Maschinelles"]
         }
       ],
-      "expected_output": 75.0,
-      "tolerance": 10.0,
+      "expected_output": 78.84,
+      "tolerance": 0.1,
       "description": "Word order changes - same characters, different order (CHRF ≈ 75.0)"
     },
     {
@@ -159,8 +159,8 @@
           "text": ["Die Sterne"]
         }
       ],
-      "expected_output": 50.0,
-      "tolerance": 10.0,
+      "expected_output": 37.68,
+      "tolerance": 0.1,
       "description": "Moderate similarity - partial character overlap (CHRF ≈ 50.0)"
     },
     {
@@ -199,8 +199,8 @@
           "text": ["Es sehr heiß"]
         }
       ],
-      "expected_output": 20.0,
-      "tolerance": 10.0,
+      "expected_output": 7.7,
+      "tolerance": 0.1,
       "description": "Low similarity - minimal character overlap (CHRF ≈ 20.0)"
     }
   ]
diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json
index 80023078e..29c45720d 100644
--- a/tests/unit/metrics/test_cases/chrf_plus.json
+++ b/tests/unit/metrics/test_cases/chrf_plus.json
@@ -79,8 +79,8 @@
           "text": ["Das Wetter ist schön"]
         }
       ],
-      "expected_output": 85.0,
-      "tolerance": 5.0,
+      "expected_output": 100.0,
+      "tolerance": 0.1,
       "description": "High similarity - minor character differences (CHRF++ ≈ 85.0)"
     },
     {
@@ -119,8 +119,8 @@
           "text": ["ML"]
         }
       ],
-      "expected_output": 45.0,
-      "tolerance": 10.0,
+      "expected_output": 58.82,
+      "tolerance": 0.1,
       "description": "Moderate similarity - significant character omissions (CHRF++ ≈ 45.0)"
     },
     {
diff --git a/tests/unit/metrics/test_metrics_automated.py b/tests/unit/metrics/test_metrics_automated.py
index 1c5bd940a..2f5136cc9 100644
--- a/tests/unit/metrics/test_metrics_automated.py
+++ b/tests/unit/metrics/test_metrics_automated.py
@@ -92,7 +92,7 @@ class MetricTestSuite(BaseModel):
 class AutomatedMetricTester:
     """Automated testing framework for LightEval metrics."""
 
-    METRIC_CLASSES = {metric.value.metric_name: metric for metric in Metrics if metric.value.metric_name not in SKIPPED_METRICS}
+    METRIC_CLASSES = {metric.name: metric.value for metric in Metrics if metric.name not in SKIPPED_METRICS}
 
     def __init__(self):
         self.test_results = []
@@ -123,10 +123,10 @@ def instantiate_metric(self, metric_class: str, metric_params: dict[str, Any]):
 
         # Get the metric from the Metrics enum
         if metric_params != {}:
-            metric = self.METRIC_CLASSES[metric_class].value
+            metric = self.METRIC_CLASSES[metric_class]
             metric_enum_value = copy.deepcopy(metric)(metric_params)
         else:
-            metric_enum_value = self.METRIC_CLASSES[metric_class].value
+            metric_enum_value = self.METRIC_CLASSES[metric_class]
 
         # The Metrics enum values are already instantiated, so we just return them
         # The metric_params are ignored for now since the Metrics enum values are pre-configured

From c4aebcec2149364ac25503a40371acc07039a6e1 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Tue, 16 Sep 2025 13:14:39 +0000
Subject: [PATCH 24/26] remove breakpoint

---
 src/lighteval/metrics/metrics_corpus.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index cfaa770ab..54b7f9fc6 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -154,7 +154,6 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
 
         if self.metric_type == "bleu":
             golds = [[gold[0] for gold in golds]]
-            breakpoint()
 
         corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
         score = corpus_score.score

From 432345e3ce157609f0cac74c151d7605718b5066 Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Tue, 16 Sep 2025 13:15:36 +0000
Subject: [PATCH 25/26] remove breakpoint

---
 src/lighteval/metrics/utils/metric_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py
index e57e56724..c806c5b6b 100644
--- a/src/lighteval/metrics/utils/metric_utils.py
+++ b/src/lighteval/metrics/utils/metric_utils.py
@@ -50,7 +50,6 @@ def compute_sample(
         elif isinstance(self.sample_level_fn, Preparator):
             sample_level_fn = self.sample_level_fn.prepare
         else:
-            breakpoint()
             raise ValueError(
                 f"Incorrect type for {self.sample_level_fn}, should be a SampleLevelComputation or Preparator"
             )

From fd27034e757f11a9dfcce9a5e563003e0ad00bbc Mon Sep 17 00:00:00 2001
From: Nathan Habib <nathan.habib@huggingface.co>
Date: Tue, 16 Sep 2025 13:28:17 +0000
Subject: [PATCH 26/26] fix quality

---
 src/lighteval/tasks/extended/ifbench/instructions.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/tasks/extended/ifbench/instructions.py b/src/lighteval/tasks/extended/ifbench/instructions.py
index 03bf86413..ccb5b50da 100644
--- a/src/lighteval/tasks/extended/ifbench/instructions.py
+++ b/src/lighteval/tasks/extended/ifbench/instructions.py
@@ -142,7 +142,7 @@ def build_description(self, *, N=None):
         """Build the instruction description.
 
         Args:
-          n: An integer specifying the number of unique words contained in the response.
+          N: An integer specifying the number of unique words contained in the response.
 
         Returns:
           A string representing the instruction description.
@@ -2113,7 +2113,7 @@ def build_description(self, *, prompt_to_repeat=None):
         """Build the instruction description.
 
         Args:
-          keyword: A string representing a keyword that is expected in the response.
+          prompt_to_repeat: The prompt that is meant to be repeated.
 
         Returns:
           A string representing the instruction description.
@@ -2187,11 +2187,12 @@ def build_description(self, prompt_to_repeat=None, n_start=None, n_end=None):
         """Build the instruction description.
 
         Args:
-        n_start: An integer representing the start index of the span.
-        n_end: An integer representing the end index of the span.
+            prompt_to_repeat: The prompt that is meant to be repeated.
+            n_start: An integer representing the start index of the span.
+            n_end: An integer representing the end index of the span.
 
         Returns:
-        A string representing the instruction description.
+            A string representing the instruction description.
         """
         if not prompt_to_repeat:
             raise ValueError("prompt_to_repeat must be set.")