Merged
Commits (26)
d9de61d
init
clefourrier Jul 25, 2025
a2b1bad
Update src/lighteval/tasks/default_prompts.py
clefourrier Jul 25, 2025
0e00937
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 1, 2025
8429f5f
small fixes
clefourrier Aug 1, 2025
1cfa2f3
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 11, 2025
f600c40
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 20, 2025
541c89f
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 25, 2025
1e139ab
Apply suggestion from @NathanHB
NathanHB Aug 25, 2025
b597546
fix metrics kwargs passing
clefourrier Sep 4, 2025
b0e5584
add default metric for mmlu_redux
clefourrier Sep 4, 2025
951cbc0
fix
clefourrier Sep 4, 2025
96df0e7
update caching"
clefourrier Sep 8, 2025
971e082
Merge branch 'main' into test_mmlu_redux_2
clefourrier Sep 8, 2025
8b28aba
better str for classes, which allows correct hashing
clefourrier Sep 8, 2025
a3eeebd
last fix is to possibly push to configs
clefourrier Sep 8, 2025
c7d1eb0
removed token system + added an actual separation between tasks with …
clefourrier Sep 9, 2025
42ec1ce
fix
clefourrier Sep 9, 2025
68688b4
update caching tests
clefourrier Sep 9, 2025
b463eff
simplified system with cleaner task_id
clefourrier Sep 10, 2025
27be046
adapted to new functions
clefourrier Sep 10, 2025
f7c62bb
update vllm test
clefourrier Sep 11, 2025
0c4a429
fixing the metric changed res by 1 point
clefourrier Sep 11, 2025
dbac859
byteorder arg
clefourrier Sep 11, 2025
668e2f4
this makes little sense
clefourrier Sep 11, 2025
ef5ffe7
Update src/lighteval/utils/cache_management.py
clefourrier Sep 11, 2025
085f59c
comments
clefourrier Sep 11, 2025
27 changes: 13 additions & 14 deletions docs/source/evaluating-a-custom-model.mdx
@@ -15,7 +15,7 @@ Here's a basic example:
```python
from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached

class MyCustomModel(LightevalModel):
@@ -26,18 +26,18 @@ class MyCustomModel(LightevalModel):
# Enable caching (recommended)
self._cache = SampleCache(config)

@cached("predictions") # Enable caching for better performance
def greedy_until(self, docs: list[Doc]) -> list[ModelResponse]:
@cached(SamplingMethod.GENERATIVE)
def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
# Implement generation logic
pass

@cached("predictions") # Enable caching for better performance
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: List[Doc]) -> List[ModelResponse]:
# Implement loglikelihood computation
pass

@cached("predictions") # Enable caching for better performance
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: List[Doc]) -> List[ModelResponse]:
# Implement rolling loglikelihood computation
pass
```
@@ -179,13 +179,12 @@ def __init__(self, config):
self._cache = SampleCache(config)
```

### Step 3: Add Cache Decorators
Add cache decorators to your prediction methods:
```python
@cached("predictions")
def greedy_until(self, docs: list[Doc]) -> list[ModelResponse]:
# Your implementation...
```
3. Add cache decorators to your prediction methods:
```python
@cached(SamplingMethod.GENERATIVE)
def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
# Your implementation...
```

For detailed information about the caching system, see the [Caching Documentation](caching).

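Putting the documented changes together, here is a self-contained sketch of a custom model under the new enum-based cache decorators. Only the imports, the `SampleCache`/`@cached` usage, and the `SamplingMethod` members mirror this diff; the class name, constructor body, placeholder responses, and the `logprobs` field name are illustrative assumptions.

```python
from typing import List

from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


class EchoModel(LightevalModel):  # hypothetical example model
    def __init__(self, config):
        # Model/tokenizer setup would go here (omitted in this sketch).
        self._cache = SampleCache(config)  # required for the @cached decorators

    @cached(SamplingMethod.GENERATIVE)
    def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
        # Placeholder generation: echo each query back.
        return [ModelResponse(text=[doc.query]) for doc in docs]

    @cached(SamplingMethod.LOGPROBS)
    def loglikelihood(self, docs: List[Doc]) -> List[ModelResponse]:
        # Placeholder: assumes ModelResponse accepts a `logprobs` field.
        return [ModelResponse(logprobs=[0.0] * len(doc.choices)) for doc in docs]

    @cached(SamplingMethod.PERPLEXITY)
    def loglikelihood_rolling(self, docs: List[Doc]) -> List[ModelResponse]:
        # Placeholder rolling loglikelihood.
        return [ModelResponse(logprobs=[0.0]) for doc in docs]
```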
11 changes: 11 additions & 0 deletions src/lighteval/metrics/metrics_corpus.py
@@ -50,6 +50,17 @@ class CorpusLevelComputation(ABC):
def compute_corpus(self):
raise NotImplementedError

def __str__(self):
attrs = vars(self)
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


# General aggregations
class MatthewsCorrCoef(CorpusLevelComputation):
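The motivation for this `__str__` override (see the "better str for classes, which allows correct hashing" commit) is that the default object representation embeds a memory address, so identical metric objects would produce different cache keys across runs. A toy reproduction of the pattern, using a made-up class rather than a lighteval one:

```python
import hashlib


class ExampleComputation:
    """Made-up stand-in for a metric component; not a lighteval class."""

    def __init__(self, strip_strings, normalize):
        self.strip_strings = strip_strings
        self.normalize = normalize  # a callable attribute

    def __str__(self):
        # Same pattern as the override added in this PR: a stable,
        # address-free representation built from the instance attributes.
        attrs = vars(self)
        attr_strs = [f"{k}={v.__name__ if callable(v) else v}" for k, v in attrs.items()]
        return f"{self.__class__.__name__}({', '.join(attr_strs)})"


comp = ExampleComputation(strip_strings=True, normalize=str.lower)
print(str(comp))
# -> ExampleComputation(strip_strings=True, normalize=lower)

# A stable string gives a stable cache key across processes:
print(hashlib.sha256(str(comp).encode()).hexdigest()[:16])
```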
20 changes: 16 additions & 4 deletions src/lighteval/metrics/metrics_sample.py
@@ -66,6 +66,17 @@ class SampleLevelComputation(ABC):
def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
raise NotImplementedError

def __str__(self):
attrs = vars(self)
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


class ExactMatches(SampleLevelComputation):
def __init__(
@@ -1109,10 +1120,11 @@ def __init__(
self.strip_strings = strip_strings

if callable(sample_scoring_function):
self.score_sample = sample_scoring_function
self.compute_score = sample_scoring_function
self.type_exact_match = None
elif isinstance(sample_scoring_function, SampleLevelComputation):
self.score_sample = sample_scoring_function.compute
self.type_exact_match = None
else:
if isinstance(sample_scoring_function, str):
if sample_scoring_function not in ["prefix", "suffix", "full"]:
@@ -1199,7 +1211,7 @@ def __init__(self, k: int | None = None, **kwargs):
k (int): The number of top choices to consider.
**kwargs: Additional keyword arguments.
"""
super().__init__(kwargs)
super().__init__(**kwargs)

self.k = k
self.attribute_must_be_set = ["k"]
@@ -1280,7 +1292,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
elif len(predictions) < self.n:
logger.warning(f"Number of predictions is less than {self.n} for pass@k.")

processed_choices = [self.preprocess(text=g) for g in doc.choices]
processed_choices = [self.preprocess(g) for g in doc.choices]
new_doc = Doc(
choices=processed_choices,
query=doc.query,
@@ -1289,7 +1301,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:

all_scores = []
for pred in predictions[: self.n]:
cur_pred = self.preprocess(text=pred)
cur_pred = self.preprocess(pred)
new_model_response = ModelResponse(
text=[cur_pred],
)
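To make the `super().__init__(**kwargs)` fix above concrete: passing the dict positionally binds the whole dict to the parent's first positional parameter, while `**kwargs` unpacks it into keyword arguments. A toy reproduction (illustrative classes, not lighteval code):

```python
class Base:
    def __init__(self, name="default", strip=False):
        self.name = name
        self.strip = strip


class Child(Base):
    def __init__(self, k=None, **kwargs):
        # The fixed form: unpack kwargs so the parent sees keyword arguments.
        # `super().__init__(kwargs)` would instead bind the whole dict to `name`.
        super().__init__(**kwargs)
        self.k = k


c = Child(k=3, name="pass_at_k", strip=True)
print(c.name, c.strip, c.k)  # pass_at_k True 3
```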
11 changes: 11 additions & 0 deletions src/lighteval/metrics/sample_preparator.py
@@ -81,6 +81,17 @@ def prepare(doc: Doc, model_response: ModelResponse, **kwargs):
predictions = model_response.final_text
return GenerativeCorpusMetricInput(golds=golds, preds=predictions)

def __str__(self):
attrs = vars(self)
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


class LoglikelihoodPreparator(Preparator):
def __init__(self, is_single_token: bool = False):
4 changes: 4 additions & 0 deletions src/lighteval/metrics/utils/metric_utils.py
@@ -95,6 +95,10 @@ def __call__(self, sample_params: dict | None):
self.metric_name = f"{self.metric_name}_with_{sample_params_name}"
return self

@staticmethod
def get_allowed_types_for_metrics():
return (SampleLevelComputation, Preparator, CorpusLevelComputation, Callable)


@dataclass
class MetricGrouping(Metric):
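One way the new helper could be used, assuming it is a static method on the `Metric` class as the hunk context suggests; the validation function itself is illustrative, not part of lighteval:

```python
from lighteval.metrics.utils.metric_utils import Metric


def check_metric_component(obj):
    # Illustrative helper, not part of lighteval: reject anything that is not
    # one of the types advertised by the new static method.
    allowed = Metric.get_allowed_types_for_metrics()
    if not isinstance(obj, allowed):
        raise TypeError(f"Unsupported metric component: {type(obj).__name__}")
    return obj
```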
4 changes: 4 additions & 0 deletions src/lighteval/models/abstract_model.py
@@ -46,6 +46,8 @@ class ModelConfig(BaseModel, extra="forbid"):
as well as shared attributes that are used by all models like generation parameters and system prompts.

Attributes:
model_name (str):
The model name or unique id
generation_parameters (GenerationParameters):
Configuration parameters that control text generation behavior, including
temperature, top_p, max_new_tokens, etc. Defaults to empty GenerationParameters.
@@ -80,6 +82,8 @@ class ModelConfig(BaseModel, extra="forbid"):
```
"""

model_name: str = None
Review comment (Member): needs to be added to the doc


generation_parameters: GenerationParameters = GenerationParameters()
system_prompt: str | None = None
cache_dir: str = "~/.cache/huggingface/lighteval"
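A minimal usage sketch for the new `model_name` field (the review comment above asks for it to be documented): the values are placeholders, only the field names shown in the hunk are taken from the diff, and it assumes `ModelConfig` can be instantiated directly.

```python
from lighteval.models.abstract_model import ModelConfig

# Placeholder values; only the field names come from the hunk above.
config = ModelConfig(
    model_name="my-org/my-model",
    system_prompt="You are a helpful assistant.",
    cache_dir="~/.cache/huggingface/lighteval",
)
print(config.model_name)
```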
8 changes: 4 additions & 4 deletions src/lighteval/models/dummy/dummy_model.py
@@ -28,7 +28,7 @@

from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


@@ -87,11 +87,11 @@ def add_special_tokens(self):
def max_length(self) -> int:
return 2048

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(self, docs: list[Doc]) -> list[ModelResponse]:
return [ModelResponse(text=["random baseline"]) for _ in range(len(docs))]

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
model_responses = []
for doc in docs:
@@ -104,7 +104,7 @@ def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:

return model_responses

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
model_responses = []
for doc in docs:
8 changes: 4 additions & 4 deletions src/lighteval/models/endpoints/endpoint_model.py
@@ -48,7 +48,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


@@ -555,7 +555,7 @@ def _process_batch_logprob(self, docs: list[Doc], rolling: bool = False) -> list
for context, doc in zip(contexts, docs)
]

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: List[Doc],
@@ -599,11 +599,11 @@ def _greedy_until(self, docs: List[Doc]) -> list[ModelResponse]:

return dataset.get_original_order(results)

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
return self._loglikelihood(docs, rolling=False)

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc], override_bs=None) -> list[ModelResponse]:
return self._loglikelihood(docs, rolling=True)

8 changes: 4 additions & 4 deletions src/lighteval/models/endpoints/inference_providers_model.py
@@ -35,7 +35,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


@@ -196,7 +196,7 @@ async def bounded_api_call(prompt, num_samples):

return results

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: list[Doc],
@@ -253,14 +253,14 @@ def max_length(self) -> int:
logger.warning("Tokenizer was not correctly loaded. Max model context length is assumed to be 30K tokens")
return 30000

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
"""
raise NotImplementedError

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
raise NotImplementedError
8 changes: 4 additions & 4 deletions src/lighteval/models/endpoints/litellm_model.py
@@ -30,7 +30,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached
from lighteval.utils.imports import is_litellm_available

@@ -258,7 +258,7 @@ def __call_api_parallel(

return results

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: list[Doc],
@@ -323,14 +323,14 @@ def max_length(self) -> int:
"""Return the maximum sequence length of the model."""
return 4096

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
"""
raise NotImplementedError

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
raise NotImplementedError
7 changes: 4 additions & 3 deletions src/lighteval/models/nanotron/nanotron_model.py
@@ -48,6 +48,7 @@
from lighteval.models.transformers.transformers_model import LightevalModel
from lighteval.tasks.requests import (
Doc,
SamplingMethod,
)
from lighteval.utils.cache_management import SampleCache, cached
from lighteval.utils.imports import is_nanotron_available
@@ -483,7 +484,7 @@ def _check_continuations_start_space(self, continuation: str) -> str:
continuation = continuation.lstrip()
return continuation

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, requests: List[Doc]) -> List[ModelResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
@@ -506,7 +507,7 @@ def loglikelihood(self, requests: List[Doc]) -> List[ModelResponse]:
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
)

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, requests: List[Doc]) -> List[ModelResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
for request in tqdm(
@@ -941,7 +942,7 @@ def _loglikelihood_tokens(
return dataset.get_original_order(res)

@torch.inference_mode()
@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
requests: List[Doc],
8 changes: 4 additions & 4 deletions src/lighteval/models/sglang/sglang_model.py
@@ -33,7 +33,7 @@
from lighteval.models.model_output import ModelResponse
from lighteval.models.utils import _simplify_name, uses_chat_template
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached
from lighteval.utils.imports import is_sglang_available

@@ -221,7 +221,7 @@ def _create_auto_tokenizer(self, config: SGLangModelConfig):
tokenizer.pad_token = tokenizer.eos_token
return tokenizer

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: list[Doc],
@@ -347,7 +347,7 @@ def _generate(
)
return outputs

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
return self._loglikelihood_tokens(docs)

@@ -416,6 +416,6 @@ def _loglikelihood_tokens(
res.append(answer)
return dataset.get_original_order(res)

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
raise NotImplementedError()