Merged
Commits (26)
d9de61d
init
clefourrier Jul 25, 2025
a2b1bad
Update src/lighteval/tasks/default_prompts.py
clefourrier Jul 25, 2025
0e00937
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 1, 2025
8429f5f
small fixes
clefourrier Aug 1, 2025
1cfa2f3
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 11, 2025
f600c40
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 20, 2025
541c89f
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 25, 2025
1e139ab
Apply suggestion from @NathanHB
NathanHB Aug 25, 2025
b597546
fix metrics kwargs passing
clefourrier Sep 4, 2025
b0e5584
add default metric for mmlu_redux
clefourrier Sep 4, 2025
951cbc0
fix
clefourrier Sep 4, 2025
96df0e7
update caching"
clefourrier Sep 8, 2025
971e082
Merge branch 'main' into test_mmlu_redux_2
clefourrier Sep 8, 2025
8b28aba
better str for classes, which allows correct hashing
clefourrier Sep 8, 2025
a3eeebd
last fix is to possibly push to configs
clefourrier Sep 8, 2025
c7d1eb0
removed token system + added an actual separation between tasks with …
clefourrier Sep 9, 2025
42ec1ce
fix
clefourrier Sep 9, 2025
68688b4
update caching tests
clefourrier Sep 9, 2025
b463eff
simplified system with cleaner task_id
clefourrier Sep 10, 2025
27be046
adapted to new functions
clefourrier Sep 10, 2025
f7c62bb
update vllm test
clefourrier Sep 11, 2025
0c4a429
fixing the metric changed res by 1 point
clefourrier Sep 11, 2025
dbac859
byteorder arg
clefourrier Sep 11, 2025
668e2f4
this makes little sense
clefourrier Sep 11, 2025
ef5ffe7
Update src/lighteval/utils/cache_management.py
clefourrier Sep 11, 2025
085f59c
comments
clefourrier Sep 11, 2025
27 changes: 13 additions & 14 deletions docs/source/evaluating-a-custom-model.mdx
@@ -15,7 +15,7 @@ Here's a basic example:
```python
from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached

class MyCustomModel(LightevalModel):
@@ -26,18 +26,18 @@ class MyCustomModel(LightevalModel):
# Enable caching (recommended)
self._cache = SampleCache(config)

@cached("predictions") # Enable caching for better performance
def greedy_until(self, docs: list[Doc]) -> list[ModelResponse]:
@cached(SamplingMethod.GENERATIVE)
def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
# Implement generation logic
pass

@cached("predictions") # Enable caching for better performance
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: List[Doc]) -> List[ModelResponse]:
# Implement loglikelihood computation
pass

@cached("predictions") # Enable caching for better performance
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: List[Doc]) -> List[ModelResponse]:
# Implement rolling loglikelihood computation
pass
```
@@ -179,13 +179,12 @@ def __init__(self, config):
self._cache = SampleCache(config)
```

### Step 3: Add Cache Decorators
Add cache decorators to your prediction methods:
```python
@cached("predictions")
def greedy_until(self, docs: list[Doc]) -> list[ModelResponse]:
# Your implementation...
```
3. Add cache decorators to your prediction methods:
```python
@cached(SamplingMethod.GENERATIVE)
def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
# Your implementation...
```

For detailed information about the caching system, see the [Caching Documentation](caching).

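Putting the documented changes together, here is a self-contained sketch of a custom model under the new enum-based cache decorators. Only the imports, the `SampleCache`/`@cached` usage, and the `SamplingMethod` members mirror this diff; the class name, constructor body, placeholder responses, and the `logprobs` field name are illustrative assumptions.

```python
from typing import List

from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


class EchoModel(LightevalModel):  # hypothetical example model
    def __init__(self, config):
        # Model/tokenizer setup would go here (omitted in this sketch).
        self._cache = SampleCache(config)  # required for the @cached decorators

    @cached(SamplingMethod.GENERATIVE)
    def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
        # Placeholder generation: echo each query back.
        return [ModelResponse(text=[doc.query]) for doc in docs]

    @cached(SamplingMethod.LOGPROBS)
    def loglikelihood(self, docs: List[Doc]) -> List[ModelResponse]:
        # Placeholder: assumes ModelResponse accepts a `logprobs` field.
        return [ModelResponse(logprobs=[0.0] * len(doc.choices)) for doc in docs]

    @cached(SamplingMethod.PERPLEXITY)
    def loglikelihood_rolling(self, docs: List[Doc]) -> List[ModelResponse]:
        # Placeholder rolling loglikelihood.
        return [ModelResponse(logprobs=[0.0]) for doc in docs]
```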
11 changes: 11 additions & 0 deletions src/lighteval/metrics/metrics_corpus.py
@@ -50,6 +50,17 @@ class CorpusLevelComputation(ABC):
def compute_corpus(self):
raise NotImplementedError

def __str__(self):
attrs = vars(self)
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


# General aggregations
class MatthewsCorrCoef(CorpusLevelComputation):
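The motivation for this `__str__` override (see the "better str for classes, which allows correct hashing" commit) is that the default object representation embeds a memory address, so identical metric objects would produce different cache keys across runs. A toy reproduction of the pattern, using a made-up class rather than a lighteval one:

```python
import hashlib


class ExampleComputation:
    """Made-up stand-in for a metric component; not a lighteval class."""

    def __init__(self, strip_strings, normalize):
        self.strip_strings = strip_strings
        self.normalize = normalize  # a callable attribute

    def __str__(self):
        # Same pattern as the override added in this PR: a stable,
        # address-free representation built from the instance attributes.
        attrs = vars(self)
        attr_strs = [f"{k}={v.__name__ if callable(v) else v}" for k, v in attrs.items()]
        return f"{self.__class__.__name__}({', '.join(attr_strs)})"


comp = ExampleComputation(strip_strings=True, normalize=str.lower)
print(str(comp))
# -> ExampleComputation(strip_strings=True, normalize=lower)

# A stable string gives a stable cache key across processes:
print(hashlib.sha256(str(comp).encode()).hexdigest()[:16])
```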
20 changes: 16 additions & 4 deletions src/lighteval/metrics/metrics_sample.py
@@ -66,6 +66,17 @@ class SampleLevelComputation(ABC):
def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
raise NotImplementedError

def __str__(self):
attrs = vars(self)
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


class ExactMatches(SampleLevelComputation):
def __init__(
@@ -1109,10 +1120,11 @@ def __init__(
self.strip_strings = strip_strings

if callable(sample_scoring_function):
self.score_sample = sample_scoring_function
self.compute_score = sample_scoring_function
self.type_exact_match = None
elif isinstance(sample_scoring_function, SampleLevelComputation):
self.score_sample = sample_scoring_function.compute
self.type_exact_match = None
else:
if isinstance(sample_scoring_function, str):
if sample_scoring_function not in ["prefix", "suffix", "full"]:
@@ -1199,7 +1211,7 @@ def __init__(self, k: int | None = None, **kwargs):
k (int): The number of top choices to consider.
**kwargs: Additional keyword arguments.
"""
super().__init__(kwargs)
super().__init__(**kwargs)

self.k = k
self.attribute_must_be_set = ["k"]
@@ -1280,7 +1292,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
elif len(predictions) < self.n:
logger.warning(f"Number of predictions is less than {self.n} for pass@k.")

processed_choices = [self.preprocess(text=g) for g in doc.choices]
processed_choices = [self.preprocess(g) for g in doc.choices]
new_doc = Doc(
choices=processed_choices,
query=doc.query,
@@ -1289,7 +1301,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:

all_scores = []
for pred in predictions[: self.n]:
cur_pred = self.preprocess(text=pred)
cur_pred = self.preprocess(pred)
new_model_response = ModelResponse(
text=[cur_pred],
)
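To make the `super().__init__(**kwargs)` fix above concrete: passing the dict positionally binds the whole dict to the parent's first positional parameter, while `**kwargs` unpacks it into keyword arguments. A toy reproduction (illustrative classes, not lighteval code):

```python
class Base:
    def __init__(self, name="default", strip=False):
        self.name = name
        self.strip = strip


class Child(Base):
    def __init__(self, k=None, **kwargs):
        # The fixed form: unpack kwargs so the parent sees keyword arguments.
        # `super().__init__(kwargs)` would instead bind the whole dict to `name`.
        super().__init__(**kwargs)
        self.k = k


c = Child(k=3, name="pass_at_k", strip=True)
print(c.name, c.strip, c.k)  # pass_at_k True 3
```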
11 changes: 11 additions & 0 deletions src/lighteval/metrics/sample_preparator.py
@@ -81,6 +81,17 @@ def prepare(doc: Doc, model_response: ModelResponse, **kwargs):
predictions = model_response.final_text
return GenerativeCorpusMetricInput(golds=golds, preds=predictions)

def __str__(self):
attrs = vars(self)
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


class LoglikelihoodPreparator(Preparator):
def __init__(self, is_single_token: bool = False):
4 changes: 4 additions & 0 deletions src/lighteval/metrics/utils/metric_utils.py
@@ -95,6 +95,10 @@ def __call__(self, sample_params: dict | None):
self.metric_name = f"{self.metric_name}_with_{sample_params_name}"
return self

@staticmethod
def get_allowed_types_for_metrics():
return (SampleLevelComputation, Preparator, CorpusLevelComputation, Callable)


@dataclass
class MetricGrouping(Metric):
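One way the new helper could be used, assuming it is a static method on the `Metric` class as the hunk context suggests; the validation function itself is illustrative, not part of lighteval:

```python
from lighteval.metrics.utils.metric_utils import Metric


def check_metric_component(obj):
    # Illustrative helper, not part of lighteval: reject anything that is not
    # one of the types advertised by the new static method.
    allowed = Metric.get_allowed_types_for_metrics()
    if not isinstance(obj, allowed):
        raise TypeError(f"Unsupported metric component: {type(obj).__name__}")
    return obj
```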
4 changes: 4 additions & 0 deletions src/lighteval/models/abstract_model.py
@@ -46,6 +46,8 @@ class ModelConfig(BaseModel, extra="forbid"):
as well as shared attributes that are used by all models like generation parameters and system prompts.

Attributes:
model_name (str):
The model name or unique id
generation_parameters (GenerationParameters):
Configuration parameters that control text generation behavior, including
temperature, top_p, max_new_tokens, etc. Defaults to empty GenerationParameters.
@@ -80,6 +82,8 @@ class ModelConfig(BaseModel, extra="forbid"):
```
"""

model_name: str = None
Review comment (Member): needs to be added to the doc


generation_parameters: GenerationParameters = GenerationParameters()
system_prompt: str | None = None
cache_dir: str = "~/.cache/huggingface/lighteval"
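A minimal usage sketch for the new `model_name` field (the review comment above asks for it to be documented): the values are placeholders, only the field names shown in the hunk are taken from the diff, and it assumes `ModelConfig` can be instantiated directly.

```python
from lighteval.models.abstract_model import ModelConfig

# Placeholder values; only the field names come from the hunk above.
config = ModelConfig(
    model_name="my-org/my-model",
    system_prompt="You are a helpful assistant.",
    cache_dir="~/.cache/huggingface/lighteval",
)
print(config.model_name)
```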
8 changes: 4 additions & 4 deletions src/lighteval/models/dummy/dummy_model.py
@@ -28,7 +28,7 @@

from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


@@ -87,11 +87,11 @@ def add_special_tokens(self):
def max_length(self) -> int:
return 2048

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(self, docs: list[Doc]) -> list[ModelResponse]:
return [ModelResponse(text=["random baseline"]) for _ in range(len(docs))]

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
model_responses = []
for doc in docs:
@@ -104,7 +104,7 @@ def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:

return model_responses

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
model_responses = []
for doc in docs:
8 changes: 4 additions & 4 deletions src/lighteval/models/endpoints/endpoint_model.py
@@ -48,7 +48,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


@@ -555,7 +555,7 @@ def _process_batch_logprob(self, docs: list[Doc], rolling: bool = False) -> list
for context, doc in zip(contexts, docs)
]

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: List[Doc],
@@ -599,11 +599,11 @@ def _greedy_until(self, docs: List[Doc]) -> list[ModelResponse]:

return dataset.get_original_order(results)

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
return self._loglikelihood(docs, rolling=False)

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc], override_bs=None) -> list[ModelResponse]:
return self._loglikelihood(docs, rolling=True)

8 changes: 4 additions & 4 deletions src/lighteval/models/endpoints/inference_providers_model.py
@@ -35,7 +35,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached


@@ -196,7 +196,7 @@ async def bounded_api_call(prompt, num_samples):

return results

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: list[Doc],
@@ -253,14 +253,14 @@ def max_length(self) -> int:
logger.warning("Tokenizer was not correctly loaded. Max model context length is assumed to be 30K tokens")
return 30000

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
"""
raise NotImplementedError

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
raise NotImplementedError
8 changes: 4 additions & 4 deletions src/lighteval/models/endpoints/litellm_model.py
@@ -30,7 +30,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached
from lighteval.utils.imports import is_litellm_available

@@ -258,7 +258,7 @@ def __call_api_parallel(

return results

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: list[Doc],
@@ -323,14 +323,14 @@ def max_length(self) -> int:
"""Return the maximum sequence length of the model."""
return 4096

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
"""
raise NotImplementedError

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
raise NotImplementedError
7 changes: 4 additions & 3 deletions src/lighteval/models/nanotron/nanotron_model.py
@@ -48,6 +48,7 @@
from lighteval.models.transformers.transformers_model import LightevalModel
from lighteval.tasks.requests import (
Doc,
SamplingMethod,
)
from lighteval.utils.cache_management import SampleCache, cached
from lighteval.utils.imports import is_nanotron_available
@@ -483,7 +484,7 @@ def _check_continuations_start_space(self, continuation: str) -> str:
continuation = continuation.lstrip()
return continuation

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, requests: List[Doc]) -> List[ModelResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
@@ -506,7 +507,7 @@ def loglikelihood(self, requests: List[Doc]) -> List[ModelResponse]:
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
)

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, requests: List[Doc]) -> List[ModelResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
for request in tqdm(
@@ -941,7 +942,7 @@ def _loglikelihood_tokens(
return dataset.get_original_order(res)

@torch.inference_mode()
@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
requests: List[Doc],
8 changes: 4 additions & 4 deletions src/lighteval/models/sglang/sglang_model.py
@@ -33,7 +33,7 @@
from lighteval.models.model_output import ModelResponse
from lighteval.models.utils import _simplify_name, uses_chat_template
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import SampleCache, cached
from lighteval.utils.imports import is_sglang_available

@@ -221,7 +221,7 @@ def _create_auto_tokenizer(self, config: SGLangModelConfig):
tokenizer.pad_token = tokenizer.eos_token
return tokenizer

@cached("predictions")
@cached(SamplingMethod.GENERATIVE)
def greedy_until(
self,
docs: list[Doc],
@@ -347,7 +347,7 @@ def _generate(
)
return outputs

@cached("predictions")
@cached(SamplingMethod.LOGPROBS)
def loglikelihood(self, docs: list[Doc]) -> list[ModelResponse]:
return self._loglikelihood_tokens(docs)

@@ -416,6 +416,6 @@ def _loglikelihood_tokens(
res.append(answer)
return dataset.get_original_order(res)

@cached("predictions")
@cached(SamplingMethod.PERPLEXITY)
def loglikelihood_rolling(self, docs: list[Doc]) -> list[ModelResponse]:
raise NotImplementedError()