3 changes: 3 additions & 0 deletions src/lighteval/data.py
@@ -195,6 +195,9 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR
         """
         toks = request.tokenized_context
         gen_length = request.generation_size
+        # The generative task has no limit except the model context
+        if gen_length is None:
+            gen_length = 0
         return -(len(toks) + gen_length)


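For readers skimming the diff, here is a minimal standalone sketch of the sorting rule this hunk implements (FakeRequest is a hypothetical stand-in for lighteval's request objects, not part of the library): a request whose generation_size is None contributes no expected generation length, so it is ordered by prompt length alone.

# Hypothetical stand-in for lighteval's GreedyUntilRequest, used only to
# illustrate the sorting criterion in the hunk above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeRequest:
    tokenized_context: list
    generation_size: Optional[int]  # None means "no limit beyond the model context"


def sorting_criteria(request: FakeRequest) -> int:
    gen_length = request.generation_size
    # The generative task has no limit except the model context
    if gen_length is None:
        gen_length = 0
    return -(len(request.tokenized_context) + gen_length)


requests = [
    FakeRequest(tokenized_context=[1] * 10, generation_size=None),
    FakeRequest(tokenized_context=[1] * 5, generation_size=100),
]
# Longest (context + expected generation) first; None counts as 0 extra tokens.
print([r.generation_size for r in sorted(requests, key=sorting_criteria)])  # [100, None]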
48 changes: 3 additions & 45 deletions src/lighteval/logging/info_loggers.py
@@ -13,7 +13,7 @@
 from lighteval.metrics.stderr import get_stderr_function
 from lighteval.models.model_loader import ModelInfo
 from lighteval.models.model_output import ModelReturn
-from lighteval.tasks.lighteval_task import LightevalTask
+from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list, is_nanotron_available, sanitize_numpy

@@ -497,53 +497,11 @@ class TaskConfigLogger:
     """Logs the different parameters of the current [`LightevalTask`] of interest.
 
     Attributes:
-        tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`]
+        tasks_config (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`]
 
     """
 
-    @dataclass
-    class TaskConfig:
-        """Stored configuration of a given [`LightevalTask`].
-
-        Arguments:
-            name (str): Short name of the evaluation task.
-            suite (list[str]): Evaluation suites to which the task belongs.
-            prompt_function (str): Name of the function used to create the [`Doc`] samples from each line of the evaluation dataset.
-            hf_repo (str): Path of the hub dataset repository containing the evaluation information.
-            hf_subset (str): Subset used for the current task, will be default if none is selected.
-            hf_avail_splits (list[str]): All the available splits in the evaluation dataset
-            evaluation_splits (list[str]): List of the splits actually used for this evaluation
-            few_shots_split (str): Name of the split from which to sample few-shot examples
-            few_shots_select (str): Method with which to sample few-shot examples
-            generation_size (int): Maximum allowed size of the generation
-            metric (list[str]): List of all the metrics for the current task.
-            stop_sequence (list[str]): Stop sequence which interrupts the generation for generative metrics.
-            original_num_docs (int): Number of documents in the task
-            effective_num_docs (int): Number of documents used in a specific evaluation
-            truncated_num_docs (bool): Whether less than the total number of documents were used
-            output_regex (str)
-            frozen (bool)
-
-        """
-
-        name: str
-        suite: list[str]
-        prompt_function: str
-        hf_repo: str
-        hf_subset: str
-        hf_avail_splits: list[str]
-        evaluation_splits: list[str]
-        few_shots_split: str
-        few_shots_select: str
-        generation_size: int
-        metric: list[str]
-        stop_sequence: list[str]
-        output_regex: str
-        frozen: bool
-        original_num_docs: int = -1
-        effective_num_docs: int = -1
-
-    tasks_configs: dict[str, TaskConfig] = {}
+    tasks_configs: dict[str, LightevalTaskConfig] = {}
 
     def log(self, task_dict: dict[str, LightevalTask]) -> None:
         self.tasks_configs = {name: task.cfg for name, task in task_dict.items()}
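A small stand-in illustration of what the logger holds after this refactor (the task objects below are hypothetical; real usage passes LightevalTask instances whose .cfg attribute is a LightevalTaskConfig):

# Stand-in objects only: shows that TaskConfigLogger.log now keeps the task's
# own LightevalTaskConfig (task.cfg) instead of copying it into a private dataclass.
from types import SimpleNamespace

task_dict = {
    "custom|summarization|0": SimpleNamespace(cfg=SimpleNamespace(name="summarization", generation_size=None)),
    "lighteval|gsm8k|5": SimpleNamespace(cfg=SimpleNamespace(name="gsm8k", generation_size=256)),
}

tasks_configs = {name: task.cfg for name, task in task_dict.items()}
print({name: cfg.generation_size for name, cfg in tasks_configs.items()})
# {'custom|summarization|0': None, 'lighteval|gsm8k|5': 256}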
34 changes: 29 additions & 5 deletions src/lighteval/models/base_model.py
@@ -354,9 +354,17 @@ def greedy_until(
             position=0,
             disable=self.disable_tqdm,
         ):
-            # Longest context in the current split is the first item (since we sort reversed)
-            longest_context_continuation_size_in_split = len(dataset[0].tokenized_context) + dataset[0].generation_size
-            max_context_continuation_size_allowed = min(longest_context_continuation_size_in_split, self.max_length)
+            if dataset[0].generation_size is None:
+                # No constraints on the generation size: max length allowed is the max model context
+                max_context_continuation_size_allowed = self.max_length
+            else:
+                # Longest context in the current split is the first item (since we sort reversed)
+                longest_context_continuation_size_in_split = (
+                    len(dataset[0].tokenized_context) + dataset[0].generation_size
+                )
+                max_context_continuation_size_allowed = min(
+                    longest_context_continuation_size_in_split, self.max_length
+                )
             batch_size = self._get_batch_size(
                 override_bs=override_bs,
                 max_input_length=max_context_continuation_size_allowed,
@@ -376,9 +384,25 @@
                 # stop_tokens and max_tokens genrated) which is not necessarily
                 # the case! Because of that we only use batch size of 1
                 stop_tokens = batch[0].stop_sequence
-                max_generated_tokens = batch[0].generation_size
                 context = [c.context for c in batch]
-                max_context_size_allowed = self.max_length - max_generated_tokens
+                max_context_size_allowed = self.max_length
+                if batch[0].generation_size is None:
+                    # No constraints on max tokens except the model and data
+                    # Max generation possible is the max_length - the smallest context
+                    smallest_context = min([len(c) for c in context])
+                    if smallest_context < self.max_length:
+                        max_generated_tokens = self.max_length - smallest_context
+                        max_context_size_allowed = self.max_length
+                    else:
+                        # The max context size is smaller than the smallest context
+                        max_generated_tokens = 1
+                        max_context_size_allowed = self.max_length - 1
+                        hlog_warn(
+                            f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in {[i.task_name for i in batch]}. This is likely to lead to some errors."
+                        )
+                else:
+                    max_generated_tokens = batch[0].generation_size
+                    max_context_size_allowed = self.max_length - max_generated_tokens
 
                 tokenized = self.tokenizer(
                     context,
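To make the budgeting rule in the second hunk easier to follow, here is a standalone sketch with the batch reduced to plain token counts (generation_budget is a hypothetical helper written for this illustration, not part of lighteval's API):

from typing import List, Optional, Tuple


def generation_budget(
    context_lengths: List[int], generation_size: Optional[int], max_length: int
) -> Tuple[int, int]:
    """Return (max_generated_tokens, max_context_size_allowed) for one batch."""
    if generation_size is None:
        # No constraint on generation: spend whatever the model context leaves
        # after the shortest prompt in the batch.
        smallest_context = min(context_lengths)
        if smallest_context < max_length:
            return max_length - smallest_context, max_length
        # Even the shortest prompt overflows the model context: keep a single
        # generated token and truncate prompts to max_length - 1 (the real code
        # also emits a warning here).
        return 1, max_length - 1
    # Bounded generation: reserve generation_size tokens out of the window.
    return generation_size, max_length - generation_size


print(generation_budget([120, 300], generation_size=None, max_length=2048))  # (1928, 2048)
print(generation_budget([120, 300], generation_size=256, max_length=2048))   # (256, 1792)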
30 changes: 28 additions & 2 deletions src/lighteval/tasks/lighteval_task.py
@@ -42,6 +42,29 @@
 
 @dataclass
 class LightevalTaskConfig:
+    """Stored configuration of a given [`LightevalTask`].
+
+    Arguments:
+        name (str): Short name of the evaluation task.
+        suite (list[str]): Evaluation suites to which the task belongs.
+        prompt_function (str): Name of the function used to create the [`Doc`] samples from each line of the evaluation dataset.
+        hf_repo (str): Path of the hub dataset repository containing the evaluation information.
+        hf_subset (str): Subset used for the current task, will be default if none is selected.
+        hf_avail_splits (list[str]): All the available splits in the evaluation dataset
+        evaluation_splits (list[str]): List of the splits actually used for this evaluation
+        few_shots_split (str): Name of the split from which to sample few-shot examples
+        few_shots_select (str): Method with which to sample few-shot examples
+        generation_size (int): Maximum allowed size of the generation
+        metric (list[str]): List of all the metrics for the current task.
+        stop_sequence (list[str]): Stop sequence which interrupts the generation for generative metrics.
+        original_num_docs (int): Number of documents in the task
+        effective_num_docs (int): Number of documents used in a specific evaluation
+        truncated_num_docs (bool): Whether less than the total number of documents were used
+        output_regex (str)
+        frozen (bool)
+
+    """
+
     name: str
     prompt_function: str
     hf_repo: str
@@ -51,12 +74,15 @@ class LightevalTaskConfig:
     evaluation_splits: Optional[Tuple[str]] = None
     few_shots_split: Optional[str] = None
    few_shots_select: Optional[str] = None
-    generation_size: int = -1
+    generation_size: int = None
     stop_sequence: Optional[Tuple[str]] = None
     output_regex: Optional[str] = None
 
     frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
+    suite: Optional[Tuple[str]] = None
+
+    original_num_docs: int = -1
+    effective_num_docs: int = -1
 
     def as_dict(self):
         return {
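With the new default, a task that should generate freely until a stop sequence (or the model context) can simply pass generation_size=None. A sketch of such a config, assuming name, prompt_function, hf_repo, hf_subset, and metric are the required fields; the dataset repo, prompt function, and metric names below are placeholders:

from lighteval.tasks.lighteval_task import LightevalTaskConfig

open_ended_task = LightevalTaskConfig(
    name="my_open_ended_task",            # placeholder task name
    prompt_function="my_prompt_fn",       # placeholder prompt-builder name
    hf_repo="my-org/my-eval-dataset",     # placeholder dataset repository
    hf_subset="default",
    metric=("exact_match",),              # placeholder metric list
    evaluation_splits=("test",),
    generation_size=None,                 # unbounded: limited only by the model context
    stop_sequence=("\n",),
    suite=("custom",),
)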