From 41838c003dc1590340c01be0bb82949cd5f0284d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 9 May 2025 13:16:07 +0200 Subject: [PATCH 01/12] update for CB --- .../models/transformers/transformers_model.py | 164 +++++++++++++++++- 1 file changed, 158 insertions(+), 6 deletions(-) diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index ca5bd001e..b16ee17ab 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -254,14 +254,15 @@ def from_model( # Instanciate the object without using __init__ self = cls.__new__(cls) - self.config = config self.transformers_config = model.config - self.generation_config_dict = config.generation_parameters.to_transformers_dict() + self.config = config if config is not None else TransformersModelConfig(model_name=model.config.name_or_path) + if config is not None: + self.generation_config_dict = config.generation_parameters.to_transformers_dict() self._max_length = self._init_max_length() self._tokenizer = self._create_auto_tokenizer() - self.batch_size = config.batch_size + self.batch_size = getattr(config, "batch_size", None) self.model_name = _simplify_name(model.name_or_path) - self.model_sha = config.get_model_sha() + self.model_sha = self.config.get_model_sha() # If model_parallel is not set we compare the number of processes with the number of GPUs self.model = model @@ -508,7 +509,119 @@ def greedy_until_multi_turn( # noqa: C901 ) -> GenerativeMultiturnResponse: raise NotImplementedError("This method is not implemented for this model") - def greedy_until( + + def _continious_greedy_until( + self, + requests: list[GreedyUntilRequest], + ) -> list[GenerativeResponse]: + """ + Generates responses using a greedy decoding strategy until certain ending conditions are met. + + Args: + requests (list[Request]): list of requests containing the context and ending conditions. + override_bs (int, optional): Override the batch size for generation. Defaults to None. + + Returns: + list[GenerateReturn]: list of generated responses. + """ + for request in requests: + request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] + request.tokenized_context = self.tok_encode(request.context) + + dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) + results = [] + + for split in tqdm( + dataset.splits_iterator(), + total=dataset.num_dataset_splits, + desc="Splits", + position=0, + disable=False, # self.disable_tqdm, + ): + # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: + stop_tokens = [] + else: + # NOTE: we are assuming all items in a batch behave similarly (same + # stop_tokens and max_tokens genrated) which is not necessarily + # the case! Because of that we only use batch size of 1 + stop_tokens = split[0].stop_sequence + + max_new_tokens = self.config.generation_parameters.max_new_tokens or split[0].generation_size + returns_logits = split[0].use_logits + num_samples = split[0].num_samples + + context = [sample.context for sample in split] + tokenized = self.tokenizer(context, add_special_tokens=self.add_special_tokens) + + # The main question for this step is the following: + # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk + # of losing some meaning, or have some generations that are exceedingly short? + # The choice we go for here is to avoid truncating the prompt if we can, since it + # should have been managed by the prompt creator/few shot manager if requested by the user. + inputs = tokenized["input_ids"] + context_size = len(inputs[0]) + + # left truncate the inputs to the maximum length + if max_new_tokens is not None: + if context_size + max_new_tokens > self.max_length: + logger.warning( + f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + ) + context_size = self.max_length - max_new_tokens + if context_size < 0: + logger.critical( + f"{context_size=} is less than 0, either reduce the max_new_tokens or increase model max length." + ) + raise ValueError("Context size is less than 0.") + inputs = [input[-context_size:] for input in inputs] + else: + if context_size > self.max_length: + logger.warning( + f"{context_size=} which is greater than {self.max_length=}. Truncating context to {self.max_length} tokens." + ) + context_size = self.max_length + inputs = [input[-context_size:] for input in inputs] + + _outputs = self._generate( + inputs=inputs, + max_new_tokens=max_new_tokens, + stop_tokens=stop_tokens, + returns_logits=returns_logits, + num_samples=num_samples, + ) + + for _output in _outputs: + output_token_ids = [] + logprobs_raw = [] + result = [] + + # for output in _output.outputs: + output_token_ids.append(_output) + # logprobs_raw.append(output.logprobs) + result.append(self.tokenizer.decode(_output)) + + if logprobs_raw and output_token_ids and False: + logprobs = [ + logprobs_raw[0][token_id].logprob + for token_id in output_token_ids[0] + ] + else: + logprobs = [] + + input_token_ids = _output.prompt_token_ids + + cur_response = GenerativeResponse( + result=result, + logits=logprobs, + generated_tokens=output_token_ids, + # input_tokens=input_token_ids, + ) + results.append(cur_response) + + return dataset.get_original_order(results) + + def _padded_greedy_until( self, requests: list[GreedyUntilRequest], ) -> list[GenerativeResponse]: @@ -625,12 +738,41 @@ def greedy_until( returns_logits=returns_logits, num_samples=num_samples, do_sample=do_sample, + use_fast=False ) results.extend(cur_reponses) return dataset.get_original_order(results) - def _generate( + def greedy_until( + self, + requests: list[GreedyUntilRequest], + use_fast: bool = True, + ) -> list[GenerativeResponse]: + if use_fast: + return self._continious_greedy_until(requests) + else: + return self._padded_greedy_until(requests) + + def _generate_fast( + self, + inputs: list[list[int]], + max_new_tokens: Optional[int] = None, + stop_tokens: Optional[list[str]] = None, + returns_logits: Optional[bool] = False, + num_samples: int = 1, + generate: bool = True, + ) -> list[GenerativeResponse]: + # Compute model generation + batch_outputs = self.model.generate_batch( + inputs=inputs, + generation_config=self.model.generation_config, + # You can pass request-specific overrides here, e.g., max_new_tokens=100 + ) + + return batch_outputs + + def _generate_padded( self, batch: Batch, max_new_tokens: int, @@ -711,6 +853,16 @@ def _generate( return all_responses + def _generate( + self, + use_fast: bool = True, + **kwargs, + ) -> list[GenerativeResponse]: + if use_fast: + return self._generate_fast(**kwargs) + else: + return self._generate_padded(**kwargs) + def loglikelihood( self, requests: list[LoglikelihoodRequest], From f7a3c2f37e26de88e35b707ab5421ec3ca43158d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 9 May 2025 14:07:35 +0200 Subject: [PATCH 02/12] update --- src/lighteval/models/transformers/transformers_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index b16ee17ab..7037d758d 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -597,9 +597,9 @@ def _continious_greedy_until( result = [] # for output in _output.outputs: - output_token_ids.append(_output) + output_token_ids.append(_output["output_ids"]) # logprobs_raw.append(output.logprobs) - result.append(self.tokenizer.decode(_output)) + result.append(self.tokenizer.decode(_output["output_ids"])) if logprobs_raw and output_token_ids and False: logprobs = [ @@ -609,13 +609,13 @@ def _continious_greedy_until( else: logprobs = [] - input_token_ids = _output.prompt_token_ids + input_token_ids = _output["prompt_token_ids"] cur_response = GenerativeResponse( result=result, logits=logprobs, generated_tokens=output_token_ids, - # input_tokens=input_token_ids, + input_tokens=input_token_ids, ) results.append(cur_response) From c9b3467af2a40dfbe7cbcc95445bf5d3288829f7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 15 May 2025 13:31:16 +0000 Subject: [PATCH 03/12] push --- .../models/transformers/transformers_model.py | 57 +++++++++---------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index 7037d758d..94e91eced 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -22,7 +22,7 @@ import logging import os -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Dict import torch import torch.nn.functional as F @@ -591,33 +591,32 @@ def _continious_greedy_until( num_samples=num_samples, ) - for _output in _outputs: - output_token_ids = [] - logprobs_raw = [] - result = [] - - # for output in _output.outputs: - output_token_ids.append(_output["output_ids"]) - # logprobs_raw.append(output.logprobs) - result.append(self.tokenizer.decode(_output["output_ids"])) - - if logprobs_raw and output_token_ids and False: - logprobs = [ - logprobs_raw[0][token_id].logprob - for token_id in output_token_ids[0] - ] - else: - logprobs = [] - - input_token_ids = _output["prompt_token_ids"] - - cur_response = GenerativeResponse( - result=result, - logits=logprobs, - generated_tokens=output_token_ids, - input_tokens=input_token_ids, - ) - results.append(cur_response) + for req_id, _output in _outputs.items(): + output_token_ids = [] + logprobs_raw = [] + result = [] + + # for output in _output.outputs: + output_token_ids.append(_output.static_outputs) + # logprobs_raw.append(output.logprobs) + result.append(self.tokenizer.decode(_output.static_outputs)) + + if logprobs_raw and output_token_ids and False: + logprobs = [ + logprobs_raw[0][token_id].logprob + for token_id in output_token_ids[0] + ] + else: + logprobs = [] + + input_token_ids = _output.full_prompt_ids + cur_response = GenerativeResponse( + result=result, + logits=logprobs, + generated_tokens=output_token_ids, + input_tokens=input_token_ids, + ) + results.append(cur_response) return dataset.get_original_order(results) @@ -762,7 +761,7 @@ def _generate_fast( returns_logits: Optional[bool] = False, num_samples: int = 1, generate: bool = True, - ) -> list[GenerativeResponse]: + ) -> Dict[str, GenerativeResponse]: # Compute model generation batch_outputs = self.model.generate_batch( inputs=inputs, From a7e2751b8e6d13bc60282896f733c777317026e3 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 19 May 2025 15:59:06 +0000 Subject: [PATCH 04/12] c'est une honte, 0.2.... ruff.... --- community_tasks/arabic_evals.py | 1 + examples/nanotron/custom_evaluation_tasks.py | 1 + src/lighteval/logging/evaluation_tracker.py | 2 +- src/lighteval/metrics/imports/bert_scorer.py | 1 + src/lighteval/metrics/metrics.py | 16 ++++++------- src/lighteval/metrics/metrics_corpus.py | 1 + .../models/endpoints/openai_model.py | 2 +- .../models/transformers/transformers_model.py | 12 ++++------ src/lighteval/pipeline.py | 2 +- src/lighteval/tasks/default_prompts.py | 18 ++++++-------- src/lighteval/tasks/extended/hle/main.py | 2 +- .../tasks/extended/ifeval/instructions.py | 24 +++++++++---------- .../extended/ifeval/instructions_registry.py | 1 + src/lighteval/tasks/extended/ifeval/main.py | 2 +- .../tasks/extended/tiny_benchmarks/main.py | 7 +++--- 15 files changed, 45 insertions(+), 47 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 31b1b8752..55165074a 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -26,6 +26,7 @@ This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. """ + import random import re from typing import Any, Dict, List, Optional, Union diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index 2fd85f69b..49010098c 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -26,6 +26,7 @@ This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. """ + import re from dataclasses import asdict from typing import Dict, List, Tuple diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 2694bae81..374cb90d7 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -591,7 +591,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 f"To load the details from a run, you can for instance do the following:\n" f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n' f"## Latest results\n\n" - f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})' + f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})" f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. " f'You find each in the results and the "latest" split for each eval):\n\n' f"```python\n{results_string}\n```", diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index 1012bc3f7..80ee47357 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -22,6 +22,7 @@ # SOFTWARE. """Simplified version of the BertScorer lib - we only import what we need.""" + import logging import os import time diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 0aede953d..61c3dfefb 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -616,16 +616,16 @@ class Metrics(Enum): sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute, category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.REASONING, - corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, - higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, + corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), + higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), ) g_pass_at_8_16 = SampleLevelMetricGrouping( metric_name="G-Pass@8-16:48_samples", sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute, category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.REASONING, - corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, - higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, + corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), + higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), ) g_pass_at_16_expr_gold = SampleLevelMetricGrouping( metric_name="G-Pass@16:48_samples", @@ -645,8 +645,8 @@ class Metrics(Enum): ).compute, category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.REASONING, - corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, - higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, + corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), + higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), ) g_pass_at_16_latex_gold = SampleLevelMetricGrouping( metric_name="G-Pass@16:48_samples", @@ -666,8 +666,8 @@ class Metrics(Enum): ).compute, category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.REASONING, - corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, - higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics}, + corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean), + higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True), ) perfect_exact_match = SampleLevelMetric( metric_name="perfect_em", diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 3c2de418f..87b785fd0 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -24,6 +24,7 @@ Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus. A number of these aggregations come from the EleutherAIHarness """ + import logging import math from typing import Literal diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py index 42771b1ae..18b2114c4 100644 --- a/src/lighteval/models/endpoints/openai_model.py +++ b/src/lighteval/models/endpoints/openai_model.py @@ -259,7 +259,7 @@ def _loglikelihood_tokens( new_tokens == 1 for new_tokens in max_new_tokens ), "Only single token continuations are supported when using openai API." - logit_biases = [{tok: 100 for tok in sample.tokenized_continuation} for sample in split] + logit_biases = [dict.fromkeys(sample.tokenized_continuation, 100) for sample in split] outputs = self.__call_api_parallel( inputs, return_logits=True, max_new_tokens=max_new_tokens, num_samples=1, logit_bias=logit_biases diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index 94e91eced..a1124c85a 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -22,7 +22,7 @@ import logging import os -from typing import Optional, Tuple, Union, Dict +from typing import Dict, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -509,8 +509,7 @@ def greedy_until_multi_turn( # noqa: C901 ) -> GenerativeMultiturnResponse: raise NotImplementedError("This method is not implemented for this model") - - def _continious_greedy_until( + def _continious_greedy_until( self, requests: list[GreedyUntilRequest], ) -> list[GenerativeResponse]: @@ -602,10 +601,7 @@ def _continious_greedy_until( result.append(self.tokenizer.decode(_output.static_outputs)) if logprobs_raw and output_token_ids and False: - logprobs = [ - logprobs_raw[0][token_id].logprob - for token_id in output_token_ids[0] - ] + logprobs = [logprobs_raw[0][token_id].logprob for token_id in output_token_ids[0]] else: logprobs = [] @@ -737,7 +733,7 @@ def _padded_greedy_until( returns_logits=returns_logits, num_samples=num_samples, do_sample=do_sample, - use_fast=False + use_fast=False, ) results.extend(cur_reponses) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 39e007b33..63c62964d 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -383,7 +383,7 @@ def tensor_replacer(match): try: return ast.literal_eval(processed) except Exception as e: - raise ValueError(f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}") + raise ValueError(f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}") def _load_responses_from_details(self): logger.info("--- LOADING RESPONSES FROM DETAILS ---") diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 2745b63c5..afb210381 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -749,7 +749,7 @@ def ethics_commonsense(line, task_name: str = None): def ethics_deontology(line, task_name: str = None): return Doc( task_name=task_name, - query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']} {line['excuse']}\"\nAnswer:", + query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]} {line["excuse"]}"\nAnswer:', choices=[" unreasonable", " reasonable"], gold_index=int(line["label"]), instruction="Question: Would most people believe this reasonable or unreasonable to say?", @@ -759,7 +759,7 @@ def ethics_deontology(line, task_name: str = None): def ethics_justice(line, task_name: str = None): return Doc( task_name=task_name, - query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:", + query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:', choices=[" unreasonable", " reasonable"], gold_index=int(line["label"]), instruction="Question: Would most people believe this reasonable or unreasonable to say?", @@ -782,7 +782,7 @@ def ethics_utilitarianism(line, task_name: str = None): def ethics_virtue(line, task_name: str = None): return Doc( task_name=task_name, - query=f"Sentence: {line['scenario']}\nQuestion: Does the character in this sentence exhibit the trait \"{line['trait']}\"?\nAnswer:", + query=f'Sentence: {line["scenario"]}\nQuestion: Does the character in this sentence exhibit the trait "{line["trait"]}"?\nAnswer:', choices=[" no", " yes"], gold_index=int(line["label"]), ) @@ -1159,24 +1159,21 @@ def lextreme_covid19_emergency_event(line, task_name: str = None): def lextreme_multi_eurlex_level_1(line, task_name: str = None): instruction = ( - "In this task, you are given a document from an EU law. " - "Predict the level 1 concept in the EUROVOC taxonomy." + "In this task, you are given a document from an EU law. Predict the level 1 concept in the EUROVOC taxonomy." ) return lextreme(line, instruction, task_name) def lextreme_multi_eurlex_level_2(line, task_name: str = None): instruction = ( - "In this task, you are given a document from an EU law. " - "Predict the level 2 concept in the EUROVOC taxonomy." + "In this task, you are given a document from an EU law. Predict the level 2 concept in the EUROVOC taxonomy." ) return lextreme(line, instruction, task_name) def lextreme_multi_eurlex_level_3(line, task_name: str = None): instruction = ( - "In this task, you are given a document from an EU law. " - "Predict the level 3 concept in the EUROVOC taxonomy." + "In this task, you are given a document from an EU law. Predict the level 3 concept in the EUROVOC taxonomy." ) return lextreme(line, instruction, task_name) @@ -1184,8 +1181,7 @@ def lextreme_multi_eurlex_level_3(line, task_name: str = None): def lextreme_greek_legal_ner(line, task_name: str = None): instruction = ( - "In this task, you are given a sentence from Greek legislation. " - "Predict the named entity type for each token." + "In this task, you are given a sentence from Greek legislation. Predict the named entity type for each token." ) return lextreme(line, instruction, task_name) diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py index 9f6a85610..76a63c1ad 100644 --- a/src/lighteval/tasks/extended/hle/main.py +++ b/src/lighteval/tasks/extended/hle/main.py @@ -208,7 +208,7 @@ def hle_text_only(line, task_name: str = None): hle_metrics = CorpusLevelMetricGrouping( metric_name=["accuracy", "confidence_half_width", "calibration_error"], - higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]}, + higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True), category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.ACCURACY, sample_level_fn=JudgeLLMHLE().compute, diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/extended/ifeval/instructions.py index 7c8591ae2..ee9e7b88b 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions.py +++ b/src/lighteval/tasks/extended/ifeval/instructions.py @@ -13,6 +13,7 @@ # limitations under the License. """Library of instructions.""" + import collections import json import logging @@ -204,7 +205,7 @@ def build_description(self, *, num_sentences=None, relation=None): self._comparison_relation = random.choice(_COMPARISON_RELATION) elif relation not in _COMPARISON_RELATION: raise ValueError( - "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given." ) else: self._comparison_relation = relation @@ -663,7 +664,7 @@ def build_description(self, *, original_message): A string representing the instruction description. """ if not self.is_change(original_message): - raise ValueError(f"Message {original_message} does not contain changes " "in the form of *change me*.") + raise ValueError(f"Message {original_message} does not contain changes in the form of *change me*.") self._reference_without_change = original_message self._description = ( @@ -694,7 +695,7 @@ def check_following(self, value): """ if not self.is_change(value): - raise ValueError(f"value {value} does not contain " "changes in the form of *change me*.") + raise ValueError(f"value {value} does not contain changes in the form of *change me*.") response_without_changes = self.strip_changes(value) reference_without_changes = self.strip_changes(self._reference_without_change) @@ -782,7 +783,7 @@ def build_description(self, *, keyword=None, frequency=None, relation=None): self._comparison_relation = random.choice(_COMPARISON_RELATION) elif relation not in _COMPARISON_RELATION: raise ValueError( - "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given." ) else: self._comparison_relation = relation @@ -846,7 +847,7 @@ def build_description(self, *, num_words=None, relation=None): self._comparison_relation = random.choice(_COMPARISON_RELATION) elif relation not in _COMPARISON_RELATION: raise ValueError( - "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given." ) else: self._comparison_relation = relation @@ -878,7 +879,7 @@ class JsonFormat(Instruction): def build_description(self): self._description_pattern = ( - "Entire output should be wrapped in JSON format. You can use markdown" " ticks such as ```." + "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```." ) return self._description_pattern @@ -1250,7 +1251,7 @@ def build_description(self, *, end_phrase=None): if self._end_phrase is None: self._end_phrase = random.choice(_ENDING_OPTIONS) self._description_pattern = ( - "Finish your response with this exact phrase {ender}. " "No other words should follow this phrase." + "Finish your response with this exact phrase {ender}. No other words should follow this phrase." ) return self._description_pattern.format(ender=self._end_phrase) @@ -1274,7 +1275,7 @@ class TitleChecker(Instruction): def build_description(self): """Build the instruction description.""" self._description_pattern = ( - "Your answer must contain a title, wrapped in double angular brackets," " such as <>." + "Your answer must contain a title, wrapped in double angular brackets, such as <>." ) return self._description_pattern @@ -1337,7 +1338,7 @@ def build_description(self, *, letter=None, let_frequency=None, let_relation=Non self._comparison_relation = let_relation self._description_pattern = ( - "In your response, the letter {letter} should appear {let_relation}" " {let_frequency} times." + "In your response, the letter {letter} should appear {let_relation} {let_frequency} times." ) return self._description_pattern.format( @@ -1402,8 +1403,7 @@ class LowercaseLettersEnglishChecker(Instruction): def build_description(self): """Build the instruction description.""" self._description_pattern = ( - "Your entire response should be in English, and in all lowercase" - " letters. No capital letters are allowed." + "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed." ) return self._description_pattern @@ -1479,7 +1479,7 @@ def build_description( ) self._description_pattern = ( - "In your response, words with all capital letters should appear" " {relation} {frequency} times." + "In your response, words with all capital letters should appear {relation} {frequency} times." ) return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation) diff --git a/src/lighteval/tasks/extended/ifeval/instructions_registry.py b/src/lighteval/tasks/extended/ifeval/instructions_registry.py index 611e607dc..62becfbaa 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions_registry.py +++ b/src/lighteval/tasks/extended/ifeval/instructions_registry.py @@ -13,6 +13,7 @@ # limitations under the License. """Registry of all instructions.""" + import lighteval.tasks.extended.ifeval.instructions as instructions diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/extended/ifeval/main.py index 60d1be5fa..f460c288a 100644 --- a/src/lighteval/tasks/extended/ifeval/main.py +++ b/src/lighteval/tasks/extended/ifeval/main.py @@ -127,7 +127,7 @@ def agg_inst_level_acc(items): ifeval_metrics = SampleLevelMetricGrouping( metric_name=submetric_names, - higher_is_better={n: True for n in submetric_names}, + higher_is_better=dict.fromkeys(submetric_names, True), category=MetricCategory.GENERATIVE, use_case=MetricUseCase.ACCURACY, sample_level_fn=ifeval_metric, diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index fae6e89df..3e4cfed6f 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -26,6 +26,7 @@ Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0|0,extended|tiny:gsm8k|0|0,extended|tiny:hellaswag|0|0,extended|tiny:arc|0|0,extended|tiny:truthfulqa|0|0" --extended_tasks extended_tasks --output_dir "./evals"` """ + import os import pathlib import pickle @@ -105,10 +106,10 @@ def compute(self, **args): res = ExactMatches( strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer ).compute(**args) - return {m: res for m in self.METRICS} + return dict.fromkeys(self.METRICS, res) else: res = LoglikelihoodAcc().compute(**args) - return {m: res for m in self.METRICS} + return dict.fromkeys(self.METRICS, res) def aggregate(self, y_input): if len(y_input) == self.num_samples and self.estimates is not None: @@ -276,7 +277,7 @@ def aggregate(self, y_input): f"tinybench_metric_{name}", CorpusLevelMetricGrouping( metric_name=TinyCorpusAggregator.METRICS, - higher_is_better={m: True for m in TinyCorpusAggregator.METRICS}, + higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), sample_level_fn=TinyCorpusAggregator(name).compute, category=category, use_case=use_case, From 0f772b1a3f7eb91f1650971133ae86c913df9dc9 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 30 Jun 2025 13:22:38 +0000 Subject: [PATCH 05/12] merge main --- .../model_configs/transformers_model.yaml | 11 +++-- src/lighteval/models/model_input.py | 5 ++ .../models/transformers/transformers_model.py | 46 +++++++------------ 3 files changed, 27 insertions(+), 35 deletions(-) diff --git a/examples/model_configs/transformers_model.yaml b/examples/model_configs/transformers_model.yaml index 5145d8252..61fcc31a2 100644 --- a/examples/model_configs/transformers_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -1,12 +1,13 @@ model_parameters: - model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" - revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" + model_name: "meta-llama/Llama-3.1-8B-Instruct" + revision: "main" dtype: "float16" compile: false model_parallel: false batch_size: 1 - multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If none, will do nothing use_chat_template: true + model_loading_kwargs: + attn_implementation: "sdpa_paged" generation_parameters: - temperature: 0.0 - top_p: 0.9 + num_blocks: 2048 + block_size: 256 diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index d4e3d2bd2..2d8a53fcb 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -25,6 +25,9 @@ class GenerationParameters(BaseModel, extra="forbid"): + num_blocks: NonNegativeInt | None = None # transformers + block_size: NonNegativeInt | None = None # transformers + early_stopping: bool | None = None # transformers repetition_penalty: NonNegativeFloat | None = None # vllm, transformers, tgi, sglang frequency_penalty: NonNegativeFloat | None = None # vllm, tgi, sglang @@ -186,6 +189,8 @@ def to_transformers_dict(self) -> dict: "repetition_penalty": self.repetition_penalty, "length_penalty": self.length_penalty, "output_scores": True, + "num_blocks": self.num_blocks, + "block_size": self.block_size, "return_dict_in_generate": True, } return {k: v for k, v in args.items() if v is not None} diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index 01d8e9505..5abe5517e 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -22,9 +22,9 @@ import logging import os -from typing import Dict, Optional, Tuple, Union from datetime import timedelta -from typing import Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union + import torch import torch.nn.functional as F import transformers @@ -505,13 +505,6 @@ def forward_batch(batch_size): logger.info(f"Determined largest batch size: {batch_size}") return batch_size - - def greedy_until_multi_turn( # noqa: C901 - self, - docs: list[Doc], - ) -> ModelResponse: - raise NotImplementedError("This method is not implemented for this model") - def _continious_greedy_until( self, docs: list[Doc], @@ -526,11 +519,7 @@ def _continious_greedy_until( Returns: list[GenerateReturn]: list of generated responses. """ - for request in requests: - request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] - request.tokenized_context = self.tok_encode(request.context) - - dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) + dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) results = [] for split in tqdm( @@ -552,9 +541,8 @@ def _continious_greedy_until( max_new_tokens = self.config.generation_parameters.max_new_tokens or split[0].generation_size returns_logits = split[0].use_logits num_samples = split[0].num_samples - - context = [sample.context for sample in split] - tokenized = self.tokenizer(context, add_special_tokens=self.add_special_tokens) + contexts = [self.prompt_manager.prepare_prompt(doc) for doc in split] + tokenized = self.tokenizer(contexts, add_special_tokens=self.add_special_tokens) # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk @@ -609,18 +597,17 @@ def _continious_greedy_until( logprobs = [] input_token_ids = _output.full_prompt_ids - cur_response = GenerativeResponse( - result=result, - logits=logprobs, - generated_tokens=output_token_ids, + cur_response = ModelResponse( + text=result, + logprobs=logprobs, + output_tokens=output_token_ids, input_tokens=input_token_ids, ) results.append(cur_response) return dataset.get_original_order(results) - - def greedy_until( + def _padded_greedy_until( self, docs: list[Doc], ) -> list[ModelResponse]: @@ -733,7 +720,6 @@ def greedy_until( stop_tokens=stop_tokens, returns_logits=False, num_samples=num_samples, - do_sample=do_sample, use_fast=False, ) results.extend(cur_reponses) @@ -742,13 +728,13 @@ def greedy_until( def greedy_until( self, - requests: list[GreedyUntilRequest], + docs: list[Doc], use_fast: bool = True, - ) -> list[GenerativeResponse]: + ) -> list[ModelResponse]: if use_fast: - return self._continious_greedy_until(requests) + return self._continious_greedy_until(docs) else: - return self._padded_greedy_until(requests) + return self._padded_greedy_until(docs) def _generate_fast( self, @@ -758,7 +744,7 @@ def _generate_fast( returns_logits: Optional[bool] = False, num_samples: int = 1, generate: bool = True, - ) -> Dict[str, GenerativeResponse]: + ) -> Dict[str, ModelResponse]: # Compute model generation batch_outputs = self.model.generate_batch( inputs=inputs, @@ -858,7 +844,7 @@ def _generate( self, use_fast: bool = True, **kwargs, - ) -> list[GenerativeResponse]: + ) -> list[ModelResponse]: if use_fast: return self._generate_fast(**kwargs) else: From df98d9be079fac2f89099b39d65763c733c070da Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 3 Jul 2025 13:42:39 +0000 Subject: [PATCH 06/12] fix model --- examples/model_configs/transformers_model.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/model_configs/transformers_model.yaml b/examples/model_configs/transformers_model.yaml index 61fcc31a2..763ce3f00 100644 --- a/examples/model_configs/transformers_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -6,8 +6,10 @@ model_parameters: model_parallel: false batch_size: 1 use_chat_template: true + continuous_batching: true model_loading_kwargs: attn_implementation: "sdpa_paged" generation_parameters: - num_blocks: 2048 + num_blocks: 1024 block_size: 256 + #max_new_tokens: 512 From 1da56bdd4bcc48ea234d23817f5d8217b25d1885 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 4 Jul 2025 13:32:10 +0000 Subject: [PATCH 07/12] fix model --- .../model_configs/transformers_model.yaml | 12 ++++--- examples/model_configs/vllm_model_config.yaml | 8 ++--- .../models/transformers/transformers_model.py | 36 ++++++++++++------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/examples/model_configs/transformers_model.yaml b/examples/model_configs/transformers_model.yaml index 763ce3f00..5a53d82e8 100644 --- a/examples/model_configs/transformers_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -4,12 +4,14 @@ model_parameters: dtype: "float16" compile: false model_parallel: false - batch_size: 1 + batch_size: 10 use_chat_template: true continuous_batching: true model_loading_kwargs: - attn_implementation: "sdpa_paged" + attn_implementation: "paged_attention" + #tp_plan: "auto" generation_parameters: - num_blocks: 1024 - block_size: 256 - #max_new_tokens: 512 + num_blocks: 4096 + block_size: 64 + max_new_tokens: 256 + temperature: 0.2 diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 66714a298..97e7efa8f 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -1,8 +1,8 @@ model_parameters: - model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" - revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" + model_name: "meta-llama/Llama-3.1-8B-Instruct" + revision: "main" dtype: "float16" - tensor_parallel_size: 1 + tensor_parallel_size: 2 data_parallel_size: 1 pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 @@ -28,7 +28,7 @@ model_parameters: top_p: 0.9 seed: 42 stop_tokens: null - max_new_tokens: 2048 + max_new_tokens: 216 min_new_tokens: 0 metrics_options: yo: null diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index 5abe5517e..6a365b383 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -41,6 +41,7 @@ BitsAndBytesConfig, PretrainedConfig, ) +from transformers.generation.configuration_utils import GenerationConfig from transformers.generation.utils import GenerateOutput from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES @@ -110,6 +111,8 @@ class TransformersModelConfig(ModelConfig): True forces adding space, False removes leading space if present. pairwise_tokenization (bool): Whether to tokenize context and continuation separately or together. Defaults to False. + continuous_batching (bool): + Whether to use continuous batching for generation. Defaults to False. Example: ```python @@ -147,6 +150,7 @@ class TransformersModelConfig(ModelConfig): compile: bool = False multichoice_continuations_start_space: bool | None = None pairwise_tokenization: bool = False + continuous_batching: bool = False def model_post_init(self, __context): if self.multichoice_continuations_start_space is True: @@ -190,7 +194,9 @@ def __init__( self._add_special_tokens = config.add_special_tokens or False self.pairwise_tokenization = config.pairwise_tokenization self.batch_size = config.batch_size + self.continuous_batching = config.continuous_batching self.transformers_config = config.get_transformers_config() + self.generation_config_dict = config.generation_parameters.to_transformers_dict() self.model_sha = config.get_model_sha() self._max_length = self._init_max_length() @@ -210,8 +216,6 @@ def __init__( self.model_name = _simplify_name(config.model_name) - self.generation_config_dict = config.generation_parameters.to_transformers_dict() - if is_accelerate_available(): model_size, _ = calculate_maximum_sizes(self.model) model_size = convert_bytes(model_size) @@ -403,6 +407,11 @@ def _create_auto_model(self) -> transformers.PreTrainedModel: # model.to(self.device) model.eval() torch.set_grad_enabled(False) + if self.continuous_batching: + generation_config = GenerationConfig( + **self.generation_config_dict, + ) + model.generation_config = generation_config if self.config.compile: try: @@ -505,7 +514,7 @@ def forward_batch(batch_size): logger.info(f"Determined largest batch size: {batch_size}") return batch_size - def _continious_greedy_until( + def _continuous_greedy_until( self, docs: list[Doc], ) -> list[ModelResponse]: @@ -579,6 +588,7 @@ def _continious_greedy_until( stop_tokens=stop_tokens, returns_logits=returns_logits, num_samples=num_samples, + continuous_batching=True, ) for req_id, _output in _outputs.items(): @@ -587,16 +597,16 @@ def _continious_greedy_until( result = [] # for output in _output.outputs: - output_token_ids.append(_output.static_outputs) + output_token_ids.append(_output.generated_tokens) # logprobs_raw.append(output.logprobs) - result.append(self.tokenizer.decode(_output.static_outputs)) + result.append(self.tokenizer.decode(_output.generated_tokens)) if logprobs_raw and output_token_ids and False: logprobs = [logprobs_raw[0][token_id].logprob for token_id in output_token_ids[0]] else: logprobs = [] - input_token_ids = _output.full_prompt_ids + input_token_ids = _output.prompt_ids cur_response = ModelResponse( text=result, logprobs=logprobs, @@ -720,7 +730,7 @@ def _padded_greedy_until( stop_tokens=stop_tokens, returns_logits=False, num_samples=num_samples, - use_fast=False, + continuous_batching=False, ) results.extend(cur_reponses) @@ -729,10 +739,9 @@ def _padded_greedy_until( def greedy_until( self, docs: list[Doc], - use_fast: bool = True, ) -> list[ModelResponse]: - if use_fast: - return self._continious_greedy_until(docs) + if self.continuous_batching: + return self._continuous_greedy_until(docs) else: return self._padded_greedy_until(docs) @@ -746,6 +755,9 @@ def _generate_fast( generate: bool = True, ) -> Dict[str, ModelResponse]: # Compute model generation + self.model.generation_config.use_cuda_graph = False # Disable CUDA graph for batch generation + self.model.generation_config.max_batch_tokens = 256 # Disable CUDA graph for batch generation + # self.model.generation_config.do_sample = False # Disable CUDA graph for batch generation batch_outputs = self.model.generate_batch( inputs=inputs, generation_config=self.model.generation_config, @@ -842,10 +854,10 @@ def _generate_padded( def _generate( self, - use_fast: bool = True, + continuous_batching: bool, **kwargs, ) -> list[ModelResponse]: - if use_fast: + if continuous_batching: return self._generate_fast(**kwargs) else: return self._generate_padded(**kwargs) From fe6f24c47974bb6a5096e521ec06de049aa6f716 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 7 Jul 2025 14:37:58 +0000 Subject: [PATCH 08/12] fix tests --- examples/model_configs/transformers_model.yaml | 10 +++++----- tests/models/endpoints/test_endpoint_model.py | 2 ++ tests/models/endpoints/test_tgi_model.py | 2 ++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/model_configs/transformers_model.yaml b/examples/model_configs/transformers_model.yaml index 5a53d82e8..9f41d502a 100644 --- a/examples/model_configs/transformers_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -4,14 +4,14 @@ model_parameters: dtype: "float16" compile: false model_parallel: false - batch_size: 10 + batch_size: 20 use_chat_template: true - continuous_batching: true + continuous_batching: false model_loading_kwargs: - attn_implementation: "paged_attention" + attn_implementation: "flash_attention_2" #tp_plan: "auto" generation_parameters: - num_blocks: 4096 - block_size: 64 + #num_blocks: 4096 + #block_size: 64 max_new_tokens: 256 temperature: 0.2 diff --git a/tests/models/endpoints/test_endpoint_model.py b/tests/models/endpoints/test_endpoint_model.py index 820a23327..5b3aa7563 100644 --- a/tests/models/endpoints/test_endpoint_model.py +++ b/tests/models/endpoints/test_endpoint_model.py @@ -52,6 +52,8 @@ class TestInferenceEndpointModelConfig: "add_special_tokens": True, "system_prompt": None, "generation_parameters": { + "num_blocks": None, + "block_size": None, "early_stopping": None, "frequency_penalty": None, "length_penalty": None, diff --git a/tests/models/endpoints/test_tgi_model.py b/tests/models/endpoints/test_tgi_model.py index 93184d5a4..895871597 100644 --- a/tests/models/endpoints/test_tgi_model.py +++ b/tests/models/endpoints/test_tgi_model.py @@ -38,6 +38,8 @@ class TestTGIModelConfig: "model_name": None, "system_prompt": None, "generation_parameters": { + "block_size": None, + "num_blocks": None, "early_stopping": None, "frequency_penalty": None, "length_penalty": None, From 96466e4e3effafdc662a4aae5d82cc079350c951 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 7 Jul 2025 14:39:57 +0000 Subject: [PATCH 09/12] fix slow tests --- examples/model_configs/transformers_model.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/model_configs/transformers_model.yaml b/examples/model_configs/transformers_model.yaml index 9f41d502a..79e00ada0 100644 --- a/examples/model_configs/transformers_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -1,14 +1,14 @@ model_parameters: - model_name: "meta-llama/Llama-3.1-8B-Instruct" - revision: "main" + model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" + revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" dtype: "float16" compile: false model_parallel: false - batch_size: 20 + batch_size: 1 use_chat_template: true continuous_batching: false model_loading_kwargs: - attn_implementation: "flash_attention_2" + attn_implementation: "eager" #tp_plan: "auto" generation_parameters: #num_blocks: 4096 From 8344961fe8d7630cc0063873f65630cc9550f6f1 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 7 Jul 2025 14:40:38 +0000 Subject: [PATCH 10/12] fix slow tests --- examples/model_configs/transformers_model.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/model_configs/transformers_model.yaml b/examples/model_configs/transformers_model.yaml index 79e00ada0..acab31d33 100644 --- a/examples/model_configs/transformers_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -13,5 +13,6 @@ model_parameters: generation_parameters: #num_blocks: 4096 #block_size: 64 - max_new_tokens: 256 - temperature: 0.2 + #max_new_tokens: 256 + temperature: 0.0 + top_p: 0.9 From 7453c6f23c9dcd9f7f1743b96fc0fc3abbc95245 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 7 Jul 2025 14:41:34 +0000 Subject: [PATCH 11/12] reset vllm model file config --- examples/model_configs/vllm_model_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 97e7efa8f..66714a298 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -1,8 +1,8 @@ model_parameters: - model_name: "meta-llama/Llama-3.1-8B-Instruct" - revision: "main" + model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" + revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" dtype: "float16" - tensor_parallel_size: 2 + tensor_parallel_size: 1 data_parallel_size: 1 pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 @@ -28,7 +28,7 @@ model_parameters: top_p: 0.9 seed: 42 stop_tokens: null - max_new_tokens: 216 + max_new_tokens: 2048 min_new_tokens: 0 metrics_options: yo: null From c179876e094bb049f06d6c79d0eef8fc8995a54e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Fri, 1 Aug 2025 13:52:10 +0200 Subject: [PATCH 12/12] Apply suggestions from code review --- src/lighteval/models/transformers/transformers_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index 9eacb6301..fef8b0b5b 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -741,7 +741,7 @@ def greedy_until( else: return self._padded_greedy_until(docs) - def _generate_fast( + def _generate_continuous( self, inputs: list[list[int]], max_new_tokens: Optional[int] = None, @@ -854,7 +854,7 @@ def _generate( **kwargs, ) -> list[ModelResponse]: if continuous_batching: - return self._generate_fast(**kwargs) + return self._generate_continuous(**kwargs) else: return self._generate_padded(**kwargs)