Commit fb725e2

Upgrade huggingface_hub to fix datasets import and add trust_remote_code in datasets (#84)

1 parent 3da9220

File tree

6 files changed (+1184, -1160 lines)

README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -237,6 +237,7 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t
 - `metric` (list), the metrics you want to use for your evaluation (see next section for a detailed explanation)
 - `output_regex` (str), A regex string that will be used to filter your generation. (Generative metrics will only select tokens that are between the first and the second sequence matched by the regex. For example, for a regex matching `\n` and a generation `\nModel generation output\nSome other text` the metric will only be fed with `Model generation output`)
 - `frozen` (bool), for now is set to False, but we will steadily pass all stable tasks to True.
+- `trust_dataset` (bool), set to True if you trust the dataset.
 
 Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`.
 
```
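Put together with the existing fields, a task definition using the new flag looks like the sketch below. This is illustrative only: the task name, prompt function, and dataset repo are hypothetical, and the field list mirrors the README excerpt above plus the `LightevalTaskConfig` diff further down.

```python
# Hypothetical task definition showing where the new flag sits; only the
# trust_dataset line is specific to this commit.
from lighteval.tasks.lighteval_task import LightevalTaskConfig

yournewtask = LightevalTaskConfig(
    name="yournewtask",                    # hypothetical task name
    prompt_function="yournewtask_prompt",  # hypothetical prompt formatter
    hf_repo="your-org/your-dataset",       # hypothetical dataset repo
    hf_subset="default",
    metric=["loglikelihood_acc"],
    frozen=False,
    trust_dataset=True,  # opt in: lets the dataset's loading script execute
)
```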

community_tasks/arabic_evals.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -49,6 +49,7 @@ def __init__(
             stop_sequence=None,
             output_regex=None,
             frozen=False,
+            trust_dataset=True,
         )
 
 
@@ -115,6 +116,7 @@ def __init__(
             stop_sequence=None,
             output_regex=None,
             frozen=False,
+            trust_dataset=True,
         )
 
 
@@ -145,6 +147,7 @@ def acva(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc"],
+    trust_dataset=True,
 )
 
 
```
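As a standalone sketch, the pattern in the diff above amounts to a community task class defaulting `trust_dataset=True` once in its constructor, so every subset it spawns inherits it (the class, prompt function, repo, and subset names below are hypothetical):

```python
# Sketch of the community-task pattern from the diff above; every name is
# hypothetical except the trust_dataset kwarg itself.
from lighteval.tasks.lighteval_task import LightevalTaskConfig

class CustomTrustedTask(LightevalTaskConfig):
    def __init__(self, name: str, hf_subset: str):
        super().__init__(
            name=name,
            prompt_function="custom_prompt",  # hypothetical
            hf_repo="your-org/your-dataset",  # hypothetical
            hf_subset=hf_subset,
            metric=["loglikelihood_acc"],
            trust_dataset=True,  # defaulted once, inherited by every subset
        )

# One config per subset, each trusting the dataset's loading code:
TASKS = [CustomTrustedTask(name=f"custom:{s}", hf_subset=s) for s in ["subset_a", "subset_b"]]
```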

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -50,7 +50,7 @@ keywords = ["evaluation", "nlp", "llm"]
 dependencies = [
     # Base dependencies
     "transformers>=4.38.0",
-    "huggingface_hub==0.20.3",
+    "huggingface_hub>=0.21.2",
     "torch>=2.0",
     "GitPython==3.1.31", # for logging
     "datasets>=2.14.0",
```

src/lighteval/tasks/lighteval_task.py

Lines changed: 12 additions & 12 deletions

```diff
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
-from datasets import load_dataset
+from datasets import DownloadMode, load_dataset
 
 from lighteval.few_shot_manager import FewShotSampler
 from lighteval.logging.hierarchical_logger import hlog, hlog_warn
@@ -62,7 +62,7 @@ class LightevalTaskConfig:
         truncated_num_docs (bool): Whether less than the total number of documents were used
         output_regex (str)
         frozen (bool)
-
+        trust_dataset (bool): Whether to trust the dataset at execution or not
     """
 
     name: str
@@ -84,6 +84,8 @@ class LightevalTaskConfig:
     original_num_docs: int = -1
     effective_num_docs: int = -1
 
+    trust_dataset: bool = None
+
     def as_dict(self):
         return {
             "name": self.name,
@@ -144,6 +146,7 @@ def __init__(self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str]
         self.dataset_path = self.hf_repo
         self.dataset_config_name = self.hf_subset
         self.dataset = None  # Delayed download
+        self.trust_dataset = cfg.trust_dataset
         hlog(f"{self.dataset_path} {self.dataset_config_name}")
         self._fewshot_docs = None
         self._docs = None
@@ -521,14 +524,10 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int =
     """
 
     if dataset_loading_processes <= 1:
-        datasets = [
-            download_dataset_worker((task.dataset_path, task.dataset_config_name)) for task in tasks
-        ]  # Also help us with gdb
+        datasets = [download_dataset_worker(task) for task in tasks]  # Also help us with gdb
     else:
         with Pool(processes=dataset_loading_processes) as pool:
-            datasets = pool.map(
-                download_dataset_worker, [(task.dataset_path, task.dataset_config_name) for task in tasks]
-            )
+            datasets = pool.map(download_dataset_worker, tasks)
 
     for task, dataset in zip(tasks, datasets):
         task.dataset = dataset
@@ -539,13 +538,14 @@ def download_dataset_worker(args):
     Worker function to download a dataset from the HuggingFace Hub.
     Used for parallel dataset loading.
     """
-    dataset_path, dataset_config_name = args
+    task: LightevalTask = args
     dataset = load_dataset(
-        path=dataset_path,
-        name=dataset_config_name,
+        path=task.dataset_path,
+        name=task.dataset_config_name,
         data_dir=None,
         cache_dir=None,
-        download_mode=None,
+        download_mode=DownloadMode.FORCE_REDOWNLOAD,  # None
+        trust_remote_code=task.trust_dataset,
     )
     return dataset
 
```
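Outside of lighteval, the effect of the reworked worker can be reproduced with a plain `datasets` call along these lines (the dataset name is hypothetical; the `trust_remote_code` parameter requires a reasonably recent `datasets` release):

```python
# Standalone sketch of what download_dataset_worker now does for one task:
# forward the task's trust flag to datasets so script-backed datasets load.
from datasets import DownloadMode, load_dataset

dataset = load_dataset(
    path="your-org/your-dataset",  # hypothetical script-backed dataset
    name="default",
    download_mode=DownloadMode.FORCE_REDOWNLOAD,  # mirrors the commit's setting
    trust_remote_code=True,  # equivalent of trust_dataset=True on the task
)
```

Note that the commit leaves `download_mode` set to `DownloadMode.FORCE_REDOWNLOAD` (with a `# None` comment next to it), so datasets are re-downloaded on every run rather than served from the local cache.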
