From 81618c3a2e41890e606257d3ed5c73b61010dbe0 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Wed, 6 Mar 2024 15:53:01 +0000
Subject: [PATCH 1/3] init - now gives the path with an arg, maybe will remove

---
 README.md                           | 13 ++++--
 .../ifeval/instructions.py          |  2 +-
 .../ifeval/instructions_registry.py |  2 +-
 .../ifeval/instructions_utils.py    |  0
 .../ifeval/main.py                  |  4 +-
 pyproject.toml                      |  5 ++-
 run_evals_accelerate.py             |  6 +++
 src/lighteval/main_accelerate.py    |  2 +-
 src/lighteval/main_nanotron.py      |  3 +-
 src/lighteval/tasks/registry.py     | 45 ++++++++++++++++---
 .../ifeval/requirements.txt         |  1 -
 11 files changed, 65 insertions(+), 18 deletions(-)
 rename {tasks_examples/custom_tasks_with_custom_metrics => extended_tasks}/ifeval/instructions.py (99%)
 rename {tasks_examples/custom_tasks_with_custom_metrics => extended_tasks}/ifeval/instructions_registry.py (98%)
 rename {tasks_examples/custom_tasks_with_custom_metrics => extended_tasks}/ifeval/instructions_utils.py (100%)
 rename tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py => extended_tasks/ifeval/main.py (97%)
 delete mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/requirements.txt

diff --git a/README.md b/README.md
index 7cf415434..d49c3b6e1 100644
--- a/README.md
+++ b/README.md
@@ -210,9 +210,13 @@ However, we are very grateful to the Harness and HELM teams for their continued
 If your new task or metric has requirements, add a specific `requirements.txt` file with your evaluation.
 
 ### Adding a new task
-To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, or in the community tasks, and **add its dataset** on the hub.
-Note: Core evaluations are evals we will add to our test suite to ensure non regression through time, and which already see a high usage in the community.
-A popular community evaluation can move to become a core evaluation through time.
+To add a new task, first open an issue to determine whether it will be integrated into the core evaluations of lighteval, the extended tasks, or the community tasks, and **add its dataset** on the hub.
+
+- Core evaluations are evaluations which only require standard logic in their metrics and processing, and which we will add to our test suite to ensure non-regression over time. They already see high usage in the community.
+- Extended evaluations are evaluations which require custom logic in their metrics (complex normalisation, an LLM as a judge, ...), and which we add to make users' lives easier. They also see high usage in the community.
+- Community evaluations are new tasks submitted by the community.
+
+A popular community evaluation can become an extended or core evaluation over time.
 
 #### Core evaluations
 Prompt function: **find a suitable prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`, or code your own. This function must output a `Doc` object, which should contain `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction which should not be repeated in a few shot setup, add it to an `instruction` field.
@@ -241,6 +245,9 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t
 Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`.
+#### Extended evaluations
+Proceed as for community evaluations, but in the `extended_tasks` folder.
+
 #### Community evaluations
 Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets.
 
diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py b/extended_tasks/ifeval/instructions.py
similarity index 99%
rename from tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py
rename to extended_tasks/ifeval/instructions.py
index 6af99d819..07598c462 100644
--- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py
+++ b/extended_tasks/ifeval/instructions.py
@@ -23,7 +23,7 @@
 
 import langdetect
 
-import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_utils as instructions_util
+import extended_tasks.ifeval.instructions_utils as instructions_util
 
 
 logger = logging.getLogger(__name__)
diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py b/extended_tasks/ifeval/instructions_registry.py
similarity index 98%
rename from tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py
rename to extended_tasks/ifeval/instructions_registry.py
index 17089bd0a..6a6939356 100644
--- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py
+++ b/extended_tasks/ifeval/instructions_registry.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """Registry of all instructions."""
 
-import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions as instructions
+import extended_tasks.ifeval.instructions as instructions
 
 
 _KEYWORD = "keywords:"
diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py b/extended_tasks/ifeval/instructions_utils.py
similarity index 100%
rename from tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py
rename to extended_tasks/ifeval/instructions_utils.py
diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py b/extended_tasks/ifeval/main.py
similarity index 97%
rename from tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py
rename to extended_tasks/ifeval/main.py
index 6f60f047d..70a3013f4 100644
--- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py
+++ b/extended_tasks/ifeval/main.py
@@ -23,7 +23,7 @@
 import numpy as np
 from aenum import extend_enum
 
-import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_registry as instructions_registry
+import extended_tasks.ifeval.instructions_registry as instructions_registry
 from lighteval.metrics import Metrics
 from lighteval.metrics.utils import (
     MetricCategory,
@@ -38,7 +38,7 @@
 ifeval = LightevalTaskConfig(
     name="ifeval",
     prompt_function="ifeval_prompt",
-    suite=["custom"],
+    suite=["extended"],
     hf_repo="wis-k/instruction-following-eval",
     hf_subset="default",
     metric=["ifeval_metric"],
diff --git a/pyproject.toml b/pyproject.toml
index a774bb09c..d8953d7d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,6 @@ dependencies = [
 accelerate = ["accelerate"]
 tgi = ["text-generation==0.6.0"]
 optimum = ["optimum==1.12.0"]
-# Quantization and adapter weights
 quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
 adapters = ["peft==0.3.0"]
 nanotron = [
@@ -88,7 +87,9 @@
 quality = ["ruff==v0.2.2","pre-commit"]
 tests = ["pytest==7.4.0"]
 dev = ["lighteval[accelerate,quality,tests]"]
-
+extended_tasks = [
+  "langdetect", # ifeval
+]
 
 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"
diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py
index 57400f99b..692daaf76 100644
--- a/run_evals_accelerate.py
+++ b/run_evals_accelerate.py
@@ -103,6 +103,12 @@ def get_parser():
         default=None,
         help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
     )
+    parser.add_argument(
+        "--extended_tasks",
+        type=str,
+        default=None,
+        help="Path to the folder containing the extended tasks",
+    )
     group.add_argument(
         "--tasks",
         type=str,
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index f106485f3..a59029fbc 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -82,7 +82,7 @@ def main(args):
     with accelerator.main_process_first() if accelerator is not None else nullcontext():
         task_names_list, few_shots_dict = taskinfo_selector(args.tasks)
         task_dict = Registry(cache_dir=env_config.cache_dir).get_task_dict(
-            task_names_list, custom_tasks=args.custom_tasks
+            task_names_list, custom_tasks=args.custom_tasks, extended_tasks=args.extended_tasks
         )
         # Loading all the dataset in a distributed manner
         LightevalTask.load_datasets(task_dict.values(), args.dataset_loading_processes)
diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py
index 51b276ce6..4610ea869 100644
--- a/src/lighteval/main_nanotron.py
+++ b/src/lighteval/main_nanotron.py
@@ -135,7 +135,8 @@ def main(
 
     task_names_list, few_shots_dict = taskinfo_selector(tasks_selection)
     task_dict = Registry(cache_dir=cache_dir).get_task_dict(
-        task_names_list, custom_tasks=lighteval_config.tasks.custom_tasks
+        task_names_list,
+        custom_tasks=lighteval_config.tasks.custom_tasks,
     )
     # Loading all the dataset in a distributed manner
     LightevalTask.load_datasets(task_dict.values(), lighteval_config.tasks.dataset_loading_processes)
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 3b0c338ed..5911d1ae0 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -39,8 +39,19 @@
 # Original follows the original implementation as closely as possible
 # Leaderboard are the evaluations we fixed on the open llm leaderboard - you should get similar results
 # Community are for community added evaluations
+# Extended are for evaluations with custom logic
 # Custom is for all the experiments you might want to do!
-DEFAULT_SUITES = ["helm", "bigbench", "harness", "leaderboard", "lighteval", "original", "custom", "community"]
+DEFAULT_SUITES = [
+    "helm",
+    "bigbench",
+    "harness",
+    "leaderboard",
+    "lighteval",
+    "original",
+    "extended",
+    "custom",
+    "community",
+]
 
 TRUNCATE_FEW_SHOTS_DEFAULTS = True
 
@@ -97,7 +108,10 @@ def get_task_class(
         )
 
     def get_task_dict(
-        self, task_name_list: List[str], custom_tasks: Optional[Union[str, ModuleType]] = None
+        self,
+        task_name_list: List[str],
+        custom_tasks: Optional[Union[str, ModuleType]] = None,
+        extended_tasks: Optional[str] = None,
     ) -> Dict[str, LightevalTask]:
         """
         Get a dictionary of tasks based on the task name list.
@@ -105,6 +119,7 @@ def get_task_dict(
 
         Args:
             task_name_list (List[str]): A list of task names.
             custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+            extended_tasks (Optional[str]): Path to the folder containing the extended task modules
 
         Returns:
             Dict[str, LightevalTask]: A dictionary containing the tasks.
@@ -116,12 +131,20 @@ def get_task_dict(
         """
         # Import custom tasks provided by the user
         custom_tasks_registry = None
         custom_tasks_module = None
+        TASKS_TABLE = []
         if custom_tasks is not None:
             custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks)
-        if custom_tasks_module is not None:
-            custom_tasks_registry = create_config_tasks(
-                meta_table=custom_tasks_module.TASKS_TABLE, cache_dir=self.cache_dir
+            TASKS_TABLE.extend(custom_tasks_module.TASKS_TABLE)
+        if extended_tasks is not None:
+            hlog_warn(
+                "Did you make sure to install the extended_tasks dependencies, using `pip install -e .[extended_tasks]`?"
             )
+            extended_tasks_modules = load_extended_tasks_modules(extended_tasks_path=extended_tasks)
+            for module in extended_tasks_modules:
+                TASKS_TABLE.extend(module.TASKS_TABLE)
+
+        if len(TASKS_TABLE) > 0:
+            custom_tasks_registry = create_config_tasks(meta_table=TASKS_TABLE, cache_dir=self.cache_dir)
         hlog(custom_tasks_registry)
 
         # Select relevant tasks given the subset asked for by the user
@@ -133,6 +156,16 @@
 
         return tasks_dict
 
 
+def load_extended_tasks_modules(extended_tasks_path: str):
+    all_modules = []
+    for folder in os.listdir(extended_tasks_path):
+        cur_module = create_custom_tasks_module(os.path.join(extended_tasks_path, folder, "main.py"))
+        hlog(f"Successfully loaded extended task: {folder}.")
+        all_modules.append(cur_module)
+
+    return all_modules
+
+
 def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleType:
     """Creates a custom task module to load tasks defined by the user in their own file.
 
@@ -153,7 +186,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy
 
 
 def get_custom_tasks(custom_tasks: Union[str, ModuleType]) -> Tuple[ModuleType, str]:
-    """Get custom tasks from the given custom tasks file or module.
+    """Get all the custom tasks available from the given custom tasks file or module.
     Args:
         custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
 
diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/requirements.txt b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/requirements.txt
deleted file mode 100644
index 7f42284c9..000000000
--- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-langdetect

From 4fafc3c96657455218a4a2bed6d14072d97f638f Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Thu, 7 Mar 2024 11:07:53 +0000
Subject: [PATCH 2/3] allows several custom task modules to be loaded

---
 src/lighteval/tasks/lighteval_task.py | 32 ++++++++++++++++++---
 src/lighteval/tasks/registry.py       | 13 +++++------
 2 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index a94b5995d..6d89341ba 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -143,7 +143,9 @@ def __post_init__(self):
 
 
 class LightevalTask:
-    def __init__(self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str] = None, custom_tasks_module=None):
+    def __init__(
+        self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str] = None, custom_tasks_module: list = None
+    ):
         """
         Initialize a LightEval task.
 
@@ -200,16 +202,26 @@ def __init__(self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str]
         # to use once prompt formatting is managed as a module
         if custom_tasks_module is None:
             self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
-        elif hasattr(custom_tasks_module, cfg.prompt_function):
-            # If we have a prompt in both the custom_tasks_module and our tasks_prompt_formatting
-            # We take the prompt from the custom_tasks_module
-            if hasattr(tasks_prompt_formatting, cfg.prompt_function):
-                hlog_warn(
-                    f"Be careful you are using custom prompt function {cfg.prompt_function} and not the default one."
-                )
-            self.formatter = getattr(custom_tasks_module, cfg.prompt_function)
         else:
-            self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
+            formatter = []
+            for module in custom_tasks_module:
+                if hasattr(module, cfg.prompt_function):
+                    formatter.append(getattr(module, cfg.prompt_function))
+
+            if len(formatter) == 0:  # Default version
+                self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
+            elif len(formatter) == 1:
+                # If the prompt function exists in both a custom module and our tasks_prompt_formatting,
+                # we take the one from the custom module
+                if hasattr(tasks_prompt_formatting, cfg.prompt_function):
+                    hlog_warn(
+                        f"Careful, you are using the custom prompt function {cfg.prompt_function} and not the default one."
+                    )
+                self.formatter = formatter[0]  # the single custom prompt function found
+            else:
+                raise Exception(
+                    f"You defined the prompt function {cfg.prompt_function} several times across the custom modules you are loading."
+ ) self.generation_size = cfg.generation_size self.stop_sequence = cfg.stop_sequence self.output_regex = cfg.output_regex diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 5911d1ae0..5625fe75d 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -130,18 +130,17 @@ def get_task_dict( """ # Import custom tasks provided by the user custom_tasks_registry = None - custom_tasks_module = None + custom_tasks_module = [] TASKS_TABLE = [] if custom_tasks is not None: - custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks) - TASKS_TABLE.extend(custom_tasks_module.TASKS_TABLE) + custom_tasks_module.append(create_custom_tasks_module(custom_tasks=custom_tasks)) if extended_tasks is not None: hlog_warn( - "Did you make sure to install the extended_tasks dependencies, using `pip install -e .[extended_tasks]`?" + "You are using extended_tasks. Make sure you installed their dependencies using `pip install -e .[extended_tasks]`." ) - extended_tasks_modules = load_extended_tasks_modules(extended_tasks_path=extended_tasks) - for module in extended_tasks_modules: - TASKS_TABLE.extend(module.TASKS_TABLE) + custom_tasks_module.extend(load_extended_tasks_modules(extended_tasks_path=extended_tasks)) + for module in custom_tasks_module: + TASKS_TABLE.extend(module.TASKS_TABLE) if len(TASKS_TABLE) > 0: custom_tasks_registry = create_config_tasks(meta_table=TASKS_TABLE, cache_dir=self.cache_dir) From 0d13b4a0ee692d87517aab3b180f1df0c330bcec Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Sat, 9 Mar 2024 14:26:03 +0000 Subject: [PATCH 3/3] fix quality --- src/lighteval/tasks/lighteval_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 239adfce9..40a2c7adb 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -145,7 +145,7 @@ def __post_init__(self): class LightevalTask: - def __init__( + def __init__( # noqa: C901 self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str] = None, custom_tasks_module: list = None ): """
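
Usage sketch for the series as a whole (illustrative, not part of the patches: the task spec follows the `suite|task|few_shot|truncate_few_shots` format used in the README's `--tasks lighteval|yournewtask|2|0` example, and the `pip install` command is the one referenced by the new warning; the `--model_args` and `--output_dir` values are placeholders):

    # install the extra dependencies the extended tasks need (here: langdetect for ifeval)
    pip install -e .[extended_tasks]
    # run the ifeval extended task, pointing --extended_tasks at the new folder
    python run_evals_accelerate.py \
        --model_args "pretrained=gpt2" \
        --tasks "extended|ifeval|0|0" \
        --extended_tasks ./extended_tasks \
        --output_dir ./evals

`--extended_tasks` points at the folder whose subfolders each provide a `main.py` exposing a `TASKS_TABLE`; `load_extended_tasks_modules` loads one module per subfolder, and the registry merges every table before resolving the requested task names.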