Commit 96e74e2

Add config files for models (#131)
* added the different configs through yaml files
* edited task examples path

Co-authored-by: Nathan Habib <[email protected]>
1 parent 0f5e257 commit 96e74e2

20 files changed (+169, -210 lines)

.gitignore

Lines changed: 0 additions & 10 deletions
@@ -164,29 +164,19 @@ tests/.data
 tests/data
 
 # outputs folder
-examples/*/outputs
-examples/*/NeMo_experiments
-examples/*/nemo_experiments
-examples/*/.hydra
-examples/*/wandb
-examples/*/data
 wandb
 dump.py
 
 docs/sources/source/test_build/
 
 # Checkpoints, config files and temporary files created in tutorials.
-examples/neural_graphs/*.chkpt
-examples/neural_graphs/*.yml
-
 .hydra/
 nemo_experiments/
 
 .ruff_cache
 
 tmp.py
 
-examples
 benchmark_output
 prod_env

README.md

Lines changed: 21 additions & 13 deletions
@@ -10,9 +10,6 @@ We're releasing it with the community in the spirit of building in the open.
 Note that it is still very much early so don't expect 100% stability ^^'
 In case of problems or question, feel free to open an issue!
 
-## News
-- **Feb 08, 2024**: Release of `lighteval`
-
 ## Installation
 
 Clone the repo:
@@ -98,7 +95,7 @@ Here, `--tasks` refers to either a _comma-separated_ list of supported tasks fro
 suite|task|num_few_shot|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long}
 ```
 
-or a file path like [`tasks_examples/recommended_set.txt`](./tasks_examples/recommended_set.txt) which specifies multiple task configurations. For example, to evaluate GPT-2 on the Truthful QA benchmark run:
+or a file path like [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set.txt) which specifies multiple task configurations. For example, to evaluate GPT-2 on the Truthful QA benchmark run:
 
 ```shell
 accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
@@ -118,7 +115,20 @@ accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
     --output_dir="./evals/"
 ```
 
-See the [`tasks_examples/recommended_set.txt`](./tasks_examples/recommended_set.txt) file for a list of recommended task configurations.
+See the [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set.txt) file for a list of recommended task configurations.
+
+### Evaluating a model with a complex configuration
+
+If you want to evaluate a model by spinning up inference endpoints, use adapter/delta weights, or rely on more complex configuration options, you can load models using a configuration file. This is done as follows:
+
+```shell
+accelerate launch --multi_gpu --num_processes=<num_gpus> run_evals_accelerate.py \
+    --model_config_path="<path to your model configuration>" \
+    --tasks <task parameters> \
+    --output_dir output_dir
+```
+
+Examples of possible configuration files are provided in `examples/model_configs`.
 
 ### Evaluating a large model with pipeline parallelism
 
@@ -127,15 +137,13 @@ To evaluate models larger that ~40B parameters in 16-bit precision, you will nee
 ```shell
 # PP=2, DP=4 - good for models < 70B params
 accelerate launch --multi_gpu --num_processes=4 run_evals_accelerate.py \
-    --model_args="pretrained=<path to model on the hub>" \
-    --model_parallel \
+    --model_args="pretrained=<path to model on the hub>,model_parallel=True" \
     --tasks <task parameters> \
     --output_dir output_dir
 
 # PP=4, DP=2 - good for huge models >= 70B params
 accelerate launch --multi_gpu --num_processes=2 run_evals_accelerate.py \
-    --model_args="pretrained=<path to model on the hub>" \
-    --model_parallel \
+    --model_args="pretrained=<path to model on the hub>,model_parallel=True" \
     --tasks <task parameters> \
     --output_dir output_dir
 ```
@@ -147,7 +155,7 @@ To evaluate a model on all the benchmarks of the [Open LLM Leaderboard](https://
 ```shell
 accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
     --model_args "pretrained=<model name>" \
-    --tasks tasks_examples/open_llm_leaderboard_tasks.txt \
+    --tasks examples/tasks/open_llm_leaderboard_tasks.txt \
     --override_batch_size 1 \
     --output_dir="./evals/"
 ```
@@ -220,7 +228,7 @@ However, we are very grateful to the Harness and HELM teams for their continued
 - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions.
 - [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models.
 - [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended).
-- [tasks_examples](https://github.com/huggingface/lighteval/tree/main/tasks_examples) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking.
+- [examples/tasks](https://github.com/huggingface/lighteval/tree/main/examples/tasks) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking.
 - [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks.
 
 ## Customisation
@@ -291,7 +299,7 @@ if __name__ == "__main__":
 
 You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it.
 
-To see an example of a custom metric added along with a custom task, look at `tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py`.
+To see an example of a custom metric added along with a custom task, look at `examples/tasks/custom_tasks_with_custom_metrics/ifeval/ifeval.py`.
 
 ## Available metrics
 ### Metrics for multiple choice tasks
@@ -414,7 +422,7 @@ source <path_to_your_venv>/activate #or conda activate yourenv
 cd <path_to_your_lighteval>/lighteval
 
 export CUDA_LAUNCH_BLOCKING=1
-srun accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=your model name" --tasks tasks_examples/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
+srun accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=your model name" --tasks examples/tasks/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
 ```
 
 ## Releases
New file under examples/model_configs/ (base model configuration)

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+model:
+  type: "base" # can be base, tgi, or endpoint
+  base_params:
+    model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
+    dtype: "bfloat16"
+  merged_weights: # Ignore this section if you are not using PEFT models
+    delta_weights: false # set to True if your model should be merged with a base model, also need to provide the base model name
+    adapter_weights: false # set to True if your model has been trained with peft, also need to provide the base model name
+    base_model: null # path to the base_model
+  generation:
+    multichoice_continuations_start_space: false # Whether to force multiple choice continuations to start with a space
+    no_multichoice_continuations_start_space: false # Whether to force multiple choice continuations to not start with a space
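The base-model config above can be passed to `run_evals_accelerate.py` through the new `--model_config_path` flag. A minimal sketch of such a launch, assuming the file is saved as `examples/model_configs/base_model.yaml` (the exact file name is not shown on this page) and using a Truthful QA task string as an illustrative example:

```shell
# Sketch: evaluate the zephyr-7b-beta config above on Truthful QA, zero-shot.
# The config path and task string are illustrative assumptions.
accelerate launch --num_processes=1 run_evals_accelerate.py \
    --model_config_path="examples/model_configs/base_model.yaml" \
    --tasks "lighteval|truthfulqa:mc|0|0" \
    --override_batch_size 1 \
    --output_dir="./evals/"
```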
New file under examples/model_configs/ (Inference Endpoint configuration)

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+model:
+  type: "endpoint" # can be base, tgi, or endpoint
+  base_params:
+    endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
+    model: "meta-llama/Llama-2-7b-hf"
+    revision: "main"
+    dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
+    reuse_existing: false # if true, ignore all params in instance
+  instance:
+    accelerator: "gpu"
+    region: "eu-west-1"
+    vendor: "aws"
+    instance_size: "medium"
+    instance_type: "g5.2xlarge"
+    framework: "pytorch"
+    endpoint_type: "protected"
+  generation:
+    add_special_tokens: true
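With the endpoint config above, lighteval provisions (or, if `reuse_existing` is true, reuses) an Inference Endpoint described by the `instance` block before running the evaluation. A rough sketch of a run, assuming the file lives under `examples/model_configs/` (the exact file name below is an assumption):

```shell
# Creating a protected Inference Endpoint requires being logged in to the
# Hugging Face Hub with a token that has the appropriate permissions.
huggingface-cli login

# Sketch only: the config path is an assumption for illustration.
accelerate launch --num_processes=1 run_evals_accelerate.py \
    --model_config_path="examples/model_configs/endpoint_model.yaml" \
    --tasks <task parameters> \
    --output_dir output_dir
```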
New file under examples/model_configs/ (TGI configuration)

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+model:
+  type: "tgi" # can be base, tgi, or endpoint
+  instance:
+    inference_server_address: ""
+    inference_server_auth: null
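The TGI config above expects a text-generation-inference server that is already running and reachable at `inference_server_address`. A hedged sketch of the full flow; the Docker invocation follows the TGI quickstart, and the image tag, model, port, and config path are illustrative assumptions:

```shell
# 1. Start a local TGI server (adjust the image tag, model and port to your setup).
docker run --gpus all --shm-size 1g -p 8080:80 \
    -v "$PWD/tgi-data:/data" \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id HuggingFaceH4/zephyr-7b-beta

# 2. Set inference_server_address: "http://localhost:8080" in the config,
#    then launch the evaluation (the config path is an assumption).
accelerate launch --num_processes=1 run_evals_accelerate.py \
    --model_config_path="examples/model_configs/tgi_model.yaml" \
    --tasks <task parameters> \
    --output_dir output_dir
```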
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
