Merged
35 commits
ce0925e
commit
NathanHB Mar 4, 2025
228f506
sglang
NathanHB Mar 4, 2025
a6c5731
commit
NathanHB Mar 7, 2025
555d215
revamp greedy untill method of transformers model
NathanHB Mar 7, 2025
e7517b0
revamp greedy untill method of transformers model
NathanHB Mar 10, 2025
35fdf83
commit
NathanHB Mar 10, 2025
bd7195b
commit
NathanHB Mar 17, 2025
9311f09
Merge branch 'main' into nathan-unify-modelargs
NathanHB Mar 24, 2025
9358e0c
commit
NathanHB Mar 24, 2025
abd7536
fixes
NathanHB Mar 24, 2025
b64206e
fixes
NathanHB Mar 24, 2025
1a5f612
Merge branch 'nathan-unify-modelargs' of github.com:huggingface/light…
NathanHB Mar 25, 2025
540463f
Use abastarct class for model configs
NathanHB Mar 25, 2025
204b3c8
fixes and doc
NathanHB Apr 2, 2025
bb72429
fixes and doc
NathanHB Apr 2, 2025
acd11d4
fixes and doc
NathanHB Apr 8, 2025
30ff7a7
Merge remote-tracking branch 'origin/main' into nathan-unify-modelargs
NathanHB Apr 8, 2025
07d9ade
fixe tests
NathanHB Apr 8, 2025
a27ba6a
Update docs/source/use-litellm-as-backend.mdx
NathanHB Apr 8, 2025
80572a8
fix doc building
NathanHB Apr 8, 2025
6eb34a3
fix tests
NathanHB Apr 8, 2025
98ee7e7
Merge branch 'nathan-unify-modelargs' of github.com:huggingface/light…
NathanHB Apr 8, 2025
a1a348a
fix doc
NathanHB Apr 8, 2025
f4d8e82
fix doc
NathanHB Apr 8, 2025
b4c8329
fix doc
NathanHB Apr 8, 2025
63d7e6b
fix doc
NathanHB Apr 8, 2025
3897e37
Update docs/source/_toctree.yml
NathanHB Apr 8, 2025
99200bf
Merge branch 'main' into nathan-unify-modelargs
NathanHB Apr 8, 2025
f716a4b
nits
NathanHB Apr 11, 2025
21e9b15
fix tests
NathanHB Apr 11, 2025
33d9708
Merge branch 'nathan-unify-modelargs' of github.com:huggingface/light…
NathanHB Apr 11, 2025
34a0eaa
nits
NathanHB Apr 11, 2025
e83fa68
nits
NathanHB Apr 14, 2025
65c5f9a
update doc
NathanHB Apr 14, 2025
513f530
fix config for transformers
NathanHB Apr 14, 2025
4 changes: 2 additions & 2 deletions docs/source/_toctree.yml
@@ -23,8 +23,8 @@
title: Use vllm as backend
- local: use-sglang-as-backend
title: Use SGLang as backend
- local: evaluate-the-model-on-a-server-or-container
title: Evaluate on Server
- local: use-huggingface-inference-endpoints-or-tgi-as-backend
title: Use Hugging Face inference endpoints or TGI as backend
- local: contributing-to-multilingual-evaluations
title: Contributing to multilingual evaluations
title: Guides
4 changes: 0 additions & 4 deletions docs/source/package_reference/models.mdx
@@ -31,10 +31,6 @@
### Open AI Models
[[autodoc]] models.endpoints.openai_model.OpenAIClient

## Nanotron Model
### NanotronLightevalModel
[[autodoc]] models.nanotron.nanotron_model.NanotronLightevalModel

## VLLM Model
### VLLMModel
[[autodoc]] models.vllm.vllm_model.VLLMModelConfig
@@ -25,15 +25,12 @@ be deleted afterwards).
__configuration file example:__

```yaml
model:
base_params:
# Pass either model_name, or endpoint_name and true reuse_existing
# endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
# reuse_existing: true # defaults to false; if true, ignore all params in instance, and don't delete the endpoint after evaluation
model_parameters:
reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
# endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
model_name: "meta-llama/Llama-2-7b-hf"
# revision: "main" # defaults to "main"
revision: "main" # defaults to "main"
dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
instance:
accelerator: "gpu"
region: "eu-west-1"
vendor: "aws"
@@ -44,7 +41,7 @@ model:
namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
env_vars:
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
```
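
Once saved, a config file like the one above is passed to the CLI in place of inline model arguments. A minimal sketch, assuming the file is saved as `endpoint_model.yaml` and that the `inference-endpoint` subcommand follows the same pattern as the other `lighteval endpoint` backends in these docs:

```bash
# Hedged example: the config path and subcommand spelling are assumptions,
# not taken from this PR.
lighteval endpoint inference-endpoint \
    endpoint_model.yaml \
    "lighteval|gsm8k|0|0"
```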

### Text Generation Inference (TGI)
@@ -55,25 +52,8 @@ serverless inference.
__configuration file example:__

```yaml
model:
instance:
model_parameters:
inference_server_address: ""
inference_server_auth: null
model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
```
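
As with the endpoint config above, the TGI config file is passed directly on the command line. A hedged sketch, assuming the file is saved as `tgi_model.yaml` and that the `tgi` subcommand follows the same CLI pattern:

```bash
# Assumes a TGI server is already running at the configured
# inference_server_address; the config path is illustrative.
lighteval endpoint tgi \
    tgi_model.yaml \
    "lighteval|gsm8k|0|0"
```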

### OpenAI API

Lighteval also supports evaluating models on the OpenAI API. To do so you need to set your OpenAI API key in the environment variable.

```bash
export OPENAI_API_KEY={your_key}
```

And then run the following command:

```bash
lighteval endpoint openai \
{model-name} \
<task parameters>
```
6 changes: 3 additions & 3 deletions docs/source/use-inference-providers-as-backend.mdx
@@ -11,7 +11,7 @@ Lighteval allows to use Hugging Face's Inference Providers to evaluate llms on s

```bash
lighteval endpoint inference-providers \
"model=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
"model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
"lighteval|gsm8k|0|0"
```

@@ -28,13 +28,13 @@ lighteval endpoint inference-providers \
with the following config file:

```yaml
model:
model_parameters:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
provider: "novita"
timeout: null
proxies: null
parallel_calls_count: 10
generation:
generation_parameters:
temperature: 0.8
top_k: 10
max_new_tokens: 10000
24 changes: 13 additions & 11 deletions docs/source/use-litellm-as-backend.mdx
@@ -10,11 +10,14 @@ Documentation for available APIs and compatible endpoints can be found [here](ht

```bash
lighteval endpoint litellm \
"gpt-3.5-turbo" \
"provider=openai,model_name=gpt-3.5-turbo" \
"lighteval|gsm8k|0|0" \
--use-chat-template
```

> [!WARNING]
> `--use-chat-template` is required for litellm to work properly.

## Using a config file

Litellm allows generation with any OpenAI compatible endpoint, for example you
@@ -23,17 +26,16 @@ can evaluate a model running on a local vllm server.
To do so you will need to use a config file like so:

```yaml
model:
base_params:
model_parameters:
model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
base_url: "URL OF THE ENDPOINT YOU WANT TO USE"
api_key: "" # remove or keep empty as needed
generation:
temperature: 0.5
max_new_tokens: 256
stop_tokens: [""]
top_p: 0.9
seed: 0
repetition_penalty: 1.0
frequency_penalty: 0.0
generation_parameters:
temperature: 0.5
max_new_tokens: 256
stop_tokens: [""]
top_p: 0.9
seed: 0
repetition_penalty: 1.0
frequency_penalty: 0.0
```
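
The config file above can then be passed to the litellm backend in place of the inline `provider=...,model_name=...` string. A minimal sketch, with an assumed file name:

```bash
# litellm_model.yaml is the YAML config shown above (path is illustrative)
lighteval endpoint litellm \
    litellm_model.yaml \
    "lighteval|gsm8k|0|0" \
    --use-chat-template
```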
50 changes: 34 additions & 16 deletions docs/source/use-sglang-as-backend.mdx
@@ -5,7 +5,7 @@ To use, simply change the `model_args` to reflect the arguments you want to pass

```bash
lighteval sglang \
"pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -17,15 +17,15 @@ For example if you have 4 GPUs you can split it across using `tp_size`:

```bash
lighteval sglang \
"pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \
"leaderboard|truthfulqa:mc|0|0"
```

Or, if your model fits on a single GPU, you can use `dp_size` to speed up the evaluation:

```bash
lighteval sglang \
"pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -40,20 +40,38 @@
"leaderboard|truthfulqa:mc|0|0"
```
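
The collapsed hunk above hides the start of that command. A hedged reconstruction, assuming the YAML config shown below is saved as `sglang_model_config.yaml`:

```bash
# Reconstruction of the truncated command above; the config path is illustrative.
lighteval sglang \
    sglang_model_config.yaml \
    "leaderboard|truthfulqa:mc|0|0"
```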

> [!TIP]
> Documentation for the config file of sglang can be found [here](https://docs.sglang.ai/backend/server_arguments.html)

```yaml
model: # Model specific parameters
base_params:
model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,dtype=float16,chunked_prefill_size=4096,mem_fraction_static=0.9" # Model args that you would pass in the command line
generation: # Generation specific parameters
temperature: 0.3
repetition_penalty: 1.0
frequency_penalty: 0.0
presence_penalty: 0.0
top_k: -1
min_p: 0.0
top_p: 0.9
max_new_tokens: 256
stop_tokens: ["<EOS>", "<PAD>"]
model_parameters:
model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
dtype: "auto"
tp_size: 1
dp_size: 1
context_length: null
random_seed: 1
trust_remote_code: False
use_chat_template: False
device: "cuda"
skip_tokenizer_init: False
kv_cache_dtype: "auto"
add_special_tokens: True
pairwise_tokenization: False
sampling_backend: null
attention_backend: null
mem_fraction_static: 0.8
chunked_prefill_size: 4096
generation_parameters:
max_new_tokens: 1024
min_new_tokens: 0
temperature: 1.0
top_k: 50
min_p: 0.0
top_p: 1.0
presence_penalty: 0.0
repetition_penalty: 1.0
frequency_penalty: 0.0
```

> [!WARNING]
96 changes: 67 additions & 29 deletions docs/source/use-vllm-as-backend.mdx
@@ -3,9 +3,13 @@
Lighteval allows you to use `vllm` as backend allowing great speedups.
To use, simply change the `model_args` to reflect the arguments you want to pass to vllm.


> [!TIP]
> Documentation for vllm engine args can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html)

```bash
lighteval vllm \
"pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -17,15 +21,15 @@ For example if you have 4 GPUs you can split it across using `tensor_parallelism

```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
"pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \
"leaderboard|truthfulqa:mc|0|0"
```

Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation:

```bash
lighteval vllm \
"pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \
"leaderboard|truthfulqa:mc|0|0"
```

@@ -41,18 +45,35 @@ lighteval vllm \
```
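
Only the first line of that command survives the collapsed hunk above. A hedged reconstruction, assuming the YAML config shown below is saved as `vllm_model_config.yaml`:

```bash
# Reconstruction of the truncated command above; the config path is illustrative.
lighteval vllm \
    vllm_model_config.yaml \
    "leaderboard|truthfulqa:mc|0|0"
```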

```yaml
model: # Model specific parameters
base_params:
model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main,dtype=bfloat16" # Model args that you would pass in the command line
generation: # Generation specific parameters
temperature: 0.3
repetition_penalty: 1.0
frequency_penalty: 0.0
presence_penalty: 0.0
seed: 42
top_k: 0
min_p: 0.0
top_p: 0.9
model_parameters:
model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
revision: "main"
dtype: "bfloat16"
tensor_parallel_size: 1
data_parallel_size: 1
pipeline_parallel_size: 1
gpu_memory_utilization: 0.9
max_model_length: 2048
swap_space: 4
seed: 1
trust_remote_code: True
use_chat_template: True
add_special_tokens: True
multichoice_continuations_start_space: True
pairwise_tokenization: True
subfolder: null
generation_parameters:
presence_penalty: 0.0
repetition_penalty: 1.0
frequency_penalty: 0.0
temperature: 1.0
top_k: 50
min_p: 0.0
top_p: 1.0
seed: 42
stop_tokens: null
max_new_tokens: 1024
min_new_tokens: 0
```

> [!WARNING]
@@ -66,21 +87,38 @@ For special kinds of metrics like `Pass@K` or LiveCodeBench's `codegen` metric, you may need to pass extra information like the number of
generations. This can be done in the `yaml` file in the following way:

```yaml
model: # Model specific parameters
base_params:
model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main,dtype=bfloat16" # Model args that you would pass in the command line
generation: # Generation specific parameters
temperature: 0.3
repetition_penalty: 1.0
frequency_penalty: 0.0
presence_penalty: 0.0
seed: 42
top_k: 0
min_p: 0.0
top_p: 0.9
metric_options: # Optional metric arguments
model_parameters:
model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
revision: "main"
dtype: "bfloat16"
tensor_parallel_size: 1
data_parallel_size: 1
pipeline_parallel_size: 1
gpu_memory_utilization: 0.9
max_model_length: 2048
swap_space: 4
seed: 1
trust_remote_code: True
use_chat_template: True
add_special_tokens: True
multichoice_continuations_start_space: True
pairwise_tokenization: True
subfolder: null
generation_parameters:
presence_penalty: 0.0
repetition_penalty: 1.0
frequency_penalty: 0.0
temperature: 1.0
top_k: 50
min_p: 0.0
top_p: 1.0
seed: 42
stop_tokens: null
max_new_tokens: 1024
min_new_tokens: 0
metric_options: # Optional metric arguments
codegen_pass@1:16:
num_samples: 16
num_samples: 16
```

An optional key `metric_options` can be passed in the yaml file,
2 changes: 1 addition & 1 deletion docs/source/using-the-python-api.mdx
@@ -40,7 +40,7 @@ def main():
)

model_config = VLLMModelConfig(
pretrained="HuggingFaceH4/zephyr-7b-beta",
model_name="HuggingFaceH4/zephyr-7b-beta",
dtype="float16",
use_chat_template=True,
)