2 changes: 1 addition & 1 deletion Makefile
@@ -28,7 +28,7 @@ evaluate:
fi \
),))
$(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \
MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilisation=0.8" && \
MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8" && \
lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \
--custom-tasks src/open_r1/evaluate.py \
--use-chat-template \
44 changes: 24 additions & 20 deletions README.md
@@ -51,19 +51,23 @@ To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/ge


```shell
uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip --link-mode=copy
uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip
```

> **Review comment (author):** This `--link-mode` flag is a bit specific to the HF cluster and can be resolved by adding `export UV_LINK_MODE=copy` to one's `.bashrc` file.

Next, install vLLM:
> [!TIP]
> For Hugging Face cluster users, add `export UV_LINK_MODE=copy` to your `.bashrc` to suppress cache warnings from `uv`
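
For example, a minimal way to persist this setting (assuming a bash login shell; adjust for your own shell config) is:

```shell
# Append the uv link-mode override to your shell profile and reload it
echo 'export UV_LINK_MODE=copy' >> ~/.bashrc
source ~/.bashrc
```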

Next, install vLLM and FlashAttention:

```shell
uv pip install vllm==0.7.2 --link-mode=copy
uv pip install vllm==0.7.2
uv pip install setuptools && uv pip install flash-attn --no-build-isolation
```

> **Review comment (author):** I had to rebuild the env on the cluster and realised we need this `--no-build-isolation` flag, which can't be passed through the editable install. Thus we need to pre-install FA2 before running the editable install.

This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend:

```shell
GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" --link-mode=copy
GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]"
```

Next, log into your Hugging Face and Weights and Biases accounts as follows:
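
The login block itself is collapsed in this diff; for reference, the usual CLI flow looks roughly like the sketch below (assumes you already have a Hugging Face token and a W&B API key at hand):

```shell
# Rough sketch of the login step (not part of this diff)
huggingface-cli login   # paste a token with write access to push models/datasets
wandb login             # paste your Weights & Biases API key
```
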
@@ -233,7 +237,7 @@ We use `lighteval` to evaluate models, with custom tasks defined in `src/open_r1

```shell
MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8"
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}"
OUTPUT_DIR=data/evals/$MODEL

# AIME 2024
@@ -266,7 +270,7 @@ To increase throughput across multiple GPUs, use _data parallel_ as follows:
```shell
NUM_GPUS=8
MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}"
TASK=aime24
OUTPUT_DIR=data/evals/$MODEL

@@ -281,7 +285,7 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r
```shell
NUM_GPUS=8
MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}"
TASK=aime24
OUTPUT_DIR=data/evals/$MODEL

@@ -335,7 +339,7 @@ To reproduce these results use the following command:
```shell
NUM_GPUS=1 # Set to 8 for 32B and 70B models
MODEL=deepseek-ai/{model_name}
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS"
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.0}"
OUTPUT_DIR=data/evals/$MODEL

lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \
@@ -347,7 +351,7 @@ lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \
Alternatively, you can launch Slurm jobs as follows:

```shell
python scripts/run_benchmarks.py --model-id={model_id} --benchmarks math_500
python scripts/run_benchmarks.py --model-id {model_id} --benchmarks math_500
```

### GPQA Diamond
@@ -368,7 +372,7 @@ To reproduce these results use the following command:
```shell
NUM_GPUS=1 # Set to 8 for 32B and 70B models
MODEL=deepseek-ai/{model_name}
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS"
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.0}"
OUTPUT_DIR=data/evals/$MODEL

lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \
@@ -378,28 +382,28 @@ lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \
```

```shell
python scripts/run_benchmarks.py --model-id={model_id} --benchmarks gpqa
python scripts/run_benchmarks.py --model-id {model_id} --benchmarks gpqa
```

### LiveCodeBench

We are able to reproduce DeepSeek's reported results on the LiveCodeBench code generation benchmark within ~1-3 standard deviations:

| Model                         | LiveCodeBench (🤗 LightEval) | LiveCodeBench (DeepSeek Reported) |
|:------------------------------|:----------------------------:|:---------------------------------:|
| DeepSeek-R1-Distill-Qwen-1.5B |             16.3             |               16.9                |
| DeepSeek-R1-Distill-Qwen-7B   |             36.6             |               37.6                |
| DeepSeek-R1-Distill-Qwen-14B  |             51.5             |               53.1                |
| DeepSeek-R1-Distill-Qwen-32B  |             56.6             |               57.2                |
| DeepSeek-R1-Distill-Llama-8B  |             37.0             |               39.6                |
| DeepSeek-R1-Distill-Llama-70B |             54.5             |               57.5                |

To reproduce these results use the following command:

```shell
NUM_GPUS=1 # Set to 8 for 32B and 70B models, or data_parallel_size=8 with the smaller models for speed
MODEL=deepseek-ai/{model_name}
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS,generation_parameters={temperature:0.6,top_p:0.95}"
MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
OUTPUT_DIR=data/evals/$MODEL

lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \
@@ -408,7 +412,7 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \
```

```shell
python scripts/run_benchmarks.py --model-id={model_id} --benchmarks lcb
python scripts/run_benchmarks.py --model-id {model_id} --benchmarks lcb
```

## Data generation
12 changes: 7 additions & 5 deletions setup.py
@@ -39,7 +39,7 @@


# IMPORTANT: all dependencies should be listed here with their version requirements, if any.
# * If a dependency is fast-moving (e.g. transformers), pin to the exact version
# * If a dependency is fast-moving (e.g. trl), pin to the exact version
_deps = [
"accelerate>=1.2.1",
"bitsandbytes>=0.43.0",
@@ -53,9 +53,10 @@
"hf_transfer>=0.1.4",
"huggingface-hub[cli]>=0.19.2,<1.0",
"isort>=5.12.0",
"langdetect", # Needed for LightEval's extended tasks
"latex2sympy2_extended>=1.0.6",
"liger_kernel==0.5.2",
"lighteval @ git+https://github.com/huggingface/lighteval.git@86f62259f105ae164f655e0b91c92a823a742724#egg=lighteval[math]",
"lighteval @ git+https://github.com/huggingface/lighteval.git@ebb7377b39a48ab0691e6fbd9dea57e9fe290a7e",
"math-verify==0.5.2", # Used for math verification in grpo
"packaging>=23.0",
"parameterized>=0.9.0",
@@ -68,7 +69,7 @@
"torch==2.5.1",
"transformers @ git+https://github.com/huggingface/transformers.git@main",
"trl @ git+https://github.com/huggingface/trl.git@main",
"vllm==0.7.1",
"vllm==0.7.2",
"wandb>=0.19.1",
]

@@ -89,10 +90,9 @@ def deps_list(*pkgs):
extras["tests"] = deps_list("pytest", "parameterized", "math-verify")
extras["torch"] = deps_list("torch")
extras["quality"] = deps_list("ruff", "isort", "flake8")
extras["train"] = deps_list("flash_attn")
extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv")
extras["eval"] = deps_list("lighteval", "math-verify")
extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["train"]
extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"]

# core dependencies shared across the whole project - keep this to a bare minimum :)
install_requires = [
@@ -103,6 +103,7 @@ def deps_list(*pkgs):
deps["deepspeed"],
deps["hf_transfer"],
deps["huggingface-hub"],
deps["langdetect"],
deps["latex2sympy2_extended"],
deps["math-verify"],
deps["liger_kernel"],
@@ -111,6 +112,7 @@ def deps_list(*pkgs):
deps["sentencepiece"],
deps["transformers"],
deps["trl"],
deps["wandb"],
]

setup(
15 changes: 11 additions & 4 deletions slurm/evaluate.slurm
@@ -29,9 +29,16 @@ NUM_GPUS=$(nvidia-smi -L | wc -l)
if [ "$TENSOR_PARALLEL" = "True" ]; then
# use TP to shard model across NUM_GPUS
export VLLM_WORKER_MULTIPROC_METHOD=spawn
MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
# FIXME: lighteval is broken on `main` so we need to manually pass the generation params
MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}"
else
MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}"
fi

# FIXME: enable sampling for pass@1 (remove once this is fixed on lighteval side). We use the defaults from Qwen2.5-Coder: https://github.com/QwenLM/Qwen2.5-Coder/blob/main/qwencoder-eval/instruct/livecode_bench/lcb_runner/runner/parser.py#L8
if [ "$TASK_NAME" = "lcb" ]; then
MODEL_ARGS="${MODEL_ARGS/temperature:0.0/temperature:0.2}"
MODEL_ARGS="${MODEL_ARGS/generation_parameters={/generation_parameters={top_p:0.95,}"
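# Net effect of the two substitutions above (for illustration): generation_parameters={top_p:0.95,max_new_tokens:32768,temperature:0.2}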
fi

LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard"
@@ -48,14 +55,14 @@ echo "Eval results will be saved to $OUTPUT_DIR"
# Check if "custom" is a substring of TASKS
if [[ $TASKS == *"custom"* ]]; then
echo "Custom task detected. Running custom task evaluation script ..."
lighteval vllm $MODEL_ARGS $TASKS \
lighteval vllm "$MODEL_ARGS" $TASKS \
--custom-tasks "src/open_r1/evaluate.py" \
--use-chat-template \
--output-dir $OUTPUT_DIR \
--save-details \
${7:+--system-prompt "$7"}
else
lighteval vllm $MODEL_ARGS $TASKS \
lighteval vllm "$MODEL_ARGS" $TASKS \
--use-chat-template \
--output-dir $OUTPUT_DIR \
--save-details \