
Commit 2c8984a

[ENH] add example for LLama 3 vllm (#381)
1 parent 32c8c0d

File tree

2 files changed: +47 −0 lines changed
@@ -0,0 +1,31 @@
+<|im_start|>system
+You are a helpful assistant that ranks models by the quality of their answers.
+<|im_end|>
+<|im_start|>user
+I want you to create a leaderboard of different large language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be Python dictionaries.
+
+Here is the prompt:
+{
+    "instruction": """{instruction}""",
+}
+
+Here are the outputs of the models:
+[
+    {
+        "model": "model_1",
+        "answer": """{output_1}"""
+    },
+    {
+        "model": "model_2",
+        "answer": """{output_2}"""
+    }
+]
+
+Now please rank the models by the quality of their answers, so that the model with rank 1 has the best output. Then return a list of the model names and ranks, i.e., produce the following output:
+[
+    {'model': <model-name>, 'rank': <model-rank>},
+    {'model': <model-name>, 'rank': <model-rank>}
+]
+
+Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give.
+<|im_end|>
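As a side note (not part of the commit), here is a minimal sketch of how the `{instruction}`, `{output_1}`, and `{output_2}` placeholders in a template like the one above could be filled. Because the template also contains literal `{`/`}` braces for the JSON-style examples, plain `str.format` would raise a `KeyError`, so substituting each placeholder individually is safer. The `fill_template` helper is hypothetical, not from the repository.

```python
def fill_template(template: str, instruction: str, output_1: str, output_2: str) -> str:
    """Fill the named placeholders of a ranking prompt template.

    Hypothetical helper: str.format would choke on the literal { } braces
    in the template body, so each placeholder is replaced individually.
    """
    return (
        template
        .replace("{instruction}", instruction)
        .replace("{output_1}", output_1)
        .replace("{output_2}", output_2)
    )


# Example with a shortened template in the same style as the file above.
template = 'Q: """{instruction}"""\nA1: """{output_1}"""\nA2: """{output_2}"""'
filled = fill_template(template, "Say hi", "Hi there", "Hello")
```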
@@ -0,0 +1,16 @@
+alpaca_eval_vllm_llama3_70b_fn:
+  prompt_template: "alpaca_eval_vllm_llama3_70b_fn/alpaca_eval_fn.txt"
+  fn_completions: "vllm_local_completions"
+  completions_kwargs:
+    model_name: "/home/shared/Meta-Llama-3-70B-Instruct" # TODO: replace with path to the model
+    model_kwargs:
+      tokenizer_mode: "auto"
+      trust_remote_code: True
+      max_model_len: 7000
+      tensor_parallel_size: 2 # 2 GPUs
+    is_chatml_prompt: true
+    max_new_tokens: 100
+    temperature: 0.0
+    top_p: 1.0
+    batch_size: 128
+  fn_completion_parser: "ranking_parser"
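The config names `"ranking_parser"` as the `fn_completion_parser`. A rough sketch of what such a parser could do — the actual implementation in the library may differ — is to read the model's Python-literal response and map each model name to its rank:

```python
import ast


def parse_ranking(completion: str) -> dict:
    """Sketch of a ranking parser (the real ranking_parser may differ).

    Expects a completion shaped like the prompt's requested output, e.g.
    "[{'model': 'model_1', 'rank': 1}, {'model': 'model_2', 'rank': 2}]",
    and returns a {model_name: rank} dict. ast.literal_eval is used
    instead of eval so only Python literals are accepted.
    """
    ranking = ast.literal_eval(completion.strip())
    return {entry["model"]: entry["rank"] for entry in ranking}
```

Using `ast.literal_eval` rather than `eval` matters here because the completion comes from a model and should never be executed as arbitrary code.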
