Commit 241e1f6

Yann/length correction (#258)
* [ENH] update codebase to use length-corrected AlpacaEval
* install scikit-learn
* get_length_controlled_winrate only for AE2
* pass tests
* pass doctests
* update added models
* add Claude and Mistral annotators
* update website
* add logging of extreme changes
* finish updating the README
* push benchmark analysis
* merge new models
* add all the figures for length gameability
* all figures for the paper
* add all the weights
* change turbo to preview
* test doc formatting
* path to correlation figure
* smaller figure
* no AE figure
* figure notebook
1 parent 1e9c107 commit 241e1f6
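The headline change is a length-controlled (LC) win rate reported alongside the raw win rate, computed in the new `alpaca_eval.metrics.glm_winrate` module. As a rough intuition only (a simplified sketch, not the repo's actual GLM; the function name, feature design, and normalization constant below are illustrative assumptions): regress the annotator's preference on the length difference between model and baseline outputs, then predict the win rate at a length difference of zero.

```python
# Simplified sketch of the length-controlled idea; the real implementation
# lives in alpaca_eval/metrics/glm_winrate.py and uses a richer GLM. The
# function name and the 1000-char normalization are assumptions.
import numpy as np
from sklearn.linear_model import LogisticRegression

def sketch_lc_winrate(preferences, len_model, len_baseline):
    """preferences: 1 where the model's output beat the baseline's, else 0."""
    # Bounded length-difference feature, so huge gaps do not dominate the fit.
    delta = np.tanh((np.asarray(len_model) - np.asarray(len_baseline)) / 1000.0)
    glm = LogisticRegression().fit(delta.reshape(-1, 1), preferences)
    # Counterfactual win rate if both outputs had equal length.
    return 100.0 * float(glm.predict_proba([[0.0]])[0, 1])
```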

File tree

52 files changed, +5272 -6924 lines


README.md (+178 -55; large diff not rendered by default)

docs/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv (+101 -102; large diff not rendered by default)

docs/data_AlpacaEval_2/alpaca_eval_cot_gpt4_turbo_fn_leaderboard.csv (+1 -1)
@@ -1,5 +1,5 @@
 name,win_rate,avg_length,link,samples,filter
-GPT-4 Turbo,50.0,2049.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
+GPT-4 Preview,50.0,2049.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
 Yi 34B Chat,35.3416149068323,2123.0,https://huggingface.co/01-ai/Yi-34B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Yi-34B-Chat/model_outputs.json,minimal
 GPT-4,20.0,1365.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,minimal
 Mixtral 8x7B v0.1,19.937888198757765,1465.0,https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mixtral-8x7B-Instruct-v0.1/model_outputs.json,minimal

docs/data_AlpacaEval_2/alpaca_eval_gpt4_turbo_fn_leaderboard.csv (+1 -1)
@@ -1,5 +1,5 @@
 name,win_rate,avg_length,link,samples,filter
-GPT-4 Turbo,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
+GPT-4 Preview,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
 Yi 34B Chat,34.84472049689441,2123,https://huggingface.co/01-ai/Yi-34B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Yi-34B-Chat/model_outputs.json,minimal
 GPT-4,25.093167701863354,1365,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,minimal
 Mixtral 8x7B v0.1,22.795031055900623,1465,https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mixtral-8x7B-Instruct-v0.1/model_outputs.json,minimal
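Both function-based leaderboards rename the baseline row from GPT-4 Turbo to GPT-4 Preview while keeping the same `gpt4_turbo` results path. A quick sanity check on the exported CSV (a hypothetical snippet; the raw-file URL is inferred from the repo layout, not taken from the commit): the baseline is judged against itself, so its win rate should stay pinned at exactly 50.0.

```python
# Hypothetical check; the raw.githubusercontent.com URL is an inferred path.
import pandas as pd

url = ("https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/"
       "docs/data_AlpacaEval_2/alpaca_eval_gpt4_turbo_fn_leaderboard.csv")
df = pd.read_csv(url)
# The baseline is compared against itself, so its win rate is exactly 50.0.
assert df.loc[df["name"] == "GPT-4 Preview", "win_rate"].iloc[0] == 50.0
```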

docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv (+124 -136; large diff not rendered by default)

docs/format_export_leaderboards.py (+30 -2)
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 from alpaca_eval.constants import MODELS_CONFIG_DIR, PRECOMPUTED_LEADERBOARDS
+from alpaca_eval.metrics.glm_winrate import get_is_extreme_changes
 from alpaca_eval.utils import load_configs, load_or_convert_to_dataframe
 
 CURRENT_DIR = Path(__file__).parents[1]
@@ -11,7 +12,14 @@
     df = load_or_convert_to_dataframe(leaderboard_file)
     df["link"] = ""
     df["samples"] = ""
-    df = df[["win_rate", "avg_length", "link", "samples", "mode"]]
+    cols_to_keep = ["win_rate", "avg_length", "link", "samples", "mode"]
+    if "length_controlled_winrate" in df.columns:
+        cols_to_keep = ["length_controlled_winrate"] + cols_to_keep
+    df = df[cols_to_keep]
+
+    # drop mode == 'dev'
+    df = df[df["mode"] != "dev"]
+
     df = df.rename(columns={"mode": "filter"})
     df = df.reset_index(names="name")
     for idx in range(len(df)):
@@ -36,7 +44,27 @@
         df.loc[
             idx, "samples"
         ] = f"https://github.com/tatsu-lab/alpaca_eval/blob/main/results/{informal_name}/model_outputs.json"
-    df = df.sort_values(by=["win_rate"], ascending=False)
+
+    # if "length_controlled_winrate" never nan then we can use it as the main metric
+    if "length_controlled_winrate" in cols_to_keep and df["length_controlled_winrate"].notna().all():
+        df = df.sort_values(by=["length_controlled_winrate"], ascending=False)
+    else:
+        df = df.sort_values(by=["win_rate"], ascending=False)
+
+    # run get_is_extreme_changes on each row where length_controlled_winrate is not nan to avoid merging PRs
+    # where the length controlled results seem very suspicious
+    if "length_controlled_winrate" in cols_to_keep:
+        idx_notna = df["length_controlled_winrate"].notna()
+        arr_is_extreme = df[idx_notna].apply(
+            lambda row: get_is_extreme_changes(row["win_rate"], row["length_controlled_winrate"], min_warn=False),
+            axis=1,
+        )
+        if arr_is_extreme.any():
+            raise ValueError(
+                f"Found extreme changes in the length controlled winrate. Please check the following rows: "
+                f"{df[idx_notna][arr_is_extreme][['name', 'win_rate','length_controlled_winrate']]}"
+            )
+
     save_dir = Path("docs") / leaderboard_file.parent.name
     save_dir.mkdir(exist_ok=True, parents=True)
     df.to_csv(save_dir / leaderboard_file.name, index=False)
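The script now refuses to export a leaderboard whenever length control moves a win rate implausibly far, so suspicious length-controlled results fail loudly instead of being merged. `get_is_extreme_changes` is imported from `alpaca_eval.metrics.glm_winrate`; the stand-in below only illustrates the kind of check implied by the call site (the signature matches the call above, but both thresholds are invented):

```python
# Illustrative stand-in for alpaca_eval.metrics.glm_winrate.get_is_extreme_changes.
# Signature taken from the call site above; the thresholds are invented.
def get_is_extreme_changes(win_rate: float, lc_win_rate: float, min_warn: bool = True) -> bool:
    """Return True when length control shifted a win rate suspiciously far."""
    delta = abs(lc_win_rate - win_rate)
    if min_warn and delta > 5.0:  # invented soft threshold
        print(f"warning: length control moved the win rate by {delta:.1f} points")
    return delta > 15.0  # invented hard threshold for "extreme"
```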

docs/index.html (+25 -14)
@@ -73,7 +73,7 @@
 
 th:first-child,
 td:first-child {
-    width: 70%;
+    width: 60%;
     padding-left: 30px;
     text-align: left;
 }
@@ -86,7 +86,7 @@
 
 th:last-child,
 td:last-child {
-    width: 15%;
+    width: 17%;
     padding-right: 30px;
 }
 
@@ -169,11 +169,11 @@ <h1>AlpacaEval
 <br>
 <h2>An Automatic Evaluator for Instruction-following Language Models</h2>
 <!-- <small id="alpaca_eval_info" style="color: #777;">-->
-<!--     Baseline: GPT-4 Turbo &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo-->
+<!--     Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview-->
 <!-- </small>-->
 <!-- <br>-->
 <small id="caution" style="color: #8C1515;">
-    Caution: GPT-4 may favor models with longer outputs and/or those that were fine-tuned on GPT-4 outputs.
+    <b> Length-controlled</b> (LC) win rates alleviate length biases of GPT-4, but it may favor models finetuned on its outputs.
 </small>
 <br>
 <a href="https://github.com/tatsu-lab/alpaca_eval">
@@ -205,15 +205,15 @@ <h2>An Automatic Evaluator for Instruction-following Language Models</h2>
 
 <div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;">
     <small id="alpaca_eval_info" style="color: #777;">
-        Baseline: GPT-4 Turbo &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo
+        Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview
     </small>
 </div>
 
 <table id="leaderboard">
     <tr>
-        <th>Model Name</th>
+        <th><br>Model Name</th>
+        <th>LC Win Rate</th>
         <th>Win Rate</th>
-        <th>Length</th>
     </tr>
 </table>
 
@@ -233,7 +233,7 @@ <h2>About AlpacaEval</h2>
 <a href="https://crfm.stanford.edu/2023/05/22/alpaca-farm.html">AlpacaFarm</a>
 evaluation set,
 which tests the ability of models to follow general user instructions.
-These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Turbo for AlpacaEval 2.0) by
+These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Preview for AlpacaEval 2.0) by
 the provided GPT-4 based auto-annotators,
 which results in the win rates presented above.
 AlpacaEval displays a high agreement rate with ground truth human annotations,
@@ -314,7 +314,7 @@ <h2>AlpacaEval limitations</h2>
 complete: function (results) {
     console.log(results.data);
     results.data.forEach(row => {
-        if (row['name'] || row['win_rate'] || row['avg_length']) {
+        if (row['name'] || row['win_rate'] || row['length_controlled_winrate']) { //|| row['avg_length']
             let filter = row['filter'];
 
             if ((communityRadio.checked && (filter === 'verified' || filter === 'minimal' || filter === 'community')) ||
@@ -323,7 +323,8 @@ <h2>AlpacaEval limitations</h2>
             const tr = document.createElement('tr');
             const nameTd = document.createElement('td');
             const winRateTd = document.createElement('td');
-            const lengthTd = document.createElement('td');
+            //const lengthTd = document.createElement('td');
+            const lenWinRateTd = document.createElement('td');
 
             if (row['link'] && row['link'].trim() !== '') {
                 const a = document.createElement('a');
@@ -344,12 +345,22 @@ <h2>AlpacaEval limitations</h2>
                 nameTd.appendChild(samplesLink);
             }
 
-            winRateTd.textContent = Number(row['win_rate']).toFixed(2) + '%';
-            lengthTd.textContent = Math.round(Number(row['avg_length'])).toString() ;
+            winRateTd.textContent = Number(row['win_rate']).toFixed(1) + '%';
+
+            LCWinRate = Number(row['length_controlled_winrate'])
+            if (Number.isNaN(LCWinRate)) {
+                lenWinRateTd.textContent = '';
+            } else {
+                lenWinRateTd.textContent = LCWinRate.toFixed(1) + '%';
+            }
+            //lenWinRateTd.textContent = Number(row['length_controlled_winrate']).toFixed(1) + '%';
+            //lengthTd.textContent = Math.round(Number(row['avg_length'])).toString() ;
+
 
             tr.appendChild(nameTd);
+            tr.appendChild(lenWinRateTd);
             tr.appendChild(winRateTd);
-            tr.appendChild(lengthTd);
+            //tr.appendChild(lengthTd);
 
             table.appendChild(tr);
         }
@@ -362,7 +373,7 @@ <h2>AlpacaEval limitations</h2>
 function updateInfoMessage(version) {
     let infoText;
     if (version === 'alpaca_eval_2') {
-        infoText = 'Baseline: GPT-4 Turbo &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo';
+        infoText = 'Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview';
     } else if (version === 'alpaca_eval') {
         infoText = 'Baseline: Davinci003 &nbsp; | &nbsp; Auto-annotator: GPT-4';
     }
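The table now leads with the LC win rate and leaves the cell blank for rows whose `length_controlled_winrate` field is missing (those parse to `NaN` in the browser). For anyone post-processing the CSVs outside the site, the same display rule looks roughly like this in Python (a sketch, not code from this commit):

```python
# Sketch of the front end's display rule: blank cell instead of "NaN%".
import math

def format_lc(value) -> str:
    try:
        lc = float(value)
    except (TypeError, ValueError):  # missing column or empty string
        return ""
    return "" if math.isnan(lc) else f"{lc:.1f}%"

assert format_lc("40.39177606350116") == "40.4%"
assert format_lc("") == ""
```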

figures/all_metrics_length.png (144 KB)
figures/annotator_bias.png (170 KB)
figures/causal_graph.png (156 KB)
figures/chat_correlations.png (376 KB)
figures/chat_correlations_no_ae.png (333 KB)
figures/different_baselines.png (732 KB)
figures/lc_ae_leaderboard.png (1.3 MB)
figures/length_gameability.png (538 KB)

notebooks/benchmarks.csv (+71)
@@ -0,0 +1,71 @@
+Model,"Arena Elo
+[Feb 2, 2024]",LC AlpacaEval 2.0,AlpacaEval 2.0,AlpacaEval 1.0,"MT-bench
+(multi-turn)",WildBench,"Open LLM
+(average)","ARC-C
+(25-shot)","HellaSwag
+(10-shot)","MMLU
+(5-shot)","TruthfulQA
+(0-shot)","WinoGrande
+(5-shot)","GSM-8K
+(5-shot)","GPT4All
+(average)","AGI Eval
+(en)","HELM
+Lite","BBH, cot
+(3-shot)","HumanEval
+(pass @1)","LLMonitor
+(01-10)","OpenComp.
+(en, avg)","MBPP
+(pass @1)",Output Length
+gpt4_1106_preview,1251.0,50.0,50.0,97.69900497512438,9.32,,,,,80.5,,,,,,0.834,83.9,85.4,,,83.0,2049.0
+gpt4_0125_preview,1249.0,,,,,940.6,,,,,,,,,,,,,,,,
+claude-3-opus-20240229,1247.0,40.39177606350116,29.04176413403727,,,852.6,,,,86.8,,,,,,,,,,,,1388.0
+claude-3-sonnet-20240229,1190.0,34.87247436243302,25.556325292273296,,,835.8,,,,79.0,,,,,,,,,,,,1420.0
+gpt4_0314,1185.0,35.30706121640206,22.073258928708075,94.78260869565216,8.96,,,96.3,95.3,86.4,59.0,,,,57.0,,86.7,88.4,93.0,73.3,,1371.0
+gpt4_0613,1160.0,30.18332231673423,15.75503808763975,93.78109452736318,9.18,,,,,,,,,,57.0,0.962,86.7,88.4,89.0,73.3,,1140.0
+mistral-large-2402,1155.0,32.65207998531868,21.43877598137888,,,824.2,,,,81.2,,,,,,,,,,,,1362.0
+Qwen1.5-72B-Chat,1146.0,36.571754111987296,26.49828339562733,,8.61,,,,,77.5,,,,,,,,,,,,1549.0
+claude,1145.0,27.289504443727107,16.98534361236025,91.5527950310559,7.9,,,,,77.0,,,,,49.7,0.724,67.3,56.0,66.0,46.3,,1082.0
+mistral-medium,1145.0,28.614337401726104,21.855772543652176,96.83229813664596,8.61,,,89.9,88.0,75.3,,88.0,66.7,,,,,,,,62.3,1500.0
+claude-2,1126.0,28.155196141629148,17.188240356708075,91.35572139303484,8.06,,,,,78.5,,,,,,0.679,,71.2,68.0,,,1069.0
+Mistral-Next,1123.0,,,,,,,,,,,,,,,,,,,,,
+Gemini Pro (Dev API),1118.0,,,,,,,,,71.8,,,,,,,,,,,,
+claude-2.1,1115.0,25.251943886133027,15.733506736409938,87.0807453416149,8.18,,,,,,,,,,,0.593,,,,,,1096.0
+Mixtral-8x7B-Instruct-v0.1,1114.0,23.68848260134481,18.25531762637268,94.78260869565216,8.3,765.7,72.62,70.22,87.63,70.6,64.58,81.37,60.73,76.41,45.3,0.728,67.0,54.9,,56.8,60.7,1465.0
+gpt-3.5-turbo-0613,1113.0,22.35251298054288,14.09579857390062,,8.39,,,,,,,,,,,0.507,71.0,72.6,81.0,,,1331.0
+gemini-pro,1110.0,24.38177610802152,18.177644540571432,79.66417910447761,,788.0,,,,71.8,,,,,,,65.6,63.4,,,72.9,1456.0
+GPT-3.5-Turbo-0314,1104.0,,,,7.94,,,85.5,70.6,70.0,,85.2,57.1,,43.2,,,73.2,79.0,63.5,81.6,
+claude-instant-1.2,1104.0,25.61225902543337,16.12739962159006,,7.85,,,,,73.4,,,,,,,,52.8,60.0,,,1112.0
+wizardlm-70b,1102.0,17.575060737493747,14.383896086782608,,7.71,,61.25,65.44,84.41,63.7,54.81,80.82,17.97,,,,,,,,,1545.0
+Yi-34B-Chat,1099.0,27.19054787762733,29.65994671879504,94.08468244084682,,743.9,65.32,65.44,84.16,73.5,55.37,80.11,31.92,72.13,50.8,0.772,71.7,,,63.3,,2123.0
+tulu-2-dpo-70b,1097.0,21.238610038371124,15.982854374136648,95.03105590062113,7.89,685.9,73.77,72.1,88.99,69.84,65.78,83.27,62.62,,,,66.0,,,,,1418.0
+GPT-3.5-Turbo-0125,1096.0,,,,,736.4,,,,,,,,,,,,,,,,
+vicuna-33b-v1.3,1089.0,17.574575310874923,12.705947921540371,88.99253731,7.12,,58.5,,,59.2,,,,,37.3,,52.0,,,53.0,,1479.0
+Starling-LM-7B-alpha,1084.0,14.690471079424972,14.24592352162733,,8.09,,67.13,63.82,84.9,63.9,46.39,80.58,62.4,72.72,40.1,,,,,,,1895.0
+llama-2-70b-chat-hf,1082.0,14.689648588392544,13.88825834374378,92.66169154228857,6.86,697.4,62.4,64.59,85.88,63.0,52.8,80.51,26.69,,45.0,,60.8,,60.0,58.6,,1790.0
+OpenHermes-2.5-Mistral-7B,1079.0,16.248577696674843,10.340415705751552,,,,61.52,64.93,84.18,63.8,52.24,78.06,26.08,73.12,43.0,,,48.2,,,,1107.0
+NV-Llama2-70B-SteerLM-Chat,1076.0,,,,7.54,,,,,68.5,,,,,,,,,,,,
+Mistral-7B-Instruct-v0.2,1073.0,17.111251846021165,14.722772657714286,92.77708592777088,7.6,,,,,,,,,,,,,,,,,1676.0
+deepseek-llm-67b-chat,1073.0,17.843384089909343,12.093422264919258,,,,71.79,67.75,86.82,72.42,55.85,84.21,63.68,,,,,,,,,1151.0
+OpenChat-3.5,1071.0,,,,7.81,,61.24,63.91,84.79,64.3,46.38,80.58,26.84,72.92,42.7,,,55.5,,,,
+pplx-70b-online,1068.0,,,,,,,,,,,,,,,,,,,,,
+SOLAR-10.7B-Instruct-v1.0,1065.0,,,,7.58,,74.2,71.08,88.16,66.2,71.43,83.58,64.75,75.11,47.6,,,,,,42.9,
+dolphin-2.2.1-mistral-7b,1058.0,13.121477650433736,9.039799728223604,,,,64.93,63.31,83.76,63.2,53.11,78.14,48.07,72.24,39.2,,59.8,,,58.0,,1130.0
+wizardlm-13b-v1.2,1054.0,14.462590694316631,12.027480342770186,89.16562889,7.2,,54.76,59.04,82.21,52.7,47.27,71.9,13.5,,,,,,,,,1635.0
+zephyr-7b-beta,1046.0,13.203198493136666,10.992885755354038,90.5977584059776,7.34,662.3,61.95,62.03,84.36,61.4,57.45,77.74,29.04,71.83,40.6,,,30.0,,,41.1,1444.0
+llama-2-13b-chat-hf,1043.0,8.436014548885215,7.702309957875775,81.09452736318407,6.65,678.2,54.91,59.04,81.94,53.6,44.12,74.51,15.24,,33.6,0.348,58.2,,50.0,50.3,,1513.0
+MPT-30B-chat,1042.0,,,,6.39,,55.38,58.7,82.54,50.4,52.42,75.3,12.13,,,,,,40.0,,,
+CodeLlama-34B-instruct,1040.0,,,,,,57.29,54.27,76.92,53.7,44.44,74.59,37.98,,,,,51.8,34.0,,,
+vicuna-13b-v1.5,1037.0,10.484438298504218,6.722122014857143,,6.57,593.2,55.4,57.08,81.24,55.8,51.51,74.66,11.3,63.1,36.8,,51.5,17.1,50.0,52.1,,1061.0
+pplx-7b-online,1035.0,,,,,,,,,,,,,,,,,,,,,
+zephyr-7b-alpha,1033.0,10.289760888704258,8.352663968198758,85.7587064676617,6.88,,59.5,61.01,84.04,61.4,57.9,78.61,14.03,72.24,38.0,,,,,,,1302.0
+Qwen-14B-Chat,1032.0,12.378741790737235,7.502333484720497,,6.96,,,,,66.5,,,59.7,,39.6,,53.7,43.9,,,,1013.0
+guanaco-33b,1031.0,5.690019090866207,5.002493724956522,65.96273292,6.53,,,,,57.6,,,,,,,,,43.0,,,1311.0
+gemma-7b-it,1029.0,10.425760403690134,6.937294379677018,,,676.5,,,,64.3,,,,,,,,,,,,1115.0
+llama-2-7b-chat-hf,1027.0,5.354821279508294,4.961339547167702,71.36645962732919,6.27,651.9,50.74,52.9,78.55,45.8,45.57,71.74,7.35,,29.6,0.217,35.6,,50.0,,,1479.0
+falcon-180b-chat,1026.0,,,,,,67.85,69.45,88.86,68.0,45.47,86.9,45.94,,,,,,67.0,,,
+Mistral-7B-Instruct-v0.1,1002.0,,,,6.84,545.9,54.96,54.52,75.63,55.4,56.28,73.72,14.25,67.95,33.5,0.438,56.7,28.7,57.0,53.6,,
+vicuna-7b-v1.5,1001.0,7.616892731870527,4.797493939167703,,6.17,,50.1,53.24,77.39,49.8,50.34,72.14,8.19,61.0,31.4,,43.4,11.6,41.0,,,1083.0
+gemma-2b-it,985.0,5.437453620377121,3.4019714381366457,,,,,,,42.3,,,,,,,,,,,,1041.0
+chatglm2-6b,925.0,4.35928292679035,2.7621847964596284,47.12858926,4.96,,,,,45.5,,,,,,,,,,,,1027.0
+oasst-sft-pythia-12b,893.0,3.270102114456748,1.790114083180124,25.96273292,4.32,,40.77,46.42,70.0,26.19,39.19,62.19,0.61,,,,,,,,,726.0
+Yi-34Bx2-MoE-60B,,,,,,,76.72,71.08,85.23,77.5,66.19,84.85,75.51,,,,,,,,,
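Note that the header row of `benchmarks.csv` spreads several column names over quoted multi-line fields (e.g. `"Arena Elo` followed by `[Feb 2, 2024]"`), which conforming CSV parsers handle transparently. A usage sketch, assuming pandas and a local checkout of the repo:

```python
# Load the benchmark comparison table; pandas parses the quoted multi-line
# headers, which we then flatten for easier column access.
import pandas as pd

bench = pd.read_csv("notebooks/benchmarks.csv")
bench.columns = [c.replace("\n", " ") for c in bench.columns]
print(bench[["Model", "LC AlpacaEval 2.0", "AlpacaEval 2.0"]].head())
```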

notebooks/causal_graph.png (-31.4 KB; binary file not shown)

notebooks/figures_length_controlled.ipynb (+949; large diff not rendered by default)
