|
| 1 | +Model,"Arena Elo |
| 2 | +[Feb 2, 2024]",LC AlpacaEval 2.0,AlpacaEval 2.0,AlpacaEval 1.0,"MT-bench |
| 3 | +(multi-turn)",WildBench,"Open LLM |
| 4 | +(average)","ARC-C |
| 5 | +(25-shot)","HellaSwag |
| 6 | +(10-shot)","MMLU |
| 7 | +(5-shot)","TruthfulQA |
| 8 | +(0-shot)","WinoGrande |
| 9 | +(5-shot)","GSM-8K |
| 10 | +(5-shot)","GPT4All |
| 11 | +(average)","AGI Eval |
| 12 | +(en)","HELM |
| 13 | +Lite","BBH, cot |
| 14 | +(3-shot)","HumanEval |
| 15 | +(pass @1)","LLMonitor |
| 16 | +(01-10)","OpenComp. |
| 17 | +(en, avg)","MBPP |
| 18 | +(pass @1)",Output Length |
| 19 | +gpt4_1106_preview,1251.0,50.0,50.0,97.69900497512438,9.32,,,,,80.5,,,,,,0.834,83.9,85.4,,,83.0,2049.0 |
| 20 | +gpt4_0125_preview,1249.0,,,,,940.6,,,,,,,,,,,,,,,, |
| 21 | +claude-3-opus-20240229,1247.0,40.39177606350116,29.04176413403727,,,852.6,,,,86.8,,,,,,,,,,,,1388.0 |
| 22 | +claude-3-sonnet-20240229,1190.0,34.87247436243302,25.556325292273296,,,835.8,,,,79.0,,,,,,,,,,,,1420.0 |
| 23 | +gpt4_0314,1185.0,35.30706121640206,22.073258928708075,94.78260869565216,8.96,,,96.3,95.3,86.4,59.0,,,,57.0,,86.7,88.4,93.0,73.3,,1371.0 |
| 24 | +gpt4_0613,1160.0,30.18332231673423,15.75503808763975,93.78109452736318,9.18,,,,,,,,,,57.0,0.962,86.7,88.4,89.0,73.3,,1140.0 |
| 25 | +mistral-large-2402,1155.0,32.65207998531868,21.43877598137888,,,824.2,,,,81.2,,,,,,,,,,,,1362.0 |
| 26 | +Qwen1.5-72B-Chat,1146.0,36.571754111987296,26.49828339562733,,8.61,,,,,77.5,,,,,,,,,,,,1549.0 |
| 27 | +claude,1145.0,27.289504443727107,16.98534361236025,91.5527950310559,7.9,,,,,77.0,,,,,49.7,0.724,67.3,56.0,66.0,46.3,,1082.0 |
| 28 | +mistral-medium,1145.0,28.614337401726104,21.855772543652176,96.83229813664596,8.61,,,89.9,88.0,75.3,,88.0,66.7,,,,,,,,62.3,1500.0 |
| 29 | +claude-2,1126.0,28.155196141629148,17.188240356708075,91.35572139303484,8.06,,,,,78.5,,,,,,0.679,,71.2,68.0,,,1069.0 |
| 30 | +Mistral-Next,1123.0,,,,,,,,,,,,,,,,,,,,, |
| 31 | +Gemini Pro (Dev API),1118.0,,,,,,,,,71.8,,,,,,,,,,,, |
| 32 | +claude-2.1,1115.0,25.251943886133027,15.733506736409938,87.0807453416149,8.18,,,,,,,,,,,0.593,,,,,,1096.0 |
| 33 | +Mixtral-8x7B-Instruct-v0.1,1114.0,23.68848260134481,18.25531762637268,94.78260869565216,8.3,765.7,72.62,70.22,87.63,70.6,64.58,81.37,60.73,76.41,45.3,0.728,67.0,54.9,,56.8,60.7,1465.0 |
| 34 | +gpt-3.5-turbo-0613,1113.0,22.35251298054288,14.09579857390062,,8.39,,,,,,,,,,,0.507,71.0,72.6,81.0,,,1331.0 |
| 35 | +gemini-pro,1110.0,24.38177610802152,18.177644540571432,79.66417910447761,,788.0,,,,71.8,,,,,,,65.6,63.4,,,72.9,1456.0 |
| 36 | +GPT-3.5-Turbo-0314,1104.0,,,,7.94,,,85.5,70.6,70.0,,85.2,57.1,,43.2,,,73.2,79.0,63.5,81.6, |
| 37 | +claude-instant-1.2,1104.0,25.61225902543337,16.12739962159006,,7.85,,,,,73.4,,,,,,,,52.8,60.0,,,1112.0 |
| 38 | +wizardlm-70b,1102.0,17.575060737493747,14.383896086782608,,7.71,,61.25,65.44,84.41,63.7,54.81,80.82,17.97,,,,,,,,,1545.0 |
| 39 | +Yi-34B-Chat,1099.0,27.19054787762733,29.65994671879504,94.08468244084682,,743.9,65.32,65.44,84.16,73.5,55.37,80.11,31.92,72.13,50.8,0.772,71.7,,,63.3,,2123.0 |
| 40 | +tulu-2-dpo-70b,1097.0,21.238610038371124,15.982854374136648,95.03105590062113,7.89,685.9,73.77,72.1,88.99,69.84,65.78,83.27,62.62,,,,66.0,,,,,1418.0 |
| 41 | +GPT-3.5-Turbo-0125,1096.0,,,,,736.4,,,,,,,,,,,,,,,, |
| 42 | +vicuna-33b-v1.3,1089.0,17.574575310874923,12.705947921540371,88.99253731,7.12,,58.5,,,59.2,,,,,37.3,,52.0,,,53.0,,1479.0 |
| 43 | +Starling-LM-7B-alpha,1084.0,14.690471079424972,14.24592352162733,,8.09,,67.13,63.82,84.9,63.9,46.39,80.58,62.4,72.72,40.1,,,,,,,1895.0 |
| 44 | +llama-2-70b-chat-hf,1082.0,14.689648588392544,13.88825834374378,92.66169154228857,6.86,697.4,62.4,64.59,85.88,63.0,52.8,80.51,26.69,,45.0,,60.8,,60.0,58.6,,1790.0 |
| 45 | +OpenHermes-2.5-Mistral-7B,1079.0,16.248577696674843,10.340415705751552,,,,61.52,64.93,84.18,63.8,52.24,78.06,26.08,73.12,43.0,,,48.2,,,,1107.0 |
| 46 | +NV-Llama2-70B-SteerLM-Chat,1076.0,,,,7.54,,,,,68.5,,,,,,,,,,,, |
| 47 | +Mistral-7B-Instruct-v0.2,1073.0,17.111251846021165,14.722772657714286,92.77708592777088,7.6,,,,,,,,,,,,,,,,,1676.0 |
| 48 | +deepseek-llm-67b-chat,1073.0,17.843384089909343,12.093422264919258,,,,71.79,67.75,86.82,72.42,55.85,84.21,63.68,,,,,,,,,1151.0 |
| 49 | +OpenChat-3.5,1071.0,,,,7.81,,61.24,63.91,84.79,64.3,46.38,80.58,26.84,72.92,42.7,,,55.5,,,, |
| 50 | +pplx-70b-online,1068.0,,,,,,,,,,,,,,,,,,,,, |
| 51 | +SOLAR-10.7B-Instruct-v1.0,1065.0,,,,7.58,,74.2,71.08,88.16,66.2,71.43,83.58,64.75,75.11,47.6,,,,,,42.9, |
| 52 | +dolphin-2.2.1-mistral-7b,1058.0,13.121477650433736,9.039799728223604,,,,64.93,63.31,83.76,63.2,53.11,78.14,48.07,72.24,39.2,,59.8,,,58.0,,1130.0 |
| 53 | +wizardlm-13b-v1.2,1054.0,14.462590694316631,12.027480342770186,89.16562889,7.2,,54.76,59.04,82.21,52.7,47.27,71.9,13.5,,,,,,,,,1635.0 |
| 54 | +zephyr-7b-beta,1046.0,13.203198493136666,10.992885755354038,90.5977584059776,7.34,662.3,61.95,62.03,84.36,61.4,57.45,77.74,29.04,71.83,40.6,,,30.0,,,41.1,1444.0 |
| 55 | +llama-2-13b-chat-hf,1043.0,8.436014548885215,7.702309957875775,81.09452736318407,6.65,678.2,54.91,59.04,81.94,53.6,44.12,74.51,15.24,,33.6,0.348,58.2,,50.0,50.3,,1513.0 |
| 56 | +MPT-30B-chat,1042.0,,,,6.39,,55.38,58.7,82.54,50.4,52.42,75.3,12.13,,,,,,40.0,,, |
| 57 | +CodeLlama-34B-instruct,1040.0,,,,,,57.29,54.27,76.92,53.7,44.44,74.59,37.98,,,,,51.8,34.0,,, |
| 58 | +vicuna-13b-v1.5,1037.0,10.484438298504218,6.722122014857143,,6.57,593.2,55.4,57.08,81.24,55.8,51.51,74.66,11.3,63.1,36.8,,51.5,17.1,50.0,52.1,,1061.0 |
| 59 | +pplx-7b-online,1035.0,,,,,,,,,,,,,,,,,,,,, |
| 60 | +zephyr-7b-alpha,1033.0,10.289760888704258,8.352663968198758,85.7587064676617,6.88,,59.5,61.01,84.04,61.4,57.9,78.61,14.03,72.24,38.0,,,,,,,1302.0 |
| 61 | +Qwen-14B-Chat,1032.0,12.378741790737235,7.502333484720497,,6.96,,,,,66.5,,,59.7,,39.6,,53.7,43.9,,,,1013.0 |
| 62 | +guanaco-33b,1031.0,5.690019090866207,5.002493724956522,65.96273292,6.53,,,,,57.6,,,,,,,,,43.0,,,1311.0 |
| 63 | +gemma-7b-it,1029.0,10.425760403690134,6.937294379677018,,,676.5,,,,64.3,,,,,,,,,,,,1115.0 |
| 64 | +llama-2-7b-chat-hf,1027.0,5.354821279508294,4.961339547167702,71.36645962732919,6.27,651.9,50.74,52.9,78.55,45.8,45.57,71.74,7.35,,29.6,0.217,35.6,,50.0,,,1479.0 |
| 65 | +falcon-180b-chat,1026.0,,,,,,67.85,69.45,88.86,68.0,45.47,86.9,45.94,,,,,,67.0,,, |
| 66 | +Mistral-7B-Instruct-v0.1,1002.0,,,,6.84,545.9,54.96,54.52,75.63,55.4,56.28,73.72,14.25,67.95,33.5,0.438,56.7,28.7,57.0,53.6,, |
| 67 | +vicuna-7b-v1.5,1001.0,7.616892731870527,4.797493939167703,,6.17,,50.1,53.24,77.39,49.8,50.34,72.14,8.19,61.0,31.4,,43.4,11.6,41.0,,,1083.0 |
| 68 | +gemma-2b-it,985.0,5.437453620377121,3.4019714381366457,,,,,,,42.3,,,,,,,,,,,,1041.0 |
| 69 | +chatglm2-6b,925.0,4.35928292679035,2.7621847964596284,47.12858926,4.96,,,,,45.5,,,,,,,,,,,,1027.0 |
| 70 | +oasst-sft-pythia-12b,893.0,3.270102114456748,1.790114083180124,25.96273292,4.32,,40.77,46.42,70.0,26.19,39.19,62.19,0.61,,,,,,,,,726.0 |
| 71 | +Yi-34Bx2-MoE-60B,,,,,,,76.72,71.08,85.23,77.5,66.19,84.85,75.51,,,,,,,,, |
0 commit comments