Here're some resources about Evaluation on LLMs Inference
tag: LLM Decontaminator
| UC Berkeley
| SJTU
paper link: here
code link: here
citation:
@misc{yang2023rethinking,
  author        = {Yang, Shuo and Chiang, Wei-Lin and Zheng, Lianmin and Gonzalez, Joseph E. and Stoica, Ion},
  title         = {Rethinking Benchmark and Contamination for Language Models with Rephrased Samples},
  year          = {2023},
  eprint        = {2311.04850},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: Benchmark Cheater
| RUC
paper link: here
code link: here
citation:
@misc{zhou2023dont,
  author        = {Zhou, Kun and Zhu, Yutao and Chen, Zhipeng and Chen, Wentong and Zhao, Wayne Xin and Chen, Xu and Lin, Yankai and Wen, Ji-Rong and Han, Jiawei},
  title         = {Don't Make Your {LLM} an Evaluation Benchmark Cheater},
  year          = {2023},
  eprint        = {2311.01964},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
GPT-Fathom- Benchmarking Large Language Models to Decipher the Evolutionary Path towards GPT-4 and Beyond
tag: GPT-Fathom
| NAACL24
| ByteDance
paper link: here
code link: here
citation:
@misc{zheng2023gpt,
  author        = {Zheng, Shen and Zhang, Yuyu and Zhu, Yijie and Xi, Chenguang and Gao, Pengyang and Zhou, Xun and Chang, Kevin Chen-Chuan},
  title         = {{GPT-Fathom}: Benchmarking Large Language Models to Decipher the Evolutionary Path towards {GPT-4} and Beyond},
  year          = {2023},
  eprint        = {2309.16583},
  archivePrefix = {arXiv},
}
tag: RGB
| RAG
| AAAI24
| UCAS
paper link: here
code link: here
citation:
@misc{chen2023benchmarking,
  author        = {Chen, Jiawei and Lin, Hongyu and Han, Xianpei and Sun, Le},
  title         = {Benchmarking Large Language Models in Retrieval-Augmented Generation},
  year          = {2023},
  eprint        = {2309.01431},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: LongBench
| ACL24
| Tsinghua University
paper link: here
code link: here
citation:
@misc{bai2023longbench,
  author        = {Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and Tang, Jiankai and Huang, Zhidian and Du, Zhengxiao and Liu, Xiao and Zeng, Aohan and Hou, Lei and Dong, Yuxiao and Tang, Jie and Li, Juanzi},
  title         = {{LongBench}: A Bilingual, Multitask Benchmark for Long Context Understanding},
  year          = {2023},
  eprint        = {2308.14508},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: LLM-as-a-judge
| MT-Bench
| Chatbot Arena
| NIPS23
| UC Berkeley
paper link: here
code link: here
homepage link: here
citation:
@misc{zheng2023judging,
  author        = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
  title         = {Judging {LLM-as-a-Judge} with {MT-Bench} and {Chatbot Arena}},
  year          = {2023},
  eprint        = {2306.05685},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: InstructEval
| ACL24
| DeCLaRe Lab
paper link: here
code link: here
dataset link: here
citation:
@misc{chia2023instructeval,
  author        = {Chia, Yew Ken and Hong, Pengfei and Bing, Lidong and Poria, Soujanya},
  title         = {{INSTRUCTEVAL}: Towards Holistic Evaluation of Instruction-Tuned Large Language Models},
  year          = {2023},
  eprint        = {2306.04757},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: C-eval
| NIPS23
| SJTU
| Tsinghua University
paper link: here
code link: here
homepage link: here
citation:
@misc{huang2023c,
  author        = {Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and others},
  title         = {{C-Eval}: A Multi-Level Multi-Discipline {Chinese} Evaluation Suite for Foundation Models},
  year          = {2023},
  eprint        = {2305.08322},
  archivePrefix = {arXiv},
}
tag: OpenAGI
| RLTF
| NIPS23
| Rutgers University
paper link: here
code link: here
citation:
@misc{ge2023openagi,
  author        = {Ge, Yingqiang and Hua, Wenyue and Ji, Jianchao and Tan, Juntao and Xu, Shuyuan and Zhang, Yongfeng},
  title         = {{OpenAGI}: When {LLM} Meets Domain Experts},
  year          = {2023},
  eprint        = {2304.04370},
  archivePrefix = {arXiv},
}
tag: HELM
| TMLR23
| Stanford University
paper link: here
code link: here
citation:
@misc{liang2022holistic,
  author        = {Liang, Percy and Bommasani, Rishi and Lee, Tony and Tsipras, Dimitris and Soylu, Dilara and Yasunaga, Michihiro and Zhang, Yian and Narayanan, Deepak and Wu, Yuhuai and Kumar, Ananya and others},
  title         = {Holistic Evaluation of Language Models},
  year          = {2022},
  eprint        = {2211.09110},
  archivePrefix = {arXiv},
}
tag: Beyond Perplexity
| ACL21
| University of Cambridge
paper link: here
citation:
@misc{meister2021language,
  author        = {Meister, Clara and Cotterell, Ryan},
  title         = {Language Model Evaluation Beyond Perplexity},
  year          = {2021},
  eprint        = {2106.00085},
  archivePrefix = {arXiv},
}
tag: MMLU
| ICLR21
| UC Berkeley
paper link: here
code link: here
citation:
@misc{hendrycks2020measuring,
  author        = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  title         = {Measuring Massive Multitask Language Understanding},
  year          = {2020},
  eprint        = {2009.03300},
  archivePrefix = {arXiv},
}