Here are some resources about alignment fine-tuning strategies for LLMs, especially instruction-following tuning (IFT)
tag: REINFORCE++
| OpenRLHF
| ICML23
| UC Berkeley
paper link: here
code link: here
citation:
@misc{hu2025reinforcesimpleefficientapproach,
  title         = {{REINFORCE++}: A Simple and Efficient Approach for Aligning Large Language Models},
  author        = {Hu, Jian},
  year          = {2025},
  eprint        = {2501.03262},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2501.03262},
}
tag: TULU 3
| RLVR
| Allen AI
paper link: here
blog link: here
code link: here
modelhub link: here
citation:
@misc{lambert2024tulu3pushingfrontiers,
  title         = {{T{\"U}LU} 3: Pushing Frontiers in Open Language Model Post-Training},
  author        = {Nathan Lambert and Jacob Morrison and Valentina Pyatkin and Shengyi Huang and Hamish Ivison and Faeze Brahman and Lester James V. Miranda and Alisa Liu and Nouha Dziri and Shane Lyu and Yuling Gu and Saumya Malik and Victoria Graf and Jena D. Hwang and Jiangjiang Yang and Ronan Le Bras and Oyvind Tafjord and Chris Wilhelm and Luca Soldaini and Noah A. Smith and Yizhong Wang and Pradeep Dasigi and Hannaneh Hajishirzi},
  year          = {2024},
  eprint        = {2411.15124},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2411.15124},
}
tag: DQO
| ByteDance
paper link: here
citation:
@article{liu2024enhancing,
  title   = {Enhancing Multi-Step Reasoning Abilities of Language Models through Direct {Q-Function} Optimization},
  author  = {Liu, Guanlin and Ji, Kaixuan and Zheng, Renjie and Wu, Zheng and Dun, Chen and Gu, Quanquan and Yan, Lin},
  journal = {arXiv preprint arXiv:2410.09302},
  year    = {2024},
}
tag: HybridFlow
| EuroSys25
| ByteDance
paper link: here
code link: here
citation:
@article{sheng2024hybridflow,
  title   = {{HybridFlow}: A Flexible and Efficient {RLHF} Framework},
  author  = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
  year    = {2024},
  journal = {arXiv preprint arXiv:2409.19256},
}
tag: CPO
| NIPS24
| Sea AI Lab
paper link: here
code link: here
citation:
@misc{zhang2024chainpreferenceoptimizationimproving,
  title         = {Chain of Preference Optimization: Improving {Chain-of-Thought} Reasoning in {LLMs}},
  author        = {Xuan Zhang and Chao Du and Tianyu Pang and Qian Liu and Wei Gao and Min Lin},
  year          = {2024},
  eprint        = {2406.09136},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2406.09136},
}
tag: RLAIF-V
| Tsinghua University
paper link: here
code link: here
citation:
@misc{yu2024rlaifvaligningmllmsopensource,
  title         = {{RLAIF-V}: Aligning {MLLMs} through Open-Source {AI} Feedback for Super {GPT-4V} Trustworthiness},
  author        = {Tianyu Yu and Haoye Zhang and Yuan Yao and Yunkai Dang and Da Chen and Xiaoman Lu and Ganqu Cui and Taiwen He and Zhiyuan Liu and Tat-Seng Chua and Maosong Sun},
  year          = {2024},
  eprint        = {2405.17220},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2405.17220},
}
tag: RLOO
| ACL24
| Cohere
paper link: here
citation:
@misc{ahmadian2024basicsrevisitingreinforcestyle,
  title         = {Back to Basics: Revisiting {REINFORCE} Style Optimization for Learning from Human Feedback in {LLMs}},
  author        = {Arash Ahmadian and Chris Cremer and Matthias Gallé and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet Üstün and Sara Hooker},
  year          = {2024},
  eprint        = {2402.14740},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2402.14740},
}
tag: OAIF
| DAP
| Google DeepMind
paper link: here
citation:
@misc{guo2024directlanguagemodelalignment,
  title         = {Direct Language Model Alignment from Online {AI} Feedback},
  author        = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Rame and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
  year          = {2024},
  eprint        = {2402.04792},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2402.04792},
}
tag: DeepSeekMath
| GRPO
| DeepSeek AI
| Tsinghua University
| Peking University
paper link: here
code link: here
citation:
@misc{shao2024deepseekmath,
  title         = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  author        = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
  year          = {2024},
  eprint        = {2402.03300},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.03300},
}
tag: SPIN
| Self-Play
| ICML24
| UCLA
paper link: here
code link: here
citation:
@misc{chen2024selfplay,
  author        = {Zixiang Chen and Yihe Deng and Huizhuo Yuan and Kaixuan Ji and Quanquan Gu},
  title         = {Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models},
  year          = {2024},
  eprint        = {2401.01335},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
tag: Zephyr
| dDPO
| COLM24
| HuggingFace
paper link: here
code link: here
modelhub link: here
citation:
@misc{tunstall2023zephyr,
  title         = {{Zephyr}: Direct Distillation of {LM} Alignment},
  author        = {Lewis Tunstall and Edward Beeching and Nathan Lambert and Nazneen Rajani and Kashif Rasul and Younes Belkada and Shengyi Huang and Leandro von Werra and Clémentine Fourrier and Nathan Habib and Nathan Sarrazin and Omar Sanseviero and Alexander M. Rush and Thomas Wolf},
  year          = {2023},
  eprint        = {2310.16944},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
tag: FD-Align
| NIPS23
| USTB
paper link: here
code link: here
citation:
@article{song2023fd,
  title   = {{FD-Align}: Feature Discrimination Alignment for Fine-tuning Pre-Trained Models in Few-Shot Learning},
  author  = {Song, Kun and Ma, Huimin and Zou, Bochao and Zhang, Huishuai and Huang, Weiran},
  journal = {arXiv preprint arXiv:2310.15105},
  year    = {2023},
}
tag: GPO
| ICLR24
| UCLA
paper link: here
code link: here
homepage link: here
citation:
@misc{zhao2023group,
  author        = {Siyan Zhao and John Dang and Aditya Grover},
  title         = {Group Preference Optimization: Few-Shot Alignment of Large Language Models},
  year          = {2023},
  eprint        = {2310.11523},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
tag: P3O
| UC Berkeley
paper link: here
citation:
@misc{wu2023pairwise,
  title         = {Pairwise Proximal Policy Optimization: Harnessing Relative Feedback for {LLM} Alignment},
  author        = {Tianhao Wu and Banghua Zhu and Ruoyu Zhang and Zhaojin Wen and Kannan Ramchandran and Jiantao Jiao},
  year          = {2023},
  eprint        = {2310.00212},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
tag: OpenChat
| C-RLFT
| ICLR24
| Tsinghua University
paper link: here
code link: here
modelhub link: here
citation:
@article{wang2023openchat,
  title   = {{OpenChat}: Advancing Open-Source Language Models with Mixed-Quality Data},
  author  = {Wang, Guan and Cheng, Sijie and Zhan, Xianyuan and Li, Xiangang and Song, Sen and Liu, Yang},
  journal = {arXiv preprint arXiv:2309.11235},
  year    = {2023},
}
tag: APA
| UC Berkeley
paper link: here
citation:
@misc{zhu2023finetuning,
  author        = {Banghua Zhu and Hiteshi Sharma and Felipe Vieira Frujeri and Shi Dong and Chenguang Zhu and Michael I. Jordan and Jiantao Jiao},
  title         = {Fine-Tuning Language Models with Advantage-Induced Policy Alignment},
  year          = {2023},
  eprint        = {2306.02231},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: DPO
| NIPS23
| Stanford University
paper link: here
citation:
@article{rafailov2023direct,
  author  = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D and Finn, Chelsea},
  title   = {Direct preference optimization: Your language model is secretly a reward model},
  journal = {arXiv preprint arXiv:2305.18290},
  year    = {2023},
}
tag: Lima
| NIPS23
| Meta
| CMU
paper link: here
citation:
@article{zhou2023lima,
  title   = {{LIMA}: Less Is More for Alignment},
  author  = {Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srini and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
  journal = {arXiv preprint arXiv:2305.11206},
  year    = {2023},
}
tag: Self-Align
| NIPS23
| IBM Research
| CMU
| MIT
paper link: here
code link: here
citation:
@article{sun2023principle,
  author  = {Sun, Zhiqing and Shen, Yikang and Zhou, Qinhong and Zhang, Hongxin and Chen, Zhenfang and Cox, David and Yang, Yiming and Gan, Chuang},
  title   = {Principle-driven self-alignment of language models from scratch with minimal human supervision},
  journal = {arXiv preprint arXiv:2305.03047},
  year    = {2023},
}
tag: WizardLM
| ICLR24
| Microsoft
| Peking University
paper link: here
code link: here
modelhub link: here
citation:
@misc{xu2023wizardlm,
  title         = {{WizardLM}: Empowering Large Language Models to Follow Complex Instructions},
  author        = {Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},
  year          = {2023},
  eprint        = {2304.12244},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: RRHF
| NIPS23
| DAMO Academy
| Alibaba Group
| Tsinghua University
paper link: here
code link: here
citation:
@article{yuan2023rrhf,
  title   = {{RRHF}: Rank Responses to Align Language Models with Human Feedback without Tears},
  author  = {Yuan, Zheng and Yuan, Hongyi and Tan, Chuanqi and Wang, Wei and Huang, Songfang and Huang, Fei},
  journal = {arXiv preprint arXiv:2304.05302},
  year    = {2023},
}
tag: OpenAGI
| RLTF
| NIPS23
| Rutgers University
paper link: here
code link: here
citation:
@article{ge2023openagi,
  title   = {{OpenAGI}: When {LLM} Meets Domain Experts},
  author  = {Ge, Yingqiang and Hua, Wenyue and Ji, Jianchao and Tan, Juntao and Xu, Shuyuan and Zhang, Yongfeng},
  journal = {arXiv preprint arXiv:2304.04370},
  year    = {2023},
}
tag: K-wise Comparison
| ICML23
| UC Berkeley
paper link: here
citation:
@misc{zhu2023principled,
  title         = {Principled Reinforcement Learning with Human Feedback from Pairwise or {$K$}-wise Comparisons},
  author        = {Banghua Zhu and Jiantao Jiao and Michael I. Jordan},
  year          = {2023},
  eprint        = {2301.11270},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
tag: Self-instruct
| IFT
| Instruction Tuning
| ACL23
| University of Washington
paper link: here
code link: here
citation:
@article{wang2022self,
  title   = {{Self-Instruct}: Aligning Language Models with Self-Generated Instructions},
  author  = {Wang, Yizhong and Kordi, Yeganeh and Mishra, Swaroop and Liu, Alisa and Smith, Noah A and Khashabi, Daniel and Hajishirzi, Hannaneh},
  journal = {arXiv preprint arXiv:2212.10560},
  year    = {2022},
}
tag: Self-Prompting
| IFT
| Instruction Tuning
| NAACL24
| SJTU
paper link: here
code link: here
citation:
@misc{li2023selfprompting,
  title         = {Self-Prompting Large Language Models for Zero-Shot Open-Domain {QA}},
  author        = {Junlong Li and Zhuosheng Zhang and Hai Zhao},
  year          = {2023},
  eprint        = {2212.08635},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
tag: RLAIF
| Constitutional AI
| Anthropic
paper link: here
code link: here
citation:
@article{bai2022constitutional,
  title   = {Constitutional {AI}: Harmlessness from {AI} Feedback},
  author  = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
  journal = {arXiv preprint arXiv:2212.08073},
  year    = {2022},
}
tag: InstructGPT
| IFT
| Instruction Tuning
| NIPS22
| OpenAI
paper link: here
citation:
@article{ouyang2022training,
  author  = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
  title   = {Training language models to follow instructions with human feedback},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {27730--27744},
  year    = {2022},
}
tag: ILQL
| ICLR23
| UC Berkeley
paper link: here
code link: here
homepage link: here
citation:
@misc{snell2023offlinerlnaturallanguage,
  title         = {Offline {RL} for Natural Language Generation with Implicit Language {Q} Learning},
  author        = {Charlie Snell and Ilya Kostrikov and Yi Su and Mengjiao Yang and Sergey Levine},
  year          = {2023},
  eprint        = {2206.11871},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2206.11871},
}
tag: FLAN-T5
| FLAN-PaLM
| IFT
| Instruction Tuning
| JMLR24
| Google
paper link: here
code link: here
citation:
@misc{chung2022scaling,
  author        = {Hyung Won Chung and Le Hou and Shayne Longpre and Barret Zoph and Yi Tay and William Fedus and Yunxuan Li and Xuezhi Wang and Mostafa Dehghani and Siddhartha Brahma and Albert Webson and Shixiang Shane Gu and Zhuyun Dai and Mirac Suzgun and Xinyun Chen and Aakanksha Chowdhery and Alex Castro-Ros and Marie Pellat and Kevin Robinson and Dasha Valter and Sharan Narang and Gaurav Mishra and Adams Yu and Vincent Zhao and Yanping Huang and Andrew Dai and Hongkun Yu and Slav Petrov and Ed H. Chi and Jeff Dean and Jacob Devlin and Adam Roberts and Denny Zhou and Quoc V. Le and Jason Wei},
  title         = {Scaling Instruction-Finetuned Language Models},
  year          = {2022},
  eprint        = {2210.11416},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
tag: FLAN
| IFT
| Instruction Tuning
| ICLR22
| Google
paper link: here
code link: here
citation:
@article{wei2021finetuned,
  author  = {Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
  title   = {Finetuned language models are zero-shot learners},
  journal = {arXiv preprint arXiv:2109.01652},
  year    = {2021},
}
tag: RLHF
| PPO
| OpenAI
paper link: here
blog link: here
code link: here
citation:
@article{ziegler2019fine,
  author  = {Ziegler, Daniel M and Stiennon, Nisan and Wu, Jeffrey and Brown, Tom B and Radford, Alec and Amodei, Dario and Christiano, Paul and Irving, Geoffrey},
  title   = {Fine-tuning language models from human preferences},
  journal = {arXiv preprint arXiv:1909.08593},
  year    = {2019},
}
tag: RLHF
| Online Alignment
| Offline Alignment
| Google DeepMind
paper link: here
citation:
@misc{tang2024understandingperformancegaponline,
  author        = {Yunhao Tang and Daniel Zhaohan Guo and Zeyu Zheng and Daniele Calandriello and Yuan Cao and Eugene Tarassov and Rémi Munos and Bernardo Ávila Pires and Michal Valko and Yong Cheng and Will Dabney},
  title         = {Understanding the performance gap between online and offline alignment algorithms},
  year          = {2024},
  eprint        = {2405.08448},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2405.08448},
}