Here are some resources about General Pretraining Recipes for LLM Training
tag: MeCo
| Metadata Conditioning then Cooldown
| Princeton University
paper link: here
code link: here
citation:
@misc{gao2025metadataconditioningaccelerateslanguage,
title={Metadata Conditioning Accelerates Language Model Pre-training},
author={Tianyu Gao and Alexander Wettig and Luxi He and Yihe Dong and Sadhika Malladi and Danqi Chen},
year={2025},
eprint={2501.01956},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01956},
}
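example sketch: a minimal, hedged illustration of the MeCo recipe as described in the paper: condition on each document's metadata (e.g., its source URL) by prepending it to the text for most of pretraining, then drop the metadata during the final cooldown so the model also works on plain text at inference. The formatting, the helper name, and the 10% cooldown fraction are assumptions, not the authors' implementation.
def format_example(doc: dict, step: int, total_steps: int, cooldown_frac: float = 0.1) -> str:
    """Prepend metadata during the conditioning phase; drop it during cooldown."""
    in_cooldown = step >= (1.0 - cooldown_frac) * total_steps
    if in_cooldown:
        return doc["text"]                           # cooldown: plain text only
    return f"{doc['url']}\n\n{doc['text']}"          # conditioning: metadata + text

# usage: build the training string for the current optimization step
example = {"url": "https://en.wikipedia.org/wiki/Language_model", "text": "A language model is ..."}
print(format_example(example, step=100, total_steps=1000))   # metadata prepended
print(format_example(example, step=950, total_steps=1000))   # cooldown, no metadata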
tag: Stability
| NVIDIA
paper link: here
citation:
@misc{rybakov2024methodsimprovingllmtraining,
title={Methods of improving LLM training stability},
author={Oleg Rybakov and Mike Chrzanowski and Peter Dykas and Jinze Xue and Ben Lanir},
year={2024},
eprint={2410.16682},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.16682},
}
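example sketch: the paper compares several training-stability interventions; rather than guess at its exact recipe, the sketch below shows QK normalization (normalizing queries and keys before the attention dot product), a commonly used fix for attention-logit growth in this literature. Treat it as a related illustration, not the paper's specific method.
import torch
import torch.nn as nn

class QKNormAttention(nn.Module):
    """Self-attention with LayerNorm on queries and keys (QK-norm).
    Causal masking is omitted for brevity; illustrative sketch only."""
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.out = nn.Linear(d_model, d_model, bias=False)
        self.q_norm = nn.LayerNorm(self.d_head)
        self.k_norm = nn.LayerNorm(self.d_head)

    def forward(self, x):
        B, T, _ = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        q = q.view(B, T, self.n_heads, self.d_head)
        k = k.view(B, T, self.n_heads, self.d_head)
        v = v.view(B, T, self.n_heads, self.d_head)
        q, k = self.q_norm(q), self.k_norm(k)        # normalize before the dot product
        attn = torch.einsum("bqhd,bkhd->bhqk", q, k) / self.d_head ** 0.5
        attn = attn.softmax(dim=-1)
        y = torch.einsum("bhqk,bkhd->bqhd", attn, v).reshape(B, T, -1)
        return self.out(y)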
tag: DCLM
| DataComp-LM
| Data Curation
| Apple
paper link: here
code link: here
citation:
@misc{li2024datacomplmsearchgenerationtraining,
title={DataComp-LM: In search of the next generation of training sets for language models},
author={Jeffrey Li and Alex Fang and Georgios Smyrnis and Maor Ivgi and Matt Jordan and Samir Gadre and Hritik Bansal and Etash Guha and Sedrick Keh and Kushal Arora and Saurabh Garg and Rui Xin and Niklas Muennighoff and Reinhard Heckel and Jean Mercat and Mayee Chen and Suchin Gururangan and Mitchell Wortsman and Alon Albalak and Yonatan Bitton and Marianna Nezhurina and Amro Abbas and Cheng-Yu Hsieh and Dhruba Ghosh and Josh Gardner and Maciej Kilian and Hanlin Zhang and Rulin Shao and Sarah Pratt and Sunny Sanyal and Gabriel Ilharco and Giannis Daras and Kalyani Marathe and Aaron Gokaslan and Jieyu Zhang and Khyathi Chandu and Thao Nguyen and Igor Vasiljevic and Sham Kakade and Shuran Song and Sujay Sanghavi and Fartash Faghri and Sewoong Oh and Luke Zettlemoyer and Kyle Lo and Alaaeldin El-Nouby and Hadi Pouransari and Alexander Toshev and Stephanie Wang and Dirk Groeneveld and Luca Soldaini and Pang Wei Koh and Jenia Jitsev and Thomas Kollar and Alexandros G. Dimakis and Yair Carmon and Achal Dave and Ludwig Schmidt and Vaishaal Shankar},
year={2024},
eprint={2406.11794},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2406.11794},
}
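example sketch: DCLM's best-performing baseline filters pretraining documents with a fastText quality classifier. The sketch below shows that style of classifier-based filtering; the model path, label names, and threshold are assumptions, and DCLM's actual classifier, training data, and cutoff differ.
import fasttext

def keep_document(model, text: str, threshold: float = 0.5) -> bool:
    # fastText predict() does not accept newlines, so flatten the document first
    labels, probs = model.predict(text.replace("\n", " "), k=1)
    return labels[0] == "__label__hq" and probs[0] >= threshold   # assumed label name

model = fasttext.load_model("quality_classifier.bin")            # hypothetical path
docs = ["Theorem: every finite group of prime order is cyclic.", "click here to win a prize!!!"]
filtered = [d for d in docs if keep_document(model, d)]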
tag: Domain Upsampling
| Databricks Mosaic Research
paper link: here
citation:
@misc{blakeney2024doesdatasparkjoy,
title={Does your data spark joy? Performance gains from domain upsampling at the end of training},
author={Cody Blakeney and Mansheej Paul and Brett W. Larsen and Sean Owen and Jonathan Frankle},
year={2024},
eprint={2406.03476},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2406.03476},
}
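example sketch: the paper upsamples domain-specific data (e.g., math and code) during the last portion of training. Below is a hedged illustration of switching the sampling mixture for the final fraction of tokens; the domains, weights, and 20% boundary are made-up values, not the paper's recipe.
import random

BASE_MIX      = {"web": 0.80, "code": 0.10, "math": 0.05, "papers": 0.05}
UPSAMPLED_MIX = {"web": 0.40, "code": 0.30, "math": 0.20, "papers": 0.10}

def sample_domain(tokens_seen: int, total_tokens: int, upsample_frac: float = 0.2) -> str:
    """Use the base mixture for most of training, then the upsampled one at the end."""
    mix = UPSAMPLED_MIX if tokens_seen >= (1 - upsample_frac) * total_tokens else BASE_MIX
    domains, weights = zip(*mix.items())
    return random.choices(domains, weights=weights, k=1)[0]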
tag: LLM Inheritune
| NYU
paper link: here
code link: here
citation:
@misc{sanyal2024pretrainingsmallbaselms,
title={Pre-training Small Base LMs with Fewer Tokens},
author={Sunny Sanyal and Sujay Sanghavi and Alexandros G. Dimakis},
year={2024},
eprint={2404.08634},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.08634},
}
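example sketch: the core idea is to initialize a small base LM with the first k transformer blocks of a larger reference model and then continue pretraining it on a small token budget. A hedged illustration with GPT-2 (not the models or k used in the paper):
from transformers import GPT2LMHeadModel

reference = GPT2LMHeadModel.from_pretrained("gpt2-large")   # 36 transformer blocks
k = 12
reference.transformer.h = reference.transformer.h[:k]       # inherit the first k blocks
reference.config.n_layer = k                                # keep the config consistent
small_model = reference                                     # continue pretraining on a small data subset
print(f"{sum(p.numel() for p in small_model.parameters()) / 1e6:.0f}M parameters")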
tag: MiniCPM
| COLM24
| OpenBMB
| Tsinghua University
paper link: here
code link: here
modelhub link: here
citation:
@misc{hu2024minicpmunveilingpotentialsmall,
title={MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies},
author={Shengding Hu and Yuge Tu and Xu Han and Chaoqun He and Ganqu Cui and Xiang Long and Zhi Zheng and Yewei Fang and Yuxiang Huang and Weilin Zhao and Xinrong Zhang and Zheng Leng Thai and Kaihuo Zhang and Chongyi Wang and Yuan Yao and Chenyang Zhao and Jie Zhou and Jie Cai and Zhongwu Zhai and Ning Ding and Chao Jia and Guoyang Zeng and Dahai Li and Zhiyuan Liu and Maosong Sun},
year={2024},
eprint={2404.06395},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.06395},
}
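example sketch: one of MiniCPM's widely adopted ingredients is the Warmup-Stable-Decay (WSD) learning-rate schedule: linear warmup, a long constant phase, then a short decay. The linear decay shape and 10% decay fraction below are assumptions; the paper explores other decay forms and lengths.
def wsd_lr(step, total_steps, peak_lr, warmup_steps, decay_frac=0.1, min_lr=0.0):
    """Warmup-Stable-Decay schedule: returns the learning rate at a given step."""
    decay_start = int(total_steps * (1.0 - decay_frac))
    if step < warmup_steps:                              # warmup: ramp up linearly
        return peak_lr * step / max(1, warmup_steps)
    if step < decay_start:                               # stable: hold the peak LR
        return peak_lr
    t = (step - decay_start) / max(1, total_steps - decay_start)
    return peak_lr + (min_lr - peak_lr) * t              # decay: anneal to the floor

# usage: lrs = [wsd_lr(s, 10_000, 3e-4, 500) for s in range(10_000)]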
tag: Long-Context Data Engineering
| Length Upsampling
| Continual Pretraining
| ICML24
paper link: here
code link: here
citation:
@misc{fu2024data,
title={Data Engineering for Scaling Language Models to 128K Context},
author={Yao Fu and Rameswar Panda and Xinyao Niu and Xiang Yue and Hannaneh Hajishirzi and Yoon Kim and Hao Peng},
year={2024},
eprint={2402.10171},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2402.10171},
}
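example sketch: the paper continually pretrains on a modest token budget in which long documents are upsampled per source, so long sequences are well represented while the overall domain mixture stays fixed. The proportional-to-length weighting and the cap below are illustrative assumptions, not the exact scheme.
import random

def length_upsample(docs, cap_tokens=32_768, k=1000):
    """docs: list of dicts with 'tokens' (length) and 'text'; returns a resampled list.
    Apply within each domain separately to keep the domain mixture unchanged."""
    weights = [min(d["tokens"], cap_tokens) for d in docs]   # longer documents are drawn more often
    return random.choices(docs, weights=weights, k=k)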
tag: Loss Spike
| Embed LN
| Scaled Embed
paper link: here
citation:
@misc{takase2024spike,
title={Spike No More: Stabilizing the Pre-training of Large Language Models},
author={Sho Takase and Shun Kiyono and Sosuke Kobayashi and Jun Suzuki},
year={2024},
eprint={2312.16903},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2312.16903},
}
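example sketch: the two fixes named in the tag can be written directly: "Scaled Embed" multiplies token embeddings by sqrt(d_model) and "Embed LN" applies LayerNorm to the embedding output; both keep embedding norms from being too small, which the paper ties to gradient spikes. A minimal PyTorch illustration (not the authors' code):
import math
import torch.nn as nn

class ScaledEmbedding(nn.Module):
    """Scaled Embed: multiply embeddings by sqrt(d_model)."""
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.scale = math.sqrt(d_model)
    def forward(self, ids):
        return self.embed(ids) * self.scale

class LNEmbedding(nn.Module):
    """Embed LN: apply LayerNorm right after the embedding lookup."""
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.ln = nn.LayerNorm(d_model)
    def forward(self, ids):
        return self.ln(self.embed(ids))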
tag: Data Mixing
| UCSB
| NeurIPS23 Ro-FoMo Workshop
paper link: here
citation:
@misc{albalak2023efficient,
title={Efficient Online Data Mixing For Language Model Pre-Training},
author={Alon Albalak and Liangming Pan and Colin Raffel and William Yang Wang},
year={2023},
eprint={2312.02406},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2312.02406},
}
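example sketch: the paper formulates data mixing as a multi-armed bandit over domains, updating the sampling distribution online from the observed training loss. Below is an EXP3-style illustration of that idea; the reward definition, hyperparameters, and exploration scheme are assumptions rather than the authors' exact algorithm.
import math, random

class OnlineDataMixer:
    """Exponential-weights (EXP3-style) bandit over pretraining domains."""
    def __init__(self, domains, lr=0.1, explore=0.05):
        self.domains = list(domains)
        self.lr, self.explore = lr, explore
        self.log_w = {d: 0.0 for d in self.domains}

    def probs(self):
        m = max(self.log_w.values())
        w = {d: math.exp(v - m) for d, v in self.log_w.items()}
        z, n = sum(w.values()), len(self.domains)
        return {d: (1 - self.explore) * w[d] / z + self.explore / n for d in self.domains}

    def sample(self):
        p = self.probs()
        return random.choices(self.domains, weights=[p[d] for d in self.domains], k=1)[0]

    def update(self, domain, loss):
        # assumed reward: the training loss itself, so high-loss domains are sampled more
        p = self.probs()
        self.log_w[domain] += self.lr * loss / (p[domain] * len(self.domains))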
tag: OpenChat
| Mixed-Quality Data
| ICLR24
| Tsinghua University
paper link: here
code link: here
citation:
@misc{wang2024openchatadvancingopensourcelanguage,
title={OpenChat: Advancing Open-source Language Models with Mixed-Quality Data},
author={Guan Wang and Sijie Cheng and Xianyuan Zhan and Xiangang Li and Sen Song and Yang Liu},
year={2024},
eprint={2309.11235},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2309.11235},
}
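example sketch: OpenChat's C-RLFT treats mixed-quality SFT data as class-conditioned: each example is conditioned on its coarse source class (e.g., GPT-4 vs. GPT-3.5 conversations) via the prompt, and lower-quality classes get smaller loss weights. The tag format and weight values below are illustrative assumptions, not the released templates.
SOURCE_WEIGHT = {"gpt4": 1.0, "gpt35": 0.1}                  # coarse per-class rewards (assumed values)

def build_example(source, user_msg, assistant_msg):
    tag = "GPT4" if source == "gpt4" else "GPT3.5"           # class-conditioning tag (assumed format)
    text = f"{tag} User: {user_msg}\n{tag} Assistant: {assistant_msg}"
    return {"text": text, "loss_weight": SOURCE_WEIGHT[source]}

batch = [build_example("gpt4", "Explain KV caching.", "KV caching stores per-token keys and values ..."),
         build_example("gpt35", "Explain KV caching.", "It caches keys and values during decoding ...")]
# during fine-tuning, multiply each example's token-level loss by its loss_weight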
tag: Data-Constrained
| NeurIPS23
| HuggingFace
paper link: here
code link: here
citation:
@article{muennighoff2023scaling,
title={Scaling Data-Constrained Language Models},
author={Muennighoff, Niklas and Rush, Alexander M and Barak, Boaz and Scao, Teven Le and Piktus, Aleksandra and Tazi, Nouamane and Pyysalo, Sampo and Wolf, Thomas and Raffel, Colin},
journal={arXiv preprint arXiv:2305.16264},
year={2023}
}
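example sketch: the paper's headline finding is that repeating data has rapidly diminishing returns: a few epochs over a fixed corpus are almost as good as fresh data, while many epochs add little. A hedged illustration of the "effective unique tokens" idea, using a placeholder decay constant rather than the paper's fitted value:
import math

def effective_tokens(unique_tokens, epochs, r_star=15.0):
    """Effective data D' = U * (1 + r_star * (1 - exp(-R / r_star))), with R = epochs - 1 repetitions.
    r_star is a placeholder for the fitted decay constant."""
    repeats = max(0.0, epochs - 1.0)
    return unique_tokens * (1.0 + r_star * (1.0 - math.exp(-repeats / r_star)))

print(effective_tokens(100e9, 1) / 1e9)    # 100: one epoch, every token is fresh
print(effective_tokens(100e9, 4) / 1e9)    # ~372: four epochs are nearly as good as 400B fresh tokens
print(effective_tokens(100e9, 40) / 1e9)   # ~1489: forty epochs fall far short of 4T fresh tokens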
tag: UL2R
| U-PaLM
| CPT
| Continual Pretraining
| Google
paper link: here
citation:
@misc{tay2022transcending,
title={Transcending Scaling Laws with 0.1% Extra Compute},
author={Yi Tay and Jason Wei and Hyung Won Chung and Vinh Q. Tran and David R. So and Siamak Shakeri and Xavier Garcia and Huaixiu Steven Zheng and Jinfeng Rao and Aakanksha Chowdhery and Denny Zhou and Donald Metzler and Slav Petrov and Neil Houlsby and Quoc V. Le and Mostafa Dehghani},
year={2022},
eprint={2210.11399},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2210.11399},
}
tag: Chinchilla
| Scaling Law
| Google DeepMind
paper link: here
citation:
@article{hoffmann2022training,
title={Training compute-optimal large language models},
author={Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and Casas, Diego de Las and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and others},
journal={arXiv preprint arXiv:2203.15556},
year={2022}
}
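example sketch: the commonly quoted summary of Chinchilla is that, with training compute C ≈ 6·N·D FLOPs, the compute-optimal model uses roughly 20 training tokens per parameter, so both N and D grow as sqrt(C). A back-of-the-envelope illustration of that rule of thumb (not the paper's full parametric fits):
def chinchilla_optimal(compute_flops, tokens_per_param=20.0):
    """Solve C = 6*N*D with D = tokens_per_param * N for the compute-optimal N and D."""
    n_params = (compute_flops / (6.0 * tokens_per_param)) ** 0.5
    n_tokens = tokens_per_param * n_params
    return n_params, n_tokens

n, d = chinchilla_optimal(5.76e23)                   # roughly Chinchilla's training budget
print(f"{n/1e9:.0f}B params, {d/1e12:.1f}T tokens")  # ~69B params, ~1.4T tokens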
tag: MUP
| μP
| μTransfer
| Microsoft
| OpenAI
paper link: here
code link: here
citation:
@misc{yang2022tensorprogramsvtuning,
title={Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer},
author={Greg Yang and Edward J. Hu and Igor Babuschkin and Szymon Sidor and Xiaodong Liu and David Farhi and Nick Ryder and Jakub Pachocki and Weizhu Chen and Jianfeng Gao},
year={2022},
eprint={2203.03466},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2203.03466},
}
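example sketch: one ingredient of μP for Adam is that the learning rate of matrix-like hidden weights shrinks as base_width / width, which is what lets hyperparameters tuned at a small base width transfer to wider models. The fragment below illustrates only that ingredient with a crude name-based filter; full μP also changes initializations and output-logit scaling (see the authors' released mup package), so treat this as an assumption-laden sketch rather than the complete parameterization.
import torch
import torch.nn as nn

def mup_adam_param_groups(model: nn.Module, lr: float, base_width: int, width: int):
    """Scale the Adam LR of hidden matrix-like weights by base_width / width."""
    mult = base_width / width
    hidden, other = [], []
    for name, p in model.named_parameters():
        # crude heuristic: 2-D weights that are not embeddings or the LM head count as "hidden"
        if p.ndim == 2 and "embed" not in name and "lm_head" not in name:
            hidden.append(p)
        else:
            other.append(p)
    return [{"params": hidden, "lr": lr * mult},   # hidden weights: LR shrinks with width
            {"params": other, "lr": lr}]           # embeddings, head, biases, norms: LR unchanged

# usage: opt = torch.optim.Adam(mup_adam_param_groups(model, lr=3e-4, base_width=256, width=1024))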
tag: Large-Batch Training
| OpenAI
paper link: here
citation:
@misc{mccandlish2018empirical,
title={An Empirical Model of Large-Batch Training},
author={Sam McCandlish and Jared Kaplan and Dario Amodei and OpenAI Dota Team},
year={2018},
eprint={1812.06162},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1812.06162},
}
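example sketch: the paper's practical takeaway is the gradient noise scale B_simple = tr(Σ) / |G|², which predicts the critical batch size. Its appendix gives unbiased estimators for |G|² and tr(Σ) from squared gradient norms measured at two batch sizes; the sketch below implements those estimators with made-up measurement values.
def simple_noise_scale(b_small, g2_small, b_big, g2_big):
    """g2_* are squared L2 norms of gradient estimates computed at batch sizes b_small < b_big."""
    g2_true = (b_big * g2_big - b_small * g2_small) / (b_big - b_small)   # unbiased estimate of |G|^2
    trace_sigma = (g2_small - g2_big) / (1.0 / b_small - 1.0 / b_big)     # unbiased estimate of tr(Σ)
    return trace_sigma / g2_true                                          # B_simple ≈ critical batch size

print(simple_noise_scale(b_small=64, g2_small=2.5, b_big=1024, g2_big=0.4))   # ~550 with these toy numbers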