Here are some resources about offloading strategies for LLM training.
Accelerating the Training of Large Language Models using Efficient Activation Rematerialization and Optimal Hybrid Parallelism
tag: Pipeline-Parallel-Aware Offloading
| ATC24
| Kuaishou Technology
paper link: here
slides link: here
citation:
@inproceedings {yuan2024pipelineparallelawareoffloading,
author = {Tailing Yuan and Yuliang Liu and Xucheng Ye and Shenglong Zhang and Jianchao Tan and Bin Chen and Chengru Song and Di Zhang},
title = {Accelerating the Training of Large Language Models using Efficient Activation Rematerialization and Optimal Hybrid Parallelism},
booktitle = {2024 USENIX Annual Technical Conference (USENIX ATC 24)},
year = {2024},
isbn = {978-1-939133-41-0},
address = {Santa Clara, CA},
pages = {545--561},
url = {https://www.usenix.org/conference/atc24/presentation/yuan},
publisher = {USENIX Association},
month = jul
}
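The paper's efficient activation rematerialization combines two standard building blocks, recomputation and host-memory offloading of activations, and schedules them with awareness of the pipeline-parallel stage. The minimal PyTorch sketch below shows only the two building blocks, not the paper's scheduler; the split of layers into the two groups and the plain .to() copies are illustrative assumptions.

example (PyTorch sketch):
import torch
from torch.utils.checkpoint import checkpoint
from torch.autograd.graph import saved_tensors_hooks

def pack_to_cpu(t):
    # forward: park the saved activation in host memory
    return t.to("cpu")

def unpack_to_gpu(t):
    # backward: bring it back to the GPU on demand
    return t.to("cuda", non_blocking=True)

def forward_block(recompute_layers, offload_layers, x):
    # rematerialize: these layers drop activations and recompute them in backward
    for layer in recompute_layers:
        x = checkpoint(layer, x, use_reentrant=False)
    # offload: these layers keep their activations, but stored on the CPU
    with saved_tensors_hooks(pack_to_cpu, unpack_to_gpu):
        for layer in offload_layers:
            x = layer(x)
    return x

layers = [torch.nn.Linear(1024, 1024).cuda() for _ in range(4)]
y = forward_block(layers[:2], layers[2:], torch.randn(8, 1024, device="cuda", requires_grad=True))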
Efficient and Economic Large Language Model Inference with Attention Offloading
tag: Attention Offloading
| Tsinghua University
paper link: here
citation:
@misc{chen2024efficienteconomiclargelanguage,
title={Efficient and Economic Large Language Model Inference with Attention Offloading},
author={Shaoyuan Chen and Yutong Lin and Mingxing Zhang and Yongwei Wu},
year={2024},
eprint={2405.01814},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.01814},
}
Adding NVMe SSDs to Enable and Accelerate 100B Model Fine-tuning on a Single GPU
tag: Lohan
| ICDE25
| Zhejiang University
| HKU
paper link: here
code link: here
citation:
@misc{liao2024addingnvmessdsenable,
title={Adding NVMe SSDs to Enable and Accelerate 100B Model Fine-tuning on a Single GPU},
author={Changyue Liao and Mo Sun and Zihan Yang and Kaiqi Chen and Binhang Yuan and Fei Wu and Zeke Wang},
year={2024},
eprint={2403.06504},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2403.06504},
}
NVIDIA Transformer-Engine CPU Offloading
tag: TE-Offloading
| Transformer-Engine
| NVIDIA
code link: here
docs link: here
citation:
@misc{transformerenginecpuoffload2024nvidia,
author = {NVIDIA},
title = {NVIDIA Transformer-Engine CPU Offloading},
year = {2024},
howpublished = {\url{https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/cpu_offload.py}},
}
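Transformer-Engine's cpu_offload.py provides a context manager that asynchronously offloads activations (and optionally weights) of selected layers to host memory and prefetches them before backward. The sketch below mirrors how Megatron-LM drives it; the positional arguments (enabled, number of offloaded layers, total layers, offload activations, offload weights) and the returned pair (context, synchronization function) are stated from memory and should be checked against the module, so treat them as assumptions rather than the documented API.

example (PyTorch sketch):
import torch
import torch.nn as nn
from transformer_engine.pytorch import get_cpu_offload_context

num_layers = 4
layers = nn.ModuleList([nn.Linear(1024, 1024) for _ in range(num_layers)]).cuda()

# assumed signature: (enabled, num_offloaded_layers, total_layers,
#                     offload_activations, offload_weights)
offload_ctx, sync_fn = get_cpu_offload_context(True, 2, num_layers, True, False)

x = torch.randn(8, 1024, device="cuda", requires_grad=True)
for layer in layers:
    with offload_ctx:      # activations saved inside this context are staged to the CPU
        x = layer(x)
    x = sync_fn(x)         # commit the async offload / trigger prefetch between layers
x.sum().backward()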
STR: Hybrid Tensor Re-Generation to Break Memory Wall for DNN Training
tag: STR
| TPDS23
| Tsinghua University
paper link: here
citation:
@article{zong2023str,
author={Zong, Zan and Lin, Li and Lin, Leilei and Wen, Lijie and Sun, Yu},
journal={IEEE Transactions on Parallel and Distributed Systems},
title={STR: Hybrid Tensor Re-Generation to Break Memory Wall for DNN Training},
year={2023},
volume={34},
number={8},
pages={2403-2418},
keywords={Tensors;Graphics processing units;Training;Optimization;Costs;Bandwidth;Memory management;DNN training;offload memory;recomputation;rematerialization;swap},
doi={10.1109/TPDS.2023.3266110}
}
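STR regenerates evicted tensors through a hybrid of swapping and recomputation, chosen per tensor. The essential trade-off is a cost comparison: swap a tensor back over PCIe when the transfer is cheaper (or better overlapped) than recomputing it, otherwise recompute. The toy decision rule below uses an assumed linear transfer-time model and made-up numbers; it is an illustration of the trade-off, not the paper's cost model.

example (Python sketch):
def choose_regeneration(tensor_bytes: int,
                        recompute_time_s: float,
                        pcie_bw_bytes_per_s: float = 16e9) -> str:
    # pick how to re-generate an evicted tensor for the backward pass
    swap_time_s = tensor_bytes / pcie_bw_bytes_per_s   # host-to-device prefetch cost
    return "swap" if swap_time_s < recompute_time_s else "recompute"

# a 256 MiB activation that takes 5 ms to recompute: ~16.8 ms to swap back -> recompute
print(choose_regeneration(256 * 2**20, 5e-3))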
MPress: Democratizing Billion-Scale Model Training on Multi-GPU Servers via Memory-Saving Inter-Operator Parallelism
tag: MPress
| HPCA23
| USTC
paper link: here
citation:
@inproceedings{zhou2023mpress,
author={Zhou, Quan and Wang, Haiquan and Yu, Xiaoyan and Li, Cheng and Bai, Youhui and Yan, Feng and Xu, Yinlong},
booktitle={2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
title={MPress: Democratizing Billion-Scale Model Training on Multi-GPU Servers via Memory-Saving Inter-Operator Parallelism},
year={2023},
volume={},
number={},
pages={556-569},
keywords={Training;Performance evaluation;Tensors;Costs;Computational modeling;Graphics processing units;Parallel processing;Inter-Operator Parallelism;DNN Training;Swap;Recomputation},
doi={10.1109/HPCA56546.2023.10071077}
}
STRONGHOLD: Fast and Affordable Billion-Scale Deep Learning Model Training
tag: STRONGHOLD
| SC22
| Alibaba Group
paper link: here
citation:
@inproceedings{sun2022stronghold,
author={Sun, Xiaoyang and Wang, Wei and Qiu, Shenghao and Yang, Renyu and Huang, Songfang and Xu, Jie and Wang, Zheng},
booktitle={SC22: International Conference for High Performance Computing, Networking, Storage and Analysis},
title={STRONGHOLD: Fast and Affordable Billion-Scale Deep Learning Model Training},
year={2022},
volume={},
number={},
pages={1-17},
keywords={Training;Deep learning;Codes;Computational modeling;Memory management;Graphics processing units;Random access memory;Deep learning;Distributed training;DNNs training acceleration},
doi={10.1109/SC41404.2022.00076}
}
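STRONGHOLD keeps only a working window of layers in GPU memory and streams the remaining layers (and their state) from host memory, overlapping the movement with compute. The sketch below shows just the windowing idea in a synchronous, forward-only form; the window size is arbitrary, there is no compute/transfer overlap, and a real system also keeps gradients and optimizer state on the host.

example (PyTorch sketch):
import torch
import torch.nn as nn

def windowed_forward(layers: nn.ModuleList, x: torch.Tensor, window: int = 2):
    # run a deep stack while holding at most `window` layers on the GPU
    for i, layer in enumerate(layers):
        layer.to("cuda")                      # fetch the layer we need next
        x = layer(x)
        if i >= window - 1:                   # evict the oldest resident layer
            layers[i - window + 1].to("cpu")
    return x

layers = nn.ModuleList([nn.Linear(1024, 1024) for _ in range(8)])
out = windowed_forward(layers, torch.randn(4, 1024, device="cuda"))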
DELTA: Dynamically Optimizing GPU Memory beyond Tensor Recomputation
tag: DELTA
| Tensor Swapping
| Tensor Recomputation
| ACM TACO24
paper link: here
citation:
@misc{tang2022deltadynamicallyoptimizinggpu,
title={DELTA: Dynamically Optimizing GPU Memory beyond Tensor Recomputation},
author={Yu Tang and Chenyu Wang and Yufan Zhang and Yuliang Liu and Xingcheng Zhang and Linbo Qiao and Zhiquan Lai and Dongsheng Li},
year={2022},
eprint={2203.15980},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2203.15980},
}
Parallel Training of Pre-Trained Models via Chunk-Based Dynamic Memory Management
tag: PatrickStar
| TPDS22
| Tencent
| NUS
paper link: here
code link: here
citation:
@article{Fang_2023,
title={Parallel Training of Pre-Trained Models via Chunk-Based Dynamic Memory Management},
volume={34},
ISSN={2161-9883},
url={http://dx.doi.org/10.1109/TPDS.2022.3219819},
DOI={10.1109/tpds.2022.3219819},
number={1},
journal={IEEE Transactions on Parallel and Distributed Systems},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Fang, Jiarui and Zhu, Zilin and Li, Shenggui and Su, Hui and Yu, Yang and Zhou, Jie and You, Yang},
year={2023},
month=jan,
pages={304–315}
}
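PatrickStar manages model data in fixed-size chunks: parameters, gradients, and optimizer states are packed into chunks, and whole chunks move between CPU and GPU according to the runtime access pattern, which keeps transfers large and avoids fragmentation. The sketch below only shows packing parameters into pinned CPU chunks and fetching one chunk to the GPU; the chunk size is arbitrary, each tensor is assumed to fit in one chunk, and there is no caching or eviction policy.

example (PyTorch sketch):
import torch

CHUNK_NUMEL = 1 << 20  # 1M fp32 elements per chunk (illustrative)

def pack_into_chunks(named_params, chunk_numel=CHUNK_NUMEL):
    # copy parameters into fixed-size pinned CPU chunks and remember, for each
    # parameter, which chunk it lives in and at which offset
    chunks, layout = [], {}
    buf, used = torch.empty(chunk_numel, pin_memory=True), 0
    for name, p in named_params:
        flat = p.detach().reshape(-1).cpu()           # assumes numel <= chunk_numel
        if used + flat.numel() > chunk_numel:         # current chunk full: start a new one
            chunks.append(buf)
            buf, used = torch.empty(chunk_numel, pin_memory=True), 0
        buf[used:used + flat.numel()].copy_(flat)
        layout[name] = (len(chunks), used, flat.numel())
        used += flat.numel()
    chunks.append(buf)
    return chunks, layout

def fetch_chunk(chunks, idx):
    # upload one whole chunk; a real system caches and evicts these on demand
    return chunks[idx].to("cuda", non_blocking=True)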
ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning
tag: ZeRO-Infinity
| NVMe SSD
| Infinity Offload Engine
| Microsoft
paper link: here
blog link: here
citation:
@misc{rajbhandari2021zeroinfinity,
title={ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning},
author={Samyam Rajbhandari and Olatunji Ruwase and Jeff Rasley and Shaden Smith and Yuxiong He},
year={2021},
eprint={2104.07857},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
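ZeRO-Infinity builds on ZeRO stage 3 and adds an offload engine so that partitioned parameters and optimizer states can spill first to CPU and then to NVMe. In DeepSpeed this is configured rather than coded; the snippet below is a representative config expressed as a Python dict, with placeholder paths and without the tuning knobs (bucket sizes, aio settings) a real run would need.

example (Python sketch):
# illustrative DeepSpeed config for NVMe offloading; values are placeholders
zero_infinity_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "nvme",
            "nvme_path": "/local_nvme",   # placeholder NVMe mount point
            "pin_memory": True,
        },
        "offload_optimizer": {
            "device": "nvme",
            "nvme_path": "/local_nvme",
        },
    },
    "bf16": {"enabled": True},
}
# typically passed to deepspeed.initialize(model=..., config=zero_infinity_config)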
ZeRO-Offload: Democratizing Billion-Scale Model Training
tag: ZeRO-Offload
| Microsoft
paper link: here
blog link: here
citation:
@misc{ren2021zerooffload,
title={ZeRO-Offload: Democratizing Billion-Scale Model Training},
author={Jie Ren and Samyam Rajbhandari and Reza Yazdani Aminabadi and Olatunji Ruwase and Shuangyan Yang and Minjia Zhang and Dong Li and Yuxiong He},
year={2021},
eprint={2101.06840},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
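ZeRO-Offload keeps forward and backward on the GPU but holds optimizer states in host memory and runs the optimizer step with a CPU Adam, streaming gradients over PCIe. In DeepSpeed it is usually enabled as ZeRO stage 2 plus a CPU offload target; the dict below is a minimal illustrative config, not a tuned one.

example (Python sketch):
# illustrative DeepSpeed config for CPU optimizer offloading; values are placeholders
zero_offload_config = {
    "train_micro_batch_size_per_gpu": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
    },
    "fp16": {"enabled": True},
}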
SwapAdvisor: Pushing Deep Learning Beyond the GPU Memory Limit via Smart Swapping
tag: SwapAdvisor
| ASPLOS20
| New York University
paper link: here
citation:
@inproceedings{swapadvisor2020huang,
author = {Huang, Chien-Chin and Jin, Gu and Li, Jinyang},
title = {SwapAdvisor: Pushing Deep Learning Beyond the GPU Memory Limit via Smart Swapping},
year = {2020},
isbn = {9781450371025},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3373376.3378530},
doi = {10.1145/3373376.3378530},
booktitle = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {1341–1355},
numpages = {15},
keywords = {scheduling and resource management, gpu, deep learning systems},
location = {Lausanne, Switzerland},
series = {ASPLOS '20}
}
Capuchin: Tensor-based GPU Memory Management for Deep Learning
tag: Capuchin
| ASPLOS20
| MSRA
paper link: here
citation:
@inproceedings{peng2020capuchin,
author = {Peng, Xuan and Shi, Xuanhua and Dai, Hulin and Jin, Hai and Ma, Weiliang and Xiong, Qian and Yang, Fan and Qian, Xuehai},
title = {Capuchin: Tensor-based GPU Memory Management for Deep Learning},
year = {2020},
isbn = {9781450371025},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3373376.3378505},
doi = {10.1145/3373376.3378505},
pages = {891–905},
numpages = {15},
keywords = {deep learning training, gpu memory management, tensor access},
location = {Lausanne, Switzerland},
series = {ASPLOS '20}
}
Training Large Neural Networks with Constant Memory using a New Execution Algorithm
tag: L2L
| EPS
| Microsoft
paper link: here
citation:
@misc{pudipeddi2020training,
title={Training Large Neural Networks with Constant Memory using a New Execution Algorithm},
author={Bharadwaj Pudipeddi and Maral Mesmakhosroshahi and Jinwen Xi and Sujeeth Bharadwaj},
year={2020},
eprint={2002.05645},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
SuperNeurons: Dynamic GPU Memory Management for Training Deep Neural Networks
tag: SuperNeurons
| PPoPP18
| MIT
paper link: here
citation:
@inproceedings{wang2018superneurons,
author = {Wang, Linnan and Ye, Jinmian and Zhao, Yiyang and Wu, Wei and Li, Ang and Song, Shuaiwen Leon and Xu, Zenglin and Kraska, Tim},
title = {Superneurons: dynamic GPU memory management for training deep neural networks},
year = {2018},
isbn = {9781450349826},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3178487.3178491},
doi = {10.1145/3178487.3178491},
pages = {41–53},
numpages = {13},
keywords = {runtime scheduling, neural networks, GPU memory management},
location = {Vienna, Austria},
series = {PPoPP '18}
}
vDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efficient Neural Network Design
tag: vDNN
| NVIDIA
paper link: here
citation:
@misc{rhu2016vdnn,
title={vDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efficient Neural Network Design},
author={Minsoo Rhu and Natalia Gimelshein and Jason Clemons and Arslan Zulfiqar and Stephen W. Keckler},
year={2016},
eprint={1602.08124},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
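vDNN virtualizes feature maps: a layer's outputs are copied out to host memory during the forward pass and prefetched back on a dedicated copy stream just before the corresponding backward step, so the PCIe traffic overlaps with compute. The two primitives below sketch that copy/prefetch pattern with pinned host buffers; lifetime tracking and the choice of when to prefetch are simplified and would need more care in a real implementation.

example (PyTorch sketch):
import torch

copy_stream = torch.cuda.Stream()   # dedicated stream for offload/prefetch copies

def offload_to_host(t: torch.Tensor) -> torch.Tensor:
    # asynchronously copy a GPU activation into pinned host memory
    host = torch.empty(t.shape, dtype=t.dtype, pin_memory=True)
    copy_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(copy_stream):
        host.copy_(t, non_blocking=True)
        t.record_stream(copy_stream)  # keep t alive until the copy completes
    return host

def prefetch_to_gpu(host: torch.Tensor) -> torch.Tensor:
    # start bringing an offloaded activation back before backward needs it
    with torch.cuda.stream(copy_stream):
        dev = host.to("cuda", non_blocking=True)
    torch.cuda.current_stream().wait_stream(copy_stream)
    dev.record_stream(torch.cuda.current_stream())
    return dev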