Here are some resources about Data Parallelism for LLMs training. Note that modern data parallelism goes beyond traditional DDP and also involves model sharding, i.e. partitioning parameters, gradients, and optimizer states across workers as in ZeRO and FSDP.
LuWu: An End-to-End In-Network Out-of-Core Optimizer for 100B-Scale Model-in-Network Data-Parallel Training on Distributed GPUs
tag: LuWu
| DP
| Data Parallelism
| Zhejiang University
paper link: here
citation:
@misc{sun2024luwuendtoendinnetworkoutofcore,
title={LuWu: An End-to-End In-Network Out-of-Core Optimizer for 100B-Scale Model-in-Network Data-Parallel Training on Distributed GPUs},
author={Mo Sun and Zihan Yang and Changyue Liao and Yingtao Li and Fei Wu and Zeke Wang},
year={2024},
eprint={2409.00918},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2409.00918},
}
PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel
tag: FSDP
| HSDP
| VLDB23
| PyTorch
| Meta
paper link: here
blog link: here
docs link: here
tutorial link: here
notes link: here
follow-up work: here
citation:
@article{zhao2023pytorch,
title={PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel},
author={Zhao, Yanli and Gu, Andrew and Varma, Rohan and Luo, Liang and Huang, Chien-Chin and Xu, Min and Wright, Less and Shojanazeri, Hamid and Ott, Myle and Shleifer, Sam and others},
journal={arXiv preprint arXiv:2304.11277},
year={2023}
}
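A minimal usage sketch of the PyTorch FSDP API (FULL_SHARD is the ZeRO-3-like mode; HYBRID_SHARD gives the HSDP variant that shards within a node and replicates across nodes). The toy model, dimensions, and launch setup below are illustrative assumptions, not code from the paper:

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy

# Assumes launch via torchrun so RANK / WORLD_SIZE / LOCAL_RANK are set.
dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = torch.nn.Sequential(            # toy model for illustration
    torch.nn.Linear(1024, 4096),
    torch.nn.GELU(),
    torch.nn.Linear(4096, 1024),
).cuda()

# FULL_SHARD shards parameters, gradients, and optimizer states across all ranks;
# ShardingStrategy.HYBRID_SHARD would shard intra-node and replicate inter-node (HSDP).
fsdp_model = FSDP(model, sharding_strategy=ShardingStrategy.FULL_SHARD)

# Build the optimizer after wrapping so it sees the sharded parameters.
optimizer = torch.optim.AdamW(fsdp_model.parameters(), lr=1e-4)

x = torch.randn(8, 1024, device="cuda")
loss = fsdp_model(x).sum()
loss.backward()      # gradients are reduce-scattered inside FSDP
optimizer.step()
```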
MiCS: Near-linear Scaling for Training Gigantic Model on Public Cloud
tag: MiCS
| VLDB22
| Amazon
| JHU
paper link: here
citation:
@misc{zhang2022micsnearlinearscalingtraining,
title={MiCS: Near-linear Scaling for Training Gigantic Model on Public Cloud},
author={Zhen Zhang and Shuai Zheng and Yida Wang and Justin Chiu and George Karypis and Trishul Chilimbi and Mu Li and Xin Jin},
year={2022},
eprint={2205.00119},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2205.00119},
}
Gradient Compression Supercharged High-Performance Data Parallel DNN Training
tag: HiPress
| CaSync
| CompLL
| SOSP21
| USTC
paper link: here
citation:
@inproceedings{bai2021gradient,
author = {Bai, Youhui and Li, Cheng and Zhou, Quan and Yi, Jun and Gong, Ping and Yan, Feng and Chen, Ruichuan and Xu, Yinlong},
title = {Gradient Compression Supercharged High-Performance Data Parallel DNN Training},
year = {2021},
isbn = {9781450387095},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3477132.3483553},
doi = {10.1145/3477132.3483553},
pages = {359–375},
numpages = {17},
keywords = {gradient compression, DNN training},
location = {Virtual Event, Germany},
series = {SOSP '21}
}
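HiPress's CaSync/CompLL components are not part of stock PyTorch; as a loosely related illustration of where gradient compression plugs into data-parallel training, here is a hedged sketch using PyTorch DDP's built-in fp16 compression communication hook (the model and sizes are placeholders):

```python
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks

dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(4096, 4096).cuda()          # toy model
ddp_model = DDP(model, device_ids=[local_rank])

# Compress gradient buckets to fp16 before the all-reduce; HiPress studies far more
# aggressive compressors (e.g. sparsification) and how to overlap compression with
# communication, which this built-in hook does not capture.
ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)
```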
Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training
tag: Cross-Replica
| XLA
| Google
paper link: here
citation:
@misc{xu2020automatic,
title={Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training},
author={Yuanzhong Xu and HyoukJoong Lee and Dehao Chen and Hongjun Choi and Blake Hechtman and Shibo Wang},
year={2020},
eprint={2004.13336},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
ZeRO: Memory Optimizations Toward Training Trillion Parameter Models
tag: ZeRO
| ZeRO-1
| ZeRO-2
| ZeRO-3
| DeepSpeed
| SC20
| Microsoft
paper link: here
blog link: here
citation:
@inproceedings{rajbhandari2020zero,
author = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
title = {ZeRO: Memory Optimizations Toward Training Trillion Parameter Models},
year = {2020},
isbn = {9781728199986},
publisher = {IEEE Press},
articleno = {20},
numpages = {16},
location = {Atlanta, Georgia},
series = {SC '20}
}
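A minimal sketch of how the ZeRO stages are selected in practice via a DeepSpeed config (stage 1 shards optimizer states, stage 2 adds gradient sharding, stage 3 additionally shards the parameters). The model, batch size, and optimizer settings are illustrative assumptions:

```python
import torch
import deepspeed

# Assumes launch via the deepspeed launcher or torchrun.
ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-4}},
    "zero_optimization": {"stage": 2},   # 1, 2, or 3 selects the ZeRO stage
    "bf16": {"enabled": True},
}

model = torch.nn.Linear(4096, 4096)      # toy model
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)

x = torch.randn(4, 4096, device=engine.device)
loss = engine(x).sum()
engine.backward(loss)   # gradient partitioning/reduction is handled per the ZeRO stage
engine.step()
```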
Distributed Data Parallel
tag: DDP
| PyTorch
| Meta
blog link: here
docs link: here
tutorial link: here
citation:
@misc{pytorch2019ddp,
title={Distributed Data Parallel},
author={PyTorch contributors},
year={2019},
url={https://pytorch.org/docs/master/notes/ddp.html}
}
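For completeness, a minimal sketch of classic (non-sharded) DDP, where every rank holds a full model replica and gradients are all-reduced during backward; the model and data below are placeholders:

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes launch via `torchrun --nproc_per_node=<gpus> train.py`.
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(1024, 10).cuda()        # placeholder model
ddp_model = DDP(model, device_ids=[local_rank])
optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.1)

x = torch.randn(32, 1024, device="cuda")
y = torch.randint(0, 10, (32,), device="cuda")
loss = torch.nn.functional.cross_entropy(ddp_model(x), y)
loss.backward()      # gradient buckets are all-reduced across ranks here
optimizer.step()

dist.destroy_process_group()
```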