Data Parallelism for LLMs Training

Here are some resources about data parallelism for LLM training. Note that modern data parallelism goes beyond traditional DDP (a full model replica on every device) and involves model sharding as well.
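
As a back-of-the-envelope illustration of why sharding matters, below is a rough per-GPU memory estimate following the ZeRO paper's mixed-precision Adam accounting (2 bytes of fp16 parameters + 2 bytes of fp16 gradients + 12 bytes of fp32 optimizer states per parameter); the parameter count and GPU count are purely illustrative:

```python
# Per-GPU memory sketch for data parallelism with mixed-precision Adam,
# following the accounting in the ZeRO paper (2 + 2 + 12 bytes per parameter).
psi = 7.5e9     # number of model parameters (illustrative)
n_gpus = 64     # data-parallel degree (illustrative)

ddp_per_gpu   = (2 + 2 + 12) * psi                     # full replica on every GPU
zero1_per_gpu = (2 + 2) * psi + 12 * psi / n_gpus      # shard optimizer states
zero2_per_gpu = 2 * psi + (2 + 12) * psi / n_gpus      # + shard gradients
zero3_per_gpu = (2 + 2 + 12) * psi / n_gpus            # + shard parameters

for name, nbytes in [("DDP", ddp_per_gpu), ("ZeRO-1", zero1_per_gpu),
                     ("ZeRO-2", zero2_per_gpu), ("ZeRO-3", zero3_per_gpu)]:
    print(f"{name}: {nbytes / 2**30:.1f} GiB per GPU")
```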

LuWu: An End-to-End In-Network Out-of-Core Optimizer for 100B-Scale Model-in-Network Data-Parallel Training on Distributed GPUs

tag: LuWu | DP | Data Parallelism | Zhejiang University

paper link: here

citation:

@misc{sun2024luwuendtoendinnetworkoutofcore,
      title={LuWu: An End-to-End In-Network Out-of-Core Optimizer for 100B-Scale Model-in-Network Data-Parallel Training on Distributed GPUs}, 
      author={Mo Sun and Zihan Yang and Changyue Liao and Yingtao Li and Fei Wu and Zeke Wang},
      year={2024},
      eprint={2409.00918},
      archivePrefix={arXiv},
      primaryClass={cs.DC},
      url={https://arxiv.org/abs/2409.00918}, 
}

PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel

tag: FSDP | HSDP | VLDB23 | PyTorch | Meta

paper link: here

blog link: here

docs link: here

tutorial link: here

notes link: here

follow-up work: here

citation:

@article{zhao2023pytorch,
  title={Pytorch FSDP: experiences on scaling fully sharded data parallel},
  author={Zhao, Yanli and Gu, Andrew and Varma, Rohan and Luo, Liang and Huang, Chien-Chin and Xu, Min and Wright, Less and Shojanazeri, Hamid and Ott, Myle and Shleifer, Sam and others},
  journal={arXiv preprint arXiv:2304.11277},
  year={2023}
}
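
For reference, a minimal usage sketch (not taken from the paper) of wrapping a model with PyTorch's FSDP; the model, backend, and hyperparameters are illustrative and assume a `torchrun`-style launch:

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Assumes torchrun has set RANK/WORLD_SIZE/LOCAL_RANK and NCCL is available.
dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = torch.nn.Transformer().cuda()

# FSDP shards parameters, gradients, and optimizer states across ranks,
# all-gathering parameters just in time for forward/backward computation.
model = FSDP(model)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

src = torch.rand(10, 32, 512).cuda()   # (seq, batch, d_model), illustrative shapes
tgt = torch.rand(20, 32, 512).cuda()
loss = model(src, tgt).sum()
loss.backward()          # gradients are reduce-scattered back to their owning shards
optimizer.step()
dist.destroy_process_group()
```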

MiCS: near-linear scaling for training gigantic model on public cloud

tag: MiCS | VLDB22 | Amazon | JHU

paper link: here

citation:

@misc{zhang2022micsnearlinearscalingtraining,
      title={MiCS: Near-linear Scaling for Training Gigantic Model on Public Cloud}, 
      author={Zhen Zhang and Shuai Zheng and Yida Wang and Justin Chiu and George Karypis and Trishul Chilimbi and Mu Li and Xin Jin},
      year={2022},
      eprint={2205.00119},
      archivePrefix={arXiv},
      primaryClass={cs.DC},
      url={https://arxiv.org/abs/2205.00119}, 
}

Gradient Compression Supercharged High-Performance Data Parallel DNN Training

tag: HiPress | CaSync | CompLL | SOSP21 | USTC

paper link: here

citation:

@inproceedings{bai2021gradient,
      author = {Bai, Youhui and Li, Cheng and Zhou, Quan and Yi, Jun and Gong, Ping and Yan, Feng and Chen, Ruichuan and Xu, Yinlong},
      title = {Gradient Compression Supercharged High-Performance Data Parallel DNN Training},
      year = {2021},
      isbn = {9781450387095},
      publisher = {Association for Computing Machinery},
      address = {New York, NY, USA},
      url = {https://doi.org/10.1145/3477132.3483553},
      doi = {10.1145/3477132.3483553},
      pages = {359–375},
      numpages = {17},
      keywords = {gradient compression, DNN training},
      location = {Virtual Event, Germany},
      series = {SOSP '21}
}

Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training

tag: Cross-Replica | XLA | Google

paper link: here

citation:

@misc{xu2020automatic,
      title={Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training}, 
      author={Yuanzhong Xu and HyoukJoong Lee and Dehao Chen and Hongjun Choi and Blake Hechtman and Shibo Wang},
      year={2020},
      eprint={2004.13336},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

ZeRO: Memory Optimizations Toward Training Trillion Parameter Models

tag: ZeRO | ZeRO-1 | ZeRO-2 | ZeRO-3 | DeepSpeed | SC20 | Microsoft

paper link: here

blog link: here

citation:

@inproceedings{rajbhandari2020zero,
      author = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
      title = {ZeRO: memory optimizations toward training trillion parameter models},
      year = {2020},
      isbn = {9781728199986},
      publisher = {IEEE Press},
      articleno = {20},
      numpages = {16},
      location = {Atlanta, Georgia},
      series = {SC '20}
}
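
For reference, a minimal sketch (not from the paper) of enabling ZeRO sharding through DeepSpeed; the model and config values are illustrative, and the exact config schema may vary across DeepSpeed versions:

```python
import torch
import deepspeed

# Illustrative DeepSpeed config: ZeRO stage 2 shards optimizer states and gradients.
ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "zero_optimization": {"stage": 2},
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-4}},
}

model = torch.nn.Linear(1024, 1024)

# deepspeed.initialize wraps the model into a ZeRO-aware engine and builds the optimizer.
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)

x = torch.rand(4, 1024).to(engine.device)
loss = engine(x).sum()
engine.backward(loss)   # ZeRO-aware backward (gradient partitioning/reduction)
engine.step()
```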

PyTorch Distributed Data Parallelism

tag: DDP | PyTorch | Meta

blog link: here

docs link: here

tutorial link: here

citation:

@misc{pytorch2019ddp,
      title={Distributed Data Parallel},
      author={PyTorch contributors},
      year={2019},
      url={https://pytorch.org/docs/master/notes/ddp.html}
}
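
For reference, a minimal sketch of the classic DDP recipe covered in these docs: every rank holds a full replica and gradients are all-reduced, overlapped with the backward pass. The model and shapes are illustrative and assume a `torchrun`-style launch:

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes torchrun has set RANK/WORLD_SIZE/LOCAL_RANK and NCCL is available.
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(1024, 1024).cuda()
model = DDP(model, device_ids=[local_rank])   # full replica per rank

optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
x = torch.rand(8, 1024).cuda()
loss = model(x).sum()
loss.backward()          # gradient all-reduce is overlapped with backward
optimizer.step()
dist.destroy_process_group()
```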