Here are some resources about context parallelism for LLM training. Note that "sequence parallelism" usually refers to an auxiliary strategy attached to tensor parallelism (it shards only the activations between tensor-parallel regions along the sequence dimension), which is different from context parallelism.
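To make the distinction concrete, here is a minimal, illustrative PyTorch sketch (not taken from any of the works below; `shard_sequence` is a hypothetical helper) of how context parallelism splits activations along the sequence dimension across ranks, leaving attention as the only place that needs cross-device communication:

```python
# Hedged sketch: context parallelism splits the *sequence* dimension of activations
# across ranks. Per-token modules (MLP, LayerNorm) then run purely on the local shard;
# only attention has to communicate, which is exactly what the papers below optimize
# (ring P2P, all-to-all, hybrids, ...).
import torch
import torch.distributed as dist

def shard_sequence(x: torch.Tensor, group=None) -> torch.Tensor:
    """Keep this rank's contiguous chunk of x: [batch, seq_len, hidden].

    Assumes seq_len is divisible by the context-parallel group size.
    """
    world = dist.get_world_size(group)
    rank = dist.get_rank(group)
    return x.chunk(world, dim=1)[rank].contiguous()
```

Megatron-style "sequence parallelism", by contrast, applies such a split only to the LayerNorm/Dropout activations that sit between tensor-parallel regions and gathers them back before every attention/MLP block.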
Training Variable Sequences with Data-Centric Parallel
tag: DCP | VideoSys | NUS
docs link: here
code link: here
homepage link: here
citation:
@misc{zhang2024dcp,
title={Training Variable Sequences with Data-Centric Parallel},
author={Geng Zhang and Xuanlei Zhao and Kai Wang and Yang You},
year={2024},
}
TokenRing: An Efficient Parallelism Framework for Infinite-Context LLMs via Bidirectional Communication
tag: TokenRing | SJTU
paper link: here
code link: here
citation:
@misc{wang2024tokenringefficientparallelismframework,
title={TokenRing: An Efficient Parallelism Framework for Infinite-Context LLMs via Bidirectional Communication},
author={Zongwu Wang and Fangxin Liu and Mingshuai Li and Li Jiang},
year={2024},
eprint={2412.20501},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2412.20501},
}
Data-Centric and Heterogeneity-Adaptive Sequence Parallelism for Efficient LLM Training
tag: FlexSP | Peking University
paper link: here
citation:
@misc{wang2024datacentricheterogeneityadaptivesequenceparallelism,
title={Data-Centric and Heterogeneity-Adaptive Sequence Parallelism for Efficient LLM Training},
author={Yujie Wang and Shiju Wang and Shenhan Zhu and Fangcheng Fu and Xinyi Liu and Xuefeng Xiao and Huixia Li and Jiashi Li and Faming Wu and Bin Cui},
year={2024},
eprint={2412.01523},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2412.01523},
}
Training Ultra Long Context Language Model with Fully Pipelined Distributed Transformer
tag: FPDT | DeepSpeed Ulysses | ZeRO-3 | Microsoft
paper link: here
code link: here
citation:
@misc{yao2024trainingultralongcontext,
title={Training Ultra Long Context Language Model with Fully Pipelined Distributed Transformer},
author={Jinghan Yao and Sam Ade Jacobs and Masahiro Tanaka and Olatunji Ruwase and Aamir Shafi and Hari Subramoni and Dhabaleswar K. Panda},
year={2024},
eprint={2408.16978},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2408.16978},
}
LongVILA: Scaling Long-Context Visual Language Models for Long Videos
tag: LongVILA | MM-SP | NVIDIA | MIT | UC Berkeley
paper link: here
code link: here
citation:
@misc{chen2024longvilascalinglongcontextvisual,
title={LongVILA: Scaling Long-Context Visual Language Models for Long Videos},
author={Yukang Chen and Fuzhao Xue and Dacheng Li and Qinghao Hu and Ligeng Zhu and Xiuyu Li and Yunhao Fang and Haotian Tang and Shang Yang and Zhijian Liu and Ethan He and Hongxu Yin and Pavlo Molchanov and Jan Kautz and Linxi Fan and Yuke Zhu and Yao Lu and Song Han},
year={2024},
eprint={2408.10188},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2408.10188},
}
WallFacer: Harnessing Multi-dimensional Ring Parallelism for Efficient Long Sequence Model Training
tag: WallFacer | Ring Attention | NUS | UC Berkeley
paper link: here
citation:
@misc{liu2024wallfacerharnessingmultidimensionalring,
title={WallFacer: Harnessing Multi-dimensional Ring Parallelism for Efficient Long Sequence Model Training},
author={Ziming Liu and Shaoyu Wang and Shenggan Cheng and Zhongkai Zhao and Kai Wang and Xuanlei Zhao and James Demmel and Yang You},
year={2024},
eprint={2407.00611},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2407.00611},
}
USP: A Unified Sequence Parallelism Approach for Long Context Generative AI
tag: USP | Tencent
paper link: here
code link: here
citation:
@misc{fang2024uspunifiedsequenceparallelism,
title={USP: A Unified Sequence Parallelism Approach for Long Context Generative AI},
author={Jiarui Fang and Shangchun Zhao},
year={2024},
eprint={2405.07719},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.07719},
}
Striped Attention: Faster Ring Attention for Causal Transformers
tag: Striped Attention | Ring Attention | Load Balance | MIT
paper link: here
code link: here
citation:
@misc{brandon2023stripedattentionfasterring,
title={Striped Attention: Faster Ring Attention for Causal Transformers},
author={William Brandon and Aniruddha Nrusimha and Kevin Qian and Zachary Ankner and Tian Jin and Zhiye Song and Jonathan Ragan-Kelley},
year={2023},
eprint={2311.09431},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2311.09431},
}
DISTFLASHATTN: Distributed Memory-efficient Attention for Long-context LLMs Training
tag: DISTFLASHATTN | COLM24 | UC Berkeley
paper link: here
code link: here
citation:
@misc{li2024distflashattndistributedmemoryefficientattention,
title={DISTFLASHATTN: Distributed Memory-efficient Attention for Long-context LLMs Training},
author={Dacheng Li and Rulin Shao and Anze Xie and Eric P. Xing and Xuezhe Ma and Ion Stoica and Joseph E. Gonzalez and Hao Zhang},
year={2024},
eprint={2310.03294},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2310.03294},
}
Ring Attention with Blockwise Transformers for Near-Infinite Context
tag: Ring Attention | Ring Flash Attention | ICLR24 | UC Berkeley
paper link: here
code link: here
citation:
@misc{liu2023ringattentionblockwisetransformers,
title={Ring Attention with Blockwise Transformers for Near-Infinite Context},
author={Hao Liu and Matei Zaharia and Pieter Abbeel},
year={2023},
eprint={2310.01889},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.01889},
}
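The ring pattern behind the entry above can be sketched in a few lines: every rank keeps its query shard fixed while the K/V shards rotate around the ring, so after world-size steps each local query block has attended over the full sequence. The sketch below is deliberately naive and non-causal (no FlashAttention kernel, no log-sum-exp rescaling, no compute/communication overlap); `naive_ring_attention` and `ring_send_recv` are hypothetical names, not the paper's or ring-flash-attention's API.

```python
# Naive, non-causal illustration of ring attention: K/V blocks travel one hop per step.
import torch
import torch.distributed as dist

def ring_send_recv(t: torch.Tensor) -> torch.Tensor:
    """Send t to the next rank and receive the previous rank's tensor."""
    rank, world = dist.get_rank(), dist.get_world_size()
    recv = torch.empty_like(t)
    ops = [
        dist.P2POp(dist.isend, t.contiguous(), (rank + 1) % world),
        dist.P2POp(dist.irecv, recv, (rank - 1) % world),
    ]
    for req in dist.batch_isend_irecv(ops):
        req.wait()
    return recv

def naive_ring_attention(q, k, v):
    """q, k, v: local shards of shape [batch, local_seq, heads, head_dim]."""
    world = dist.get_world_size()
    scale = q.shape[-1] ** -0.5
    numer = torch.zeros_like(q)                                    # running sum of exp(qk) @ v
    denom = torch.zeros(*q.shape[:-1], 1, device=q.device, dtype=q.dtype)  # running sum of exp(qk)

    for step in range(world):
        scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * scale     # local q vs current K block
        weights = scores.exp()                                     # unnormalized attention
        numer += torch.einsum("bhqk,bkhd->bqhd", weights, v)
        denom += weights.sum(-1).permute(0, 2, 1).unsqueeze(-1)    # [batch, q, heads, 1]
        if step < world - 1:
            k, v = ring_send_recv(k), ring_send_recv(v)            # rotate K/V one hop

    return numer / denom                                           # softmax normalization at the end
```

Striped Attention (listed above) addresses the load imbalance this ring develops once a causal mask is applied, where ranks holding later sequence chunks do most of the unmasked work.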
DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
tag: DeepSpeed Ulysses | PODC24 | Microsoft
paper link: here
blog link: here
code link: here
citation:
@misc{jacobs2023deepspeed,
title={DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models},
author={Sam Ade Jacobs and Masahiro Tanaka and Chengming Zhang and Minjia Zhang and Shuaiwen Leon Song and Samyam Rajbhandari and Yuxiong He},
year={2023},
eprint={2309.14509},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2309.14509},
}
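The core of the Ulysses approach is an all-to-all that re-partitions attention inputs from sequence-sharded/all-heads to head-sharded/full-sequence, so each rank runs ordinary (flash) attention over its subset of heads; a second all-to-all restores the sequence sharding afterwards. Below is a hedged sketch of that first re-partition (illustrative names, not DeepSpeed's API; assumes the head count is divisible by the group size):

```python
# Hedged sketch of a Ulysses-style all-to-all: input is sequence-sharded with all heads,
# output is head-sharded with the full sequence.
import torch
import torch.distributed as dist

def seq_to_head_all_to_all(x: torch.Tensor, group=None) -> torch.Tensor:
    """x: [batch, seq_len / P, heads, head_dim] -> [batch, seq_len, heads / P, head_dim]."""
    world = dist.get_world_size(group)
    # Each rank sends one head-group to every other rank...
    send = [c.contiguous() for c in x.chunk(world, dim=2)]
    recv = [torch.empty_like(send[0]) for _ in range(world)]
    dist.all_to_all(recv, send, group=group)
    # ...and receives the other ranks' sequence chunks of *its own* head-group.
    return torch.cat(recv, dim=1)

# Inside an attention layer (pseudocode): q, k, v arrive sequence-sharded.
#   q, k, v = map(seq_to_head_all_to_all, (q, k, v))   # full sequence, heads / P heads each
#   out = local_flash_attention(q, k, v)               # no further communication needed
#   out = head_to_seq_all_to_all(out)                  # inverse all-to-all (not shown)
```

The trade-off relative to ring-style schemes is that the parallelism degree is bounded by the number of attention heads, which is why hybrids such as USP (above) combine Ulysses-style all-to-all with ring attention.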
Sequence Parallelism: Long Sequence Training from System Perspective
tag: RSA | Ring Self-Attention | ACL23 | NUS
paper link: here
citation:
@article{li2021sequence,
title={Sequence parallelism: Long sequence training from system perspective},
author={Li, Shenggui and Xue, Fuzhao and Baranwal, Chaitanya and Li, Yongbin and You, Yang},
journal={arXiv preprint arXiv:2105.13120},
year={2021}
}