Here are some resources about Sparse Transformer Language Modeling for LLMs, especially Mixture-of-Experts (MoE). For orientation, a minimal top-k routing sketch follows this line, and a few entries below are followed by short illustrative sketches of their specific routing schemes.
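Below is a minimal sketch of a sparsely-gated top-k MoE feed-forward layer, the building block that most of the papers in this list refine. The class name and hyperparameters (`TopKMoE`, `d_model`, `d_ff`, `num_experts`, `top_k`) are illustrative assumptions, not taken from any specific paper listed here.

```python
# Minimal sketch of a sparsely-gated top-k MoE FFN layer (illustrative, not any paper's exact code).
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopKMoE(nn.Module):
    def __init__(self, d_model=512, d_ff=2048, num_experts=8, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.router = nn.Linear(d_model, num_experts, bias=False)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)
        ])

    def forward(self, x):                        # x: (num_tokens, d_model)
        logits = self.router(x)                  # (num_tokens, num_experts)
        weights, indices = logits.topk(self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)     # renormalize over the selected experts
        out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = indices[:, k] == e        # tokens whose k-th choice is expert e
                if mask.any():
                    out[mask] += weights[mask, k].unsqueeze(-1) * expert(x[mask])
        return out
```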
tag: MiniMax-01
| Lightning Attention
| MiniMax
paper link: here
blog link: here
code link: here
model link: here
citation:
@misc{minimax2025scaling,
author = {MiniMax AI},
title = {MiniMax-01: Scaling Foundation Models with Lightning Attention},
year = {2025},
url = {https://filecdn.minimax.chat/_Arxiv_MiniMax_01_Report.pdf},
}
tag: DeepSeek-V3
| DeepSeek
paper link: here
code link: here
model link: DeepSeek-V3
homepage link: here
citation:
@misc{deepseekai2024deepseekv3technicalreport,
title={DeepSeek-V3 Technical Report},
author={DeepSeek-AI and Aixin Liu and Bei Feng and Bing Xue and Bingxuan Wang and Bochao Wu and Chengda Lu and Chenggang Zhao and Chengqi Deng and Chenyu Zhang and Chong Ruan and Damai Dai and Daya Guo and Dejian Yang and Deli Chen and Dongjie Ji and Erhang Li and Fangyun Lin and Fucong Dai and Fuli Luo and Guangbo Hao and Guanting Chen and Guowei Li and H. Zhang and Han Bao and Hanwei Xu and Haocheng Wang and Haowei Zhang and Honghui Ding and Huajian Xin and Huazuo Gao and Hui Li and Hui Qu and J. L. Cai and Jian Liang and Jianzhong Guo and Jiaqi Ni and Jiashi Li and Jiawei Wang and Jin Chen and Jingchang Chen and Jingyang Yuan and Junjie Qiu and Junlong Li and Junxiao Song and Kai Dong and Kai Hu and Kaige Gao and Kang Guan and Kexin Huang and Kuai Yu and Lean Wang and Lecong Zhang and Lei Xu and Leyi Xia and Liang Zhao and Litong Wang and Liyue Zhang and Meng Li and Miaojun Wang and Mingchuan Zhang and Minghua Zhang and Minghui Tang and Mingming Li and Ning Tian and Panpan Huang and Peiyi Wang and Peng Zhang and Qiancheng Wang and Qihao Zhu and Qinyu Chen and Qiushi Du and R. J. Chen and R. L. Jin and Ruiqi Ge and Ruisong Zhang and Ruizhe Pan and Runji Wang and Runxin Xu and Ruoyu Zhang and Ruyi Chen and S. S. Li and Shanghao Lu and Shangyan Zhou and Shanhuang Chen and Shaoqing Wu and Shengfeng Ye and Shengfeng Ye and Shirong Ma and Shiyu Wang and Shuang Zhou and Shuiping Yu and Shunfeng Zhou and Shuting Pan and T. Wang and Tao Yun and Tian Pei and Tianyu Sun and W. L. Xiao and Wangding Zeng and Wanjia Zhao and Wei An and Wen Liu and Wenfeng Liang and Wenjun Gao and Wenqin Yu and Wentao Zhang and X. Q. Li and Xiangyue Jin and Xianzu Wang and Xiao Bi and Xiaodong Liu and Xiaohan Wang and Xiaojin Shen and Xiaokang Chen and Xiaokang Zhang and Xiaosha Chen and Xiaotao Nie and Xiaowen Sun and Xiaoxiang Wang and Xin Cheng and Xin Liu and Xin Xie and Xingchao Liu and Xingkai Yu and Xinnan Song and Xinxia Shan and Xinyi Zhou and Xinyu Yang and Xinyuan Li and Xuecheng Su and Xuheng Lin and Y. K. Li and Y. Q. Wang and Y. X. Wei and Y. X. Zhu and Yang Zhang and Yanhong Xu and Yanhong Xu and Yanping Huang and Yao Li and Yao Zhao and Yaofeng Sun and Yaohui Li and Yaohui Wang and Yi Yu and Yi Zheng and Yichao Zhang and Yifan Shi and Yiliang Xiong and Ying He and Ying Tang and Yishi Piao and Yisong Wang and Yixuan Tan and Yiyang Ma and Yiyuan Liu and Yongqiang Guo and Yu Wu and Yuan Ou and Yuchen Zhu and Yuduan Wang and Yue Gong and Yuheng Zou and Yujia He and Yukun Zha and Yunfan Xiong and Yunxian Ma and Yuting Yan and Yuxiang Luo and Yuxiang You and Yuxuan Liu and Yuyang Zhou and Z. F. Wu and Z. Z. Ren and Zehui Ren and Zhangli Sha and Zhe Fu and Zhean Xu and Zhen Huang and Zhen Zhang and Zhenda Xie and Zhengyan Zhang and Zhewen Hao and Zhibin Gou and Zhicheng Ma and Zhigang Yan and Zhihong Shao and Zhipeng Xu and Zhiyu Wu and Zhongyu Zhang and Zhuoshu Li and Zihui Gu and Zijia Zhu and Zijun Liu and Zilin Li and Ziwei Xie and Ziyang Song and Ziyi Gao and Zizheng Pan},
year={2024},
eprint={2412.19437},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.19437},
}
tag: MoT
| Meta
| Stanford University
paper link: here
citation:
@misc{liang2024mixtureoftransformerssparsescalablearchitecture,
title={Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models},
author={Weixin Liang and Lili Yu and Liang Luo and Srinivasan Iyer and Ning Dong and Chunting Zhou and Gargi Ghosh and Mike Lewis and Wen-tau Yih and Luke Zettlemoyer and Xi Victoria Lin},
year={2024},
eprint={2411.04996},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.04996},
}
tag: OLMoE
| Allen AI
paper link: here
code link: here
citation:
@misc{muennighoff2024olmoeopenmixtureofexpertslanguage,
title={OLMoE: Open Mixture-of-Experts Language Models},
author={Niklas Muennighoff and Luca Soldaini and Dirk Groeneveld and Kyle Lo and Jacob Morrison and Sewon Min and Weijia Shi and Pete Walsh and Oyvind Tafjord and Nathan Lambert and Yuling Gu and Shane Arora and Akshita Bhagia and Dustin Schwenk and David Wadden and Alexander Wettig and Binyuan Hui and Tim Dettmers and Douwe Kiela and Ali Farhadi and Noah A. Smith and Pang Wei Koh and Amanpreet Singh and Hannaneh Hajishirzi},
year={2024},
eprint={2409.02060},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.02060},
}
tag: MoNE
| ViT
| Google DeepMind
paper link: here
citation:
@misc{jain2024mixturenestedexpertsadaptive,
title={Mixture of Nested Experts: Adaptive Processing of Visual Tokens},
author={Gagan Jain and Nidhi Hegde and Aditya Kusupati and Arsha Nagrani and Shyamal Buch and Prateek Jain and Anurag Arnab and Sujoy Paul},
year={2024},
eprint={2407.19985},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2407.19985},
}
tag: PEER
| Google DeepMind
paper link: here
citation:
@misc{he2024mixturemillionexperts,
title={Mixture of A Million Experts},
author={Xu Owen He},
year={2024},
eprint={2407.04153},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2407.04153},
}
tag: DeepSeek-V2
| DeepSeek
paper link: here
code link: here
model links:
model name | link |
---|---|
DeepSeek-V2-Chat | here |
DeepSeek-V2 | here |
DeepSeek-V2-Lite-Chat | here |
DeepSeek-V2-Lite | here |
citation:
@misc{deepseekai2024deepseekv2strongeconomicalefficient,
title={DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model},
author={DeepSeek-AI and Aixin Liu and Bei Feng and Bin Wang and Bingxuan Wang and Bo Liu and Chenggang Zhao and Chengqi Dengr and Chong Ruan and Damai Dai and Daya Guo and Dejian Yang and Deli Chen and Dongjie Ji and Erhang Li and Fangyun Lin and Fuli Luo and Guangbo Hao and Guanting Chen and Guowei Li and H. Zhang and Hanwei Xu and Hao Yang and Haowei Zhang and Honghui Ding and Huajian Xin and Huazuo Gao and Hui Li and Hui Qu and J. L. Cai and Jian Liang and Jianzhong Guo and Jiaqi Ni and Jiashi Li and Jin Chen and Jingyang Yuan and Junjie Qiu and Junxiao Song and Kai Dong and Kaige Gao and Kang Guan and Lean Wang and Lecong Zhang and Lei Xu and Leyi Xia and Liang Zhao and Liyue Zhang and Meng Li and Miaojun Wang and Mingchuan Zhang and Minghua Zhang and Minghui Tang and Mingming Li and Ning Tian and Panpan Huang and Peiyi Wang and Peng Zhang and Qihao Zhu and Qinyu Chen and Qiushi Du and R. J. Chen and R. L. Jin and Ruiqi Ge and Ruizhe Pan and Runxin Xu and Ruyi Chen and S. S. Li and Shanghao Lu and Shangyan Zhou and Shanhuang Chen and Shaoqing Wu and Shengfeng Ye and Shirong Ma and Shiyu Wang and Shuang Zhou and Shuiping Yu and Shunfeng Zhou and Size Zheng and T. Wang and Tian Pei and Tian Yuan and Tianyu Sun and W. L. Xiao and Wangding Zeng and Wei An and Wen Liu and Wenfeng Liang and Wenjun Gao and Wentao Zhang and X. Q. Li and Xiangyue Jin and Xianzu Wang and Xiao Bi and Xiaodong Liu and Xiaohan Wang and Xiaojin Shen and Xiaokang Chen and Xiaosha Chen and Xiaotao Nie and Xiaowen Sun and Xiaoxiang Wang and Xin Liu and Xin Xie and Xingkai Yu and Xinnan Song and Xinyi Zhou and Xinyu Yang and Xuan Lu and Xuecheng Su and Y. Wu and Y. K. Li and Y. X. Wei and Y. X. Zhu and Yanhong Xu and Yanping Huang and Yao Li and Yao Zhao and Yaofeng Sun and Yaohui Li and Yaohui Wang and Yi Zheng and Yichao Zhang and Yiliang Xiong and Yilong Zhao and Ying He and Ying Tang and Yishi Piao and Yixin Dong and Yixuan Tan and Yiyuan Liu and Yongji Wang and Yongqiang Guo and Yuchen Zhu and Yuduan Wang and Yuheng Zou and Yukun Zha and Yunxian Ma and Yuting Yan and Yuxiang You and Yuxuan Liu and Z. Z. Ren and Zehui Ren and Zhangli Sha and Zhe Fu and Zhen Huang and Zhen Zhang and Zhenda Xie and Zhewen Hao and Zhihong Shao and Zhiniu Wen and Zhipeng Xu and Zhongyu Zhang and Zhuoshu Li and Zihan Wang and Zihui Gu and Zilin Li and Ziwei Xie},
year={2024},
eprint={2405.04434},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.04434},
}
tag: Rectify-Router
| Shanghai AI Lab
| Fudan University
paper link: here
citation:
@misc{zeng2024turnwasteworthrectifying,
title={Turn Waste into Worth: Rectifying Top-$k$ Router of MoE},
author={Zhiyuan Zeng and Qipeng Guo and Zhaoye Fei and Zhangyue Yin and Yunhua Zhou and Linyang Li and Tianxiang Sun and Hang Yan and Dahua Lin and Xipeng Qiu},
year={2024},
eprint={2402.12399},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2402.12399},
}
tag: DeepSeek MoE
| DeepSeek
paper link: here
code link: here
follow-up work: here
model links:
model name | link |
---|---|
deepseek-moe-16b-chat | here |
deepseek-moe-16b-base | here |
citation:
@misc{dai2024deepseekmoeultimateexpertspecialization,
title={DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models},
author={Damai Dai and Chengqi Deng and Chenggang Zhao and R. X. Xu and Huazuo Gao and Deli Chen and Jiashi Li and Wangding Zeng and Xingkai Yu and Y. Wu and Zhenda Xie and Y. K. Li and Panpan Huang and Fuli Luo and Chong Ruan and Zhifang Sui and Wenfeng Liang},
year={2024},
eprint={2401.06066},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2401.06066},
}
tag: SwitchHead
paper link: here
code link: here
citation:
@article{csordas2023switchhead,
title={SwitchHead: Accelerating Transformers with Mixture-of-Experts Attention},
author={Csord{\'a}s, R{\'o}bert and Pi{\k{e}}kos, Piotr and Irie, Kazuki},
journal={arXiv preprint arXiv:2312.07987},
year={2023}
}
tag: Mixtral
| Mistral AI
paper link: here
blog link: here
model links:
model name | link |
---|---|
Mixtral-SlimOrca-8x7B | here |
Mixtral-8x7B-Instruct-v0.1 | here |
Mixtral-8x7B-v0.1 | here |
citation:
@misc{jiang2024mixtral,
title={Mixtral of Experts},
author={Albert Q. Jiang and Alexandre Sablayrolles and Antoine Roux and Arthur Mensch and Blanche Savary and Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and Emma Bou Hanna and Florian Bressand and Gianna Lengyel and Guillaume Bour and Guillaume Lample and Lélio Renard Lavaud and Lucile Saulnier and Marie-Anne Lachaux and Pierre Stock and Sandeep Subramanian and Sophia Yang and Szymon Antoniak and Teven Le Scao and Théophile Gervet and Thibaut Lavril and Thomas Wang and Timothée Lacroix and William El Sayed},
year={2024},
eprint={2401.04088},
archivePrefix={arXiv},
primaryClass={cs.LG},
}
tag: QMoE
paper link: here
citation:
@misc{frantar2023qmoe,
title={QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models},
author={Elias Frantar and Dan Alistarh},
year={2023},
eprint={2310.16795},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: Soft MoE
| Google DeepMind
paper link: here
citation:
@article{puigcerver2023sparse,
title={From sparse to soft mixtures of experts},
author={Puigcerver, Joan and Riquelme, Carlos and Mustafa, Basil and Houlsby, Neil},
journal={arXiv preprint arXiv:2308.00951},
year={2023}
}
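A minimal sketch of the soft routing described in the Soft MoE paper above: dispatch weights are a softmax over tokens (each slot is a convex combination of all tokens), combine weights are a softmax over slots (each output token mixes all slot outputs). The class name, expert MLP shape, and initialization below are assumptions.

```python
# Minimal sketch of a Soft MoE layer (illustrative assumptions on names, shapes, and init).
import torch
import torch.nn as nn

class SoftMoE(nn.Module):
    def __init__(self, d_model=512, d_ff=2048, num_experts=8, slots_per_expert=1):
        super().__init__()
        self.num_experts, self.slots = num_experts, slots_per_expert
        self.phi = nn.Parameter(torch.randn(d_model, num_experts * slots_per_expert) * d_model ** -0.5)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)
        ])

    def forward(self, x):                         # x: (num_tokens, d_model)
        logits = x @ self.phi                     # (num_tokens, num_experts * slots)
        dispatch = logits.softmax(dim=0)          # normalize over tokens (per slot)
        combine = logits.softmax(dim=-1)          # normalize over slots (per token)
        slots = dispatch.t() @ x                  # (num_experts * slots, d_model)
        slots = slots.view(self.num_experts, self.slots, -1)
        outs = torch.stack([exp(s) for exp, s in zip(self.experts, slots)])
        outs = outs.view(self.num_experts * self.slots, -1)
        return combine @ outs                     # (num_tokens, d_model)
```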
tag: OpenMoE
| NUS
paper link: here
code link: here
citation:
@misc{openmoe2023,
author = {Fuzhao Xue and Zian Zheng and Yao Fu and Jinjie Ni and Zangwei Zheng and Wangchunshu Zhou and Yang You},
title = {OpenMoE: Open Mixture-of-Experts Language Models},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/XueFuzhao/OpenMoE}},
}
tag: MegaBlocks
| dMoE
| MLSys23
| Stanford University
| Google
| Microsoft
paper link: here
code link: here
citation:
@misc{gale2022megablocks,
title={MegaBlocks: Efficient Sparse Training with Mixture-of-Experts},
author={Trevor Gale and Deepak Narayanan and Cliff Young and Matei Zaharia},
year={2022},
eprint={2211.15841},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: MT-TaG
| Microsoft
| Google
paper link: here
citation:
@misc{gupta2022sparsely,
title={Sparsely Activated Mixture-of-Experts are Robust Multi-Task Learners},
author={Shashank Gupta and Subhabrata Mukherjee and Krishan Subudhi and Eduardo Gonzalez and Damien Jose and Ahmed H. Awadallah and Jianfeng Gao},
year={2022},
eprint={2204.07689},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: ST-MoE
| Google Brain
paper link: here
code link: here
citation:
@misc{zoph2022stmoedesigningstabletransferable,
title={ST-MoE: Designing Stable and Transferable Sparse Expert Models},
author={Barret Zoph and Irwan Bello and Sameer Kumar and Nan Du and Yanping Huang and Jeff Dean and Noam Shazeer and William Fedus},
year={2022},
eprint={2202.08906},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2202.08906},
}
tag: Sinkhorn-BASE
| Google DeepMind
paper link: here
citation:
@misc{clark2022unified,
title={Unified Scaling Laws for Routed Language Models},
author={Aidan Clark and Diego de las Casas and Aurelia Guy and Arthur Mensch and Michela Paganini and Jordan Hoffmann and Bogdan Damoc and Blake Hechtman and Trevor Cai and Sebastian Borgeaud and George van den Driessche and Eliza Rutherford and Tom Hennigan and Matthew Johnson and Katie Millican and Albin Cassirer and Chris Jones and Elena Buchatskaya and David Budden and Laurent Sifre and Simon Osindero and Oriol Vinyals and Jack Rae and Erich Elsen and Koray Kavukcuoglu and Karen Simonyan},
year={2022},
eprint={2202.01169},
archivePrefix={arXiv},
primaryClass={cs.CL},
}
tag: Switch Transformer
| SMoE
| JMLR23
| Google
paper link: here
code link: here
citation:
@article{fedus2022switch,
title={Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity},
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
journal={The Journal of Machine Learning Research},
volume={23},
number={1},
pages={5232--5270},
year={2022},
publisher={JMLR.org}
}
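A minimal sketch of the top-1 ("switch") routing and the auxiliary load-balancing loss described in the Switch Transformer paper above, loss = alpha * N * sum_i f_i * P_i, where f_i is the fraction of tokens dispatched to expert i and P_i is the mean router probability for expert i. The function name, tensor shapes, and the alpha default are assumptions.

```python
# Minimal sketch of top-1 switch routing with the auxiliary load-balancing loss (assumed shapes).
import torch
import torch.nn.functional as F

def switch_route(router_logits: torch.Tensor, alpha: float = 0.01):
    """router_logits: (num_tokens, num_experts) -> (expert_index, gate, aux_loss)."""
    num_experts = router_logits.size(-1)
    probs = F.softmax(router_logits, dim=-1)          # (num_tokens, num_experts)
    gate, expert_index = probs.max(dim=-1)            # top-1 expert per token
    dispatch = F.one_hot(expert_index, num_experts).float()
    f = dispatch.mean(dim=0)                          # fraction of tokens per expert
    P = probs.mean(dim=0)                             # mean router probability per expert
    aux_loss = alpha * num_experts * torch.sum(f * P)
    return expert_index, gate, aux_loss
```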
tag: BASE
| Meta
paper link: here
code link: here
follow-up work: here
citation:
@misc{lewis2021base,
title={BASE Layers: Simplifying Training of Large, Sparse Models},
author={Mike Lewis and Shruti Bhosale and Tim Dettmers and Naman Goyal and Luke Zettlemoyer},
year={2021},
eprint={2103.16716},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
tag: Hash
| NeurIPS21
| Meta
paper link: here
citation:
@article{roller2021hash,
title={Hash layers for large sparse models},
author={Roller, Stephen and Sukhbaatar, Sainbayar and Weston, Jason and others},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={17555--17566},
year={2021}
}
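A minimal sketch of the fixed hash routing studied in the Hash Layers paper above: each vocabulary id is mapped to an expert by a fixed table, so no router is learned and no balancing loss is needed. The helper name and the uniformly random table are illustrative assumptions; the paper compares several hashing schemes.

```python
# Minimal sketch of fixed hash-based routing (illustrative random-table variant).
import random

def make_hash_router(vocab_size: int, num_experts: int, seed: int = 0):
    """Build a fixed token-id -> expert-id mapping."""
    rng = random.Random(seed)
    table = [rng.randrange(num_experts) for _ in range(vocab_size)]
    return lambda token_id: table[token_id]

# Usage: route = make_hash_router(vocab_size=32000, num_experts=8); expert = route(token_id)
```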
tag: Sparse MoE
| NeurIPS21
| Google
| OpenAI
paper link: here
citation:
@article{jaszczur2021sparse,
title={Sparse is enough in scaling transformers},
author={Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, Lukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={9895--9907},
year={2021}
}
tag: MoE
| ICLR17
| Google Brain
paper link: here
citation:
@article{shazeer2017outrageously,
title={Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
journal={arXiv preprint arXiv:1701.06538},
year={2017}
}
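A minimal sketch of the noisy top-k gating from the sparsely-gated MoE paper above: H(x)_i = (x W_g)_i + N(0,1) * softplus((x W_noise)_i), followed by a softmax over the top-k logits with the remaining entries masked out. The function name, shapes, and the train/eval switch are assumptions.

```python
# Minimal sketch of noisy top-k gating (illustrative; shapes and names assumed).
import torch
import torch.nn.functional as F

def noisy_top_k_gating(x, w_gate, w_noise, k=2, train=True):
    """x: (num_tokens, d_model); w_gate, w_noise: (d_model, num_experts)."""
    clean_logits = x @ w_gate
    if train:
        noise_std = F.softplus(x @ w_noise)
        logits = clean_logits + torch.randn_like(clean_logits) * noise_std
    else:
        logits = clean_logits
    top_vals, top_idx = logits.topk(k, dim=-1)
    masked = torch.full_like(logits, float("-inf"))
    masked.scatter_(-1, top_idx, top_vals)
    gates = F.softmax(masked, dim=-1)      # zero outside the selected top-k experts
    return gates, top_idx
```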