Here are some resources about State Space Models (SSMs) for sequence modeling:
Hymba: A Hybrid-head Architecture for Small Language Models
tag: Hymba
| Mamba
| SLM
| Nvidia
paper link: here
blog link: here
code link: here
modelhub link: here
citation:
@misc{dong2024hymbahybridheadarchitecturesmall,
title={Hymba: A Hybrid-head Architecture for Small Language Models},
author={Xin Dong and Yonggan Fu and Shizhe Diao and Wonmin Byeon and Zijia Chen and Ameya Sunil Mahabaleshwarkar and Shih-Yang Liu and Matthijs Van Keirsbilck and Min-Hung Chen and Yoshi Suhara and Yingyan Lin and Jan Kautz and Pavlo Molchanov},
year={2024},
eprint={2411.13676},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.13676},
}
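
Hymba's core idea is to put attention heads and SSM heads in parallel inside the same layer, operating on the same input, and fuse their normalized outputs. Below is a conceptual NumPy sketch of that parallel hybrid head, with a generic diagonal linear recurrence standing in for the Mamba heads and a single causal attention head; all names, shapes, and the fusion details are my assumptions, not the paper's implementation.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def hybrid_head_layer(x, Wq, Wk, Wv, A, B, C, beta_attn=1.0, beta_ssm=1.0):
    """Conceptual Hymba-style layer: attention and SSM branches in parallel on the same input."""
    T, d = x.shape
    # Attention branch (single causal head for illustration).
    q, k, v = x @ Wq, x @ Wk, x @ Wv
    scores = q @ k.T / np.sqrt(d) + np.triu(np.full((T, T), -np.inf), k=1)
    attn_out = softmax(scores) @ v
    # SSM branch: a plain diagonal linear recurrence as a stand-in for Mamba heads.
    h, ssm_out = np.zeros(A.shape[0]), np.zeros_like(x)
    for t in range(T):
        h = A * h + B @ x[t]
        ssm_out[t] = C @ h
    # Fuse: normalize each branch, then average with learnable-style scalars.
    norm = lambda y: y / (np.linalg.norm(y, axis=-1, keepdims=True) + 1e-6)
    return (beta_attn * norm(attn_out) + beta_ssm * norm(ssm_out)) / 2
```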
Falcon Mamba: The First Competitive Attention-free 7B Language Model
tag: Falcon Mamba
| SSLM
| TII
paper link: here
blog link: here
modelhub link: here
citation:
@misc{zuo2024falconmambacompetitiveattentionfree,
title={Falcon Mamba: The First Competitive Attention-free 7B Language Model},
author={Jingwei Zuo and Maksim Velikanov and Dhia Eddine Rhaiem and Ilyas Chahed and Younes Belkada and Guillaume Kunsch and Hakim Hacid},
year={2024},
eprint={2410.05355},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.05355},
}
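
Falcon Mamba is released as open weights with transformers integration, so trying the model is a few lines; a minimal generation sketch, assuming the tiiuae/falcon-mamba-7b checkpoint id and a transformers release recent enough to include the FalconMamba architecture:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "tiiuae/falcon-mamba-7b"  # assumed Hugging Face checkpoint id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("State space models are", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```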
Samba: Simple Hybrid State Space Models for Efficient Unlimited Context Language Modeling
tag: Samba
| Microsoft
paper link: here
citation:
@misc{ren2024sambasimplehybridstate,
title={Samba: Simple Hybrid State Space Models for Efficient Unlimited Context Language Modeling},
author={Liliang Ren and Yang Liu and Yadong Lu and Yelong Shen and Chen Liang and Weizhu Chen},
year={2024},
eprint={2406.07522},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.07522},
}
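
Samba is purely an architectural recipe: interleave Mamba blocks with sliding window attention (SWA) and MLPs layer-wise, letting the SSM carry long-range recurrent context while SWA handles precise local retrieval. A schematic of the stacking pattern with placeholder block factories (the Mamba → MLP → SWA → MLP group order follows my reading of the paper; treat it as an assumption):

```python
def build_samba_stack(num_groups, make_mamba, make_swa, make_mlp):
    """Interleave Mamba, sliding-window attention, and MLP blocks layer-wise."""
    layers = []
    for _ in range(num_groups):
        # One hybrid group: SSM for recurrence, SWA for local retrieval.
        layers += [make_mamba(), make_mlp(), make_swa(), make_mlp()]
    return layers

# Placeholder factories, just to show the resulting layout.
print(build_samba_stack(2, lambda: "Mamba", lambda: "SWA", lambda: "MLP"))
```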
Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality
tag: Mamba2
| Princeton University
| CMU
paper link: here
code link: here
citation:
@misc{dao2024transformersssmsgeneralizedmodels,
title={Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality},
author={Tri Dao and Albert Gu},
year={2024},
eprint={2405.21060},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.21060},
}
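
The structured state space duality is easiest to see in the scalar-identity case the paper builds on: the SSM recurrence with A_t = a_t·I is exactly a masked (1-semiseparable) matrix multiply, i.e. a form of linear attention. A toy NumPy check that the recurrent and matrix ("attention") forms agree (my own variable names and shapes):

```python
import numpy as np

rng = np.random.default_rng(0)
T, N, P = 6, 4, 3                        # time, state size, head dim
a = rng.uniform(0.5, 1.0, T)             # scalar decay per step (A_t = a_t * I)
B, C = rng.normal(size=(T, N)), rng.normal(size=(T, N))
x = rng.normal(size=(T, P))

# Recurrent form: h_t = a_t h_{t-1} + B_t x_t^T ; y_t = C_t h_t
h, y_rec = np.zeros((N, P)), np.zeros((T, P))
for t in range(T):
    h = a[t] * h + np.outer(B[t], x[t])
    y_rec[t] = C[t] @ h

# Dual "attention" form: y = (L ⊙ (C B^T)) x, with L[t, s] the product of decays on (s, t]
L = np.zeros((T, T))
for t in range(T):
    for s in range(t + 1):
        L[t, s] = np.prod(a[s + 1 : t + 1]) if s < t else 1.0
y_mat = (L * (C @ B.T)) @ x

assert np.allclose(y_rec, y_mat)
```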
MambaOut: Do We Really Need Mamba for Vision?
tag: MambaOut
| Vision Mamba
| NUS
paper link: here
code link: here
citation:
@misc{yu2024mambaoutreallyneedmamba,
title={MambaOut: Do We Really Need Mamba for Vision?},
author={Weihao Yu and Xinchao Wang},
year={2024},
eprint={2405.07992},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2405.07992},
}
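
MambaOut's hypothesis test is blunt: strip the SSM out of the Mamba block and keep the Gated CNN block (a depthwise-convolution token mixer inside a gating branch), and image classification does not suffer. A bare-bones 1D version of such a gated block (the paper works on 2D feature maps and convolves only part of the channels; shapes and names here are my assumptions):

```python
import numpy as np

def gelu(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

def depthwise_conv1d(x, kernels):
    """Per-channel 1D convolution with 'same' padding. x: (T, d), kernels: (d, k)."""
    T, d = x.shape
    k = kernels.shape[1]
    pad = k // 2
    xp = np.pad(x, ((pad, k - 1 - pad), (0, 0)))
    return np.stack([np.convolve(xp[:, c], kernels[c], mode="valid")
                     for c in range(d)], axis=1)

def gated_cnn_block(x, W_in, W_gate, W_out, kernels):
    """Gated CNN token mixer: out = (conv(x W_in) * gelu(x W_gate)) W_out."""
    mixed = depthwise_conv1d(x @ W_in, kernels)
    return (mixed * gelu(x @ W_gate)) @ W_out
```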
The Illusion of State in State-Space Models
tag: SSM Illusion
| ICML24
| Allen AI
| NYU
paper link: here
code link: here
homepage link: here
citation:
@inproceedings{merrill-2024-illusion,
title = {{The illusion of state in state-space models}},
author = {Merrill, William and Petty, Jackson and Sabharwal, Ashish},
booktitle = {{Forty-first International Conference on Machine Learning}},
eventtitle = {ICML},
venue = {Vienna, Austria},
eventdate = {2024-07-21},
date = {2024-04-12},
}
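
The paper's separation argument uses state-tracking problems such as composing permutations (the S5 word problem), which is NC1-hard and therefore, they argue, out of reach of fixed-depth SSMs, which they place in TC0. A tiny generator for that task, useful as a sanity probe for any sequence model (the prefix-prediction format below is an assumed setup, not the paper's exact protocol):

```python
import itertools, random

# The S5 word problem: given a sequence of permutations of {0..4},
# output the running composition after each prefix.
PERMS = list(itertools.permutations(range(5)))

def compose(p, q):
    """Apply p after q: (p ∘ q)(i) = p[q[i]]."""
    return tuple(p[i] for i in q)

def sample_instance(length, rng=random):
    seq = [rng.choice(PERMS) for _ in range(length)]
    state = tuple(range(5))
    targets = []
    for p in seq:
        state = compose(p, state)
        targets.append(state)
    return seq, targets

seq, targets = sample_instance(8)
print(targets[-1])  # the full composition the model must track
```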
Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model
tag: ViM
| Vision Mamba
| ICML24
| HUST
paper link: here
code link: here
citation:
@misc{zhu2024visionmambaefficientvisual,
title={Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model},
author={Lianghui Zhu and Bencheng Liao and Qian Zhang and Xinlong Wang and Wenyu Liu and Xinggang Wang},
year={2024},
eprint={2401.09417},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2401.09417},
}
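
Architecturally, ViM's bidirectional block is the main delta over Mamba: each block scans the patch sequence with an SSM both forward and backward and merges the two passes, so every patch sees global context despite the recurrent form. A stripped-down sketch with a generic diagonal linear recurrence standing in for the selective scan (the paper uses separate parameters per direction; sharing them here is a simplification):

```python
import numpy as np

def ssm_scan(x, A, B, C):
    """Simple diagonal linear SSM: h_t = A * h_{t-1} + B x_t ; y_t = C h_t."""
    h = np.zeros(A.shape[0])
    out = np.empty_like(x)
    for t in range(len(x)):
        h = A * h + B @ x[t]
        out[t] = C @ h
    return out

def bidirectional_ssm(x, A, B, C):
    fwd = ssm_scan(x, A, B, C)
    bwd = ssm_scan(x[::-1], A, B, C)[::-1]   # scan the reversed patch sequence
    return fwd + bwd                          # merge the two directions
```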
Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence
tag: RWKV5
| RWKV6
| EleutherAI
paper link: here
code link: here
citation:
@misc{peng2024eaglefinchrwkvmatrixvalued,
title={Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence},
author={Bo Peng and Daniel Goldstein and Quentin Anthony and Alon Albalak and Eric Alcaide and Stella Biderman and Eugene Cheah and Xingjian Du and Teddy Ferdinan and Haowen Hou and Przemysław Kazienko and Kranthi Kiran GV and Jan Kocoń and Bartłomiej Koptyra and Satyapriya Krishna and Ronald McClelland Jr. and Jiaju Lin and Niklas Muennighoff and Fares Obeid and Atsushi Saito and Guangyu Song and Haoqin Tu and Cahya Wirawan and Stanisław Woźniak and Ruichong Zhang and Bingchen Zhao and Qihang Zhao and Peng Zhou and Jian Zhu and Rui-Jie Zhu},
year={2024},
eprint={2404.05892},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05892},
}
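
The headline change over RWKV-4 is the matrix-valued state: each head keeps a d×d state updated with a per-channel decay w and a bonus u for the current token, which puts it in the gated-linear-attention family. A minimal single-head recurrence in the spirit of Eagle (RWKV-5); variable names are mine, and token shift, gating, and normalization are omitted:

```python
import numpy as np

def eagle_head(r, k, v, w, u):
    """Matrix-valued-state recurrence.
    r, k, v: (T, d); w: (d,) per-channel decay in (0, 1); u: (d,) current-token bonus."""
    T, d = r.shape
    S = np.zeros((d, d))                 # matrix-valued state
    out = np.zeros((T, d))
    for t in range(T):
        kv = np.outer(k[t], v[t])        # rank-one update for the current token
        out[t] = r[t] @ (S + u[:, None] * kv)
        S = w[:, None] * S + kv          # decay old state, accumulate new
    return out
```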
Mamba: Linear-Time Sequence Modeling with Selective State Spaces
tag: Mamba
| Princeton University
| CMU
paper link: here
code link: here
follow-up work: here
citation:
@article{gu2023mamba,
title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces},
author={Gu, Albert and Dao, Tri},
journal={arXiv preprint arXiv:2312.00752},
year={2023}
}
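
Mamba's selective SSM (S6) makes Δ_t, B_t, and C_t functions of the input token, discretizes per step, and runs a scan; that input dependence is what lets the model remember or ignore tokens content-dependently. A reference, unoptimized single-channel scan matching the paper's recurrence (the real kernel fuses this into a hardware-aware parallel scan):

```python
import numpy as np

def selective_scan(x, delta, A, B, C, D):
    """x, delta: (T,); A: (N,) diagonal; B, C: (T, N); D: scalar. Single channel.
    Per-step discretization: Abar_t = exp(delta_t * A), Bbar_t = delta_t * B_t."""
    T, N = B.shape
    h, y = np.zeros(N), np.zeros(T)
    for t in range(T):
        Abar = np.exp(delta[t] * A)               # ZOH discretization of diagonal A
        h = Abar * h + (delta[t] * B[t]) * x[t]   # simplified Euler rule for B, as in the paper
        y[t] = C[t] @ h + D * x[t]                # skip connection via D
    return y
```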
Retentive Network: A Successor to Transformer for Large Language Models
tag: RetNet
| Microsoft
| Tsinghua University
paper link: here
citation:
@article{sun2023retentive,
title={Retentive network: A successor to transformer for large language models},
author={Sun, Yutao and Dong, Li and Huang, Shaohan and Ma, Shuming and Xia, Yuqing and Xue, Jilong and Wang, Jianyong and Wei, Furu},
journal={arXiv preprint arXiv:2307.08621},
year={2023}
}
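
Retention is defined so the same layer has a parallel form for training, o = (QKᵀ ⊙ D)V with D[n, m] = γ^(n−m), and a recurrent form for O(1)-per-token inference, S_n = γ S_{n−1} + k_nᵀ v_n, o_n = q_n S_n. A minimal single-head recurrent retention (real-valued; the paper additionally applies xPos-style rotations to q and k):

```python
import numpy as np

def recurrent_retention(q, k, v, gamma):
    """q, k, v: (T, d); gamma: scalar decay in (0, 1)."""
    T, d = q.shape
    S = np.zeros((d, d))                 # recurrent retention state
    out = np.zeros((T, d))
    for t in range(T):
        S = gamma * S + np.outer(k[t], v[t])
        out[t] = q[t] @ S
    return out
```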
RWKV: Reinventing RNNs for the Transformer Era
tag: RWKV
| RWKV4
| EleutherAI
paper link: here
code link: here
follow-up work: here
citation:
@misc{peng2023rwkvreinventingrnnstransformer,
title={RWKV: Reinventing RNNs for the Transformer Era},
author={Bo Peng and Eric Alcaide and Quentin Anthony and Alon Albalak and Samuel Arcadinho and Stella Biderman and Huanqi Cao and Xin Cheng and Michael Chung and Matteo Grella and Kranthi Kiran GV and Xuzheng He and Haowen Hou and Jiaju Lin and Przemyslaw Kazienko and Jan Kocon and Jiaming Kong and Bartlomiej Koptyra and Hayden Lau and Krishna Sri Ipsit Mantri and Ferdinand Mom and Atsushi Saito and Guangyu Song and Xiangru Tang and Bolun Wang and Johan S. Wind and Stanislaw Wozniak and Ruichong Zhang and Zhenyuan Zhang and Qihang Zhao and Peng Zhou and Qinghua Zhou and Jian Zhu and Rui-Jie Zhu},
year={2023},
eprint={2305.13048},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2305.13048},
}
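
RWKV-4's token mixing is the scalar wkv recurrence: a per-channel, exponentially decayed weighted average of past values, with a separate bonus u for the current token. A direct, numerically naive version of that recurrence (the released code keeps a running maximum inside the exponentials for stability; omitted here):

```python
import numpy as np

def wkv(k, v, w, u):
    """k, v: (T, d); w: (d,) positive decay; u: (d,) current-token bonus.
    wkv_t = (a + e^{u + k_t} v_t) / (b + e^{u + k_t}), with a, b the
    decayed running numerator / denominator over past tokens."""
    T, d = k.shape
    a, b = np.zeros(d), np.zeros(d)
    out = np.zeros((T, d))
    for t in range(T):
        e_cur = np.exp(u + k[t])
        out[t] = (a + e_cur * v[t]) / (b + e_cur)
        a = np.exp(-w) * a + np.exp(k[t]) * v[t]   # decay, then absorb token t
        b = np.exp(-w) * b + np.exp(k[t])
    return out
```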
Resurrecting Recurrent Neural Networks for Long Sequences
tag: LRU
| Google DeepMind
paper link: here
code link: here
citation:
@article{orvieto2023resurrecting,
title={Resurrecting recurrent neural networks for long sequences},
author={Orvieto, Antonio and Smith, Samuel L and Gu, Albert and Fernando, Anushan and Gulcehre, Caglar and Pascanu, Razvan and De, Soham},
journal={arXiv preprint arXiv:2303.06349},
year={2023}
}
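
The LRU fixes linear RNN training not with state space theory but with parameterization: diagonal complex eigenvalues λ = exp(−exp(ν) + i·exp(θ)), stable by construction, plus a γ = sqrt(1 − |λ|²) input normalization. A faithful-in-spirit forward pass (the ring initialization of ν and θ from the paper is omitted):

```python
import numpy as np

def lru_forward(u, nu_log, theta_log, B, C, D):
    """u: (T, H) real inputs; nu_log, theta_log: (N,); B: (N, H), C: (H, N) complex; D: (H,)."""
    lam = np.exp(-np.exp(nu_log) + 1j * np.exp(theta_log))   # |lam| < 1 by construction
    gamma = np.sqrt(1 - np.abs(lam) ** 2)                    # normalizes state variance
    x = np.zeros(lam.shape, dtype=complex)
    y = np.zeros_like(u)
    for t in range(len(u)):
        x = lam * x + gamma * (B @ u[t])
        y[t] = (C @ x).real + D * u[t]
    return y
```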
Simplified State Space Layers for Sequence Modeling
tag: S5
| ICLR23
| Stanford University
paper link: here
code link: here
citation:
@article{smith2022simplified,
title={Simplified state space layers for sequence modeling},
author={Smith, Jimmy TH and Warrington, Andrew and Linderman, Scott W},
journal={arXiv preprint arXiv:2208.04933},
year={2022}
}
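
S5 swaps S4's bank of single-input single-output SSMs for one multi-input multi-output diagonal SSM and computes the recurrence x_k = Ā x_{k−1} + B̄ u_k with a parallel associative scan. Everything hinges on the binary operator below being associative; this sketch applies it sequentially and checks that it reproduces the recurrence (jax.lax.associative_scan is what parallelizes it in the authors' code):

```python
import numpy as np

def scan_op(e_i, e_j):
    """Associative operator for linear recurrences: elements are (A, b) pairs
    representing x -> A*x + b, composed as 'apply e_i, then e_j'."""
    A_i, b_i = e_i
    A_j, b_j = e_j
    return (A_j * A_i, A_j * b_i + b_j)

rng = np.random.default_rng(1)
T, N = 8, 4
Abar = rng.uniform(0.5, 1.0, N)          # diagonal of the discretized state matrix
Bu = rng.normal(size=(T, N))             # precomputed Bbar @ u_k per step

# Sequential recurrence x_k = Abar * x_{k-1} + Bu_k ...
x, xs = np.zeros(N), []
for t in range(T):
    x = Abar * x + Bu[t]
    xs.append(x)

# ... equals a prefix composition under scan_op over elements (Abar, Bu_k).
acc, states = (np.ones(N), np.zeros(N)), []
for t in range(T):
    acc = scan_op(acc, (Abar, Bu[t]))
    states.append(acc[1])

assert np.allclose(np.array(xs), np.array(states))
```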
Long Range Language Modeling via Gated State Spaces
tag: GSS
| Google
paper link: here
citation:
@article{mehta2022long,
title={Long range language modeling via gated state spaces},
author={Mehta, Harsh and Gupta, Ankit and Cutkosky, Ashok and Neyshabur, Behnam},
journal={arXiv preprint arXiv:2206.13947},
year={2022}
}
On the Parameterization and Initialization of Diagonal State Space Models
tag: DSS
| S4D
| IBM Research
| Stanford University
paper link: here
citation:
@misc{gu2022parameterizationinitializationdiagonalstate,
title={On the Parameterization and Initialization of Diagonal State Space Models},
author={Albert Gu and Ankit Gupta and Karan Goel and Christopher Ré},
year={2022},
eprint={2206.11893},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2206.11893},
}
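
Once A is diagonal, the SSM convolution kernel collapses to a Vandermonde contraction, K_ℓ = Σ_n C̃_n · exp(Δ A_n)^ℓ, and the paper's main finding is which diagonal initializations (e.g. S4D-Lin, A_n = −1/2 + iπn) recover S4's performance. A compact kernel computation under that simplification (the discretized input matrix is absorbed into C̃ here; treat that as my shortcut):

```python
import numpy as np

def s4d_kernel(C, log_dt, A, L):
    """Length-L convolution kernel for a diagonal SSM.
    C, A: (N,) complex (C absorbs the discretized B); log_dt: scalar."""
    dtA = np.exp(log_dt) * A
    # Vandermonde matrix of powers exp(dt*A_n)^l for l = 0..L-1.
    V = np.exp(dtA[:, None] * np.arange(L)[None, :])
    return 2 * (C[:, None] * V).sum(axis=0).real   # 2*Re(.) pairs conjugate modes

N, L = 8, 16
A = -0.5 + 1j * np.pi * np.arange(N)     # S4D-Lin initialization
C = np.random.default_rng(2).normal(size=N) + 0j
print(s4d_kernel(C, np.log(0.05), A, L))
```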
Diagonal State Spaces are as Effective as Structured State Spaces
tag: DSS
| IBM Research
| Stanford University
paper link: here
citation:
@misc{gupta2022diagonalstatespaceseffective,
title={Diagonal State Spaces are as Effective as Structured State Spaces},
author={Ankit Gupta and Albert Gu and Jonathan Berant},
year={2022},
eprint={2203.14343},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2203.14343},
}
Efficiently Modeling Long Sequences with Structured State Spaces
tag: S4
| Stanford University
paper link: here
citation:
@article{gu2021efficiently,
title={Efficiently modeling long sequences with structured state spaces},
author={Gu, Albert and Goel, Karan and R{\'e}, Christopher},
journal={arXiv preprint arXiv:2111.00396},
year={2021}
}
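
S4 exploits the LTI duality: the same SSM is a recurrence x_t = Ā x_{t−1} + B̄ u_t for inference and a convolution y = K̄ ∗ u with kernel K̄_ℓ = C̄ Ā^ℓ B̄ for training; the paper's actual technical contribution is computing K̄ cheaply when A has DPLR structure. A toy NumPy check of the recurrence/convolution equivalence on a dense, already-discretized SSM:

```python
import numpy as np

rng = np.random.default_rng(3)
T, N = 16, 4
A = rng.normal(size=(N, N)) * 0.2        # toy discretized state matrix
B, C = rng.normal(size=N), rng.normal(size=N)
u = rng.normal(size=T)

# Recurrent form: x_t = A x_{t-1} + B u_t ; y_t = C x_t
x, y_rec = np.zeros(N), np.zeros(T)
for t in range(T):
    x = A @ x + B * u[t]
    y_rec[t] = C @ x

# Convolutional form: y = K * u with kernel K_l = C A^l B
K = np.array([C @ np.linalg.matrix_power(A, l) @ B for l in range(T)])
y_conv = np.convolve(u, K)[:T]

assert np.allclose(y_rec, y_conv)
```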