# State Space Models (SSM) for Sequence Modeling

Here are some resources about State Space Models (SSMs) for sequence modeling.
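
For orientation: the common object in the papers below is the discretized linear state space recurrence `x_t = A x_{t-1} + B u_t`, `y_t = C x_t`, applied along the sequence. A minimal NumPy sketch of that recurrence (the names, shapes, and random parameters are illustrative, not taken from any one paper):

```python
import numpy as np

def ssm_scan(A, B, C, u):
    """Sequential reference for a discretized linear SSM.

    x_t = A @ x_{t-1} + B * u_t,  y_t = C @ x_t
    A: (N, N), B: (N, 1), C: (1, N); u: (L,) scalar input sequence.
    """
    x = np.zeros((A.shape[0], 1))
    ys = []
    for u_t in u:
        x = A @ x + B * u_t        # state update
        ys.append((C @ x).item())  # linear readout
    return np.array(ys)

# Toy usage: a stable random SSM on a short sequence.
rng = np.random.default_rng(0)
A = 0.9 * np.eye(4) + 0.01 * rng.standard_normal((4, 4))
B, C = rng.standard_normal((4, 1)), rng.standard_normal((1, 4))
print(ssm_scan(A, B, C, rng.standard_normal(10)))
```

The papers below differ mainly in how `A` is structured and initialized (S4, DSS, S4D), whether the parameters depend on the input (Mamba), and how the recurrence is computed efficiently (convolution, scan, or attention-like dual forms).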

## Hymba: A Hybrid-head Architecture for Small Language Models

tag: Hymba | Mamba | SLM | Nvidia

paper link: here

blog link: here

code link: here

modelhub link: here

citation:

@misc{dong2024hymbahybridheadarchitecturesmall,
      title={Hymba: A Hybrid-head Architecture for Small Language Models}, 
      author={Xin Dong and Yonggan Fu and Shizhe Diao and Wonmin Byeon and Zijia Chen and Ameya Sunil Mahabaleshwarkar and Shih-Yang Liu and Matthijs Van Keirsbilck and Min-Hung Chen and Yoshi Suhara and Yingyan Lin and Jan Kautz and Pavlo Molchanov},
      year={2024},
      eprint={2411.13676},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2411.13676}, 
}

## Falcon Mamba: The First Competitive Attention-free 7B Language Model

tag: Falcon Mamba | SSLM | TII

paper link: here

blog link: here

modelhub link: here

citation:

@misc{zuo2024falconmambacompetitiveattentionfree,
      title={Falcon Mamba: The First Competitive Attention-free 7B Language Model}, 
      author={Jingwei Zuo and Maksim Velikanov and Dhia Eddine Rhaiem and Ilyas Chahed and Younes Belkada and Guillaume Kunsch and Hakim Hacid},
      year={2024},
      eprint={2410.05355},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2410.05355}, 
}

## Samba: Simple Hybrid State Space Models for Efficient Unlimited Context Language Modeling

tag: Samba | Microsoft

paper link: here

citation:

@misc{ren2024sambasimplehybridstate,
      title={Samba: Simple Hybrid State Space Models for Efficient Unlimited Context Language Modeling}, 
      author={Liliang Ren and Yang Liu and Yadong Lu and Yelong Shen and Chen Liang and Weizhu Chen},
      year={2024},
      eprint={2406.07522},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2406.07522}, 
}

## Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality

tag: Mamba2 | Princeton University | CMU

paper link: here

code link: here

citation:

@misc{dao2024transformersssmsgeneralizedmodels,
      title={Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality}, 
      author={Tri Dao and Albert Gu},
      year={2024},
      eprint={2405.21060},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2405.21060}, 
}

## MambaOut: Do We Really Need Mamba for Vision?

tag: MambaOut | Vision Mamba | NUS

paper link: here

code link: here

citation:

@misc{yu2024mambaoutreallyneedmamba,
      title={MambaOut: Do We Really Need Mamba for Vision?}, 
      author={Weihao Yu and Xinchao Wang},
      year={2024},
      eprint={2405.07992},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2405.07992}, 
}

## The Illusion of State in State-Space Models

tag: SSM Illusion | ICML24 | Allen AI | NYU

paper link: here

code link: here

homepage link: here

citation:

@inproceedings{merrill-2024-illusion,
  title={The Illusion of State in State-Space Models},
  author={Merrill, William and Petty, Jackson and Sabharwal, Ashish},
  booktitle={Forty-first International Conference on Machine Learning},
  year={2024}
}

## Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model

tag: ViM | Vision Mamba | ICML24 | HUST

paper link: here

code link: here

citation:

@misc{zhu2024visionmambaefficientvisual,
      title={Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model}, 
      author={Lianghui Zhu and Bencheng Liao and Qian Zhang and Xinlong Wang and Wenyu Liu and Xinggang Wang},
      year={2024},
      eprint={2401.09417},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2401.09417}, 
}

## Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence

tag: RWKV5 | RWKV6 | EleutherAI

paper link: here

code link: here

citation:

@misc{peng2024eaglefinchrwkvmatrixvalued,
      title={Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence}, 
      author={Bo Peng and Daniel Goldstein and Quentin Anthony and Alon Albalak and Eric Alcaide and Stella Biderman and Eugene Cheah and Xingjian Du and Teddy Ferdinan and Haowen Hou and Przemysław Kazienko and Kranthi Kiran GV and Jan Kocoń and Bartłomiej Koptyra and Satyapriya Krishna and Ronald McClelland Jr. and Jiaju Lin and Niklas Muennighoff and Fares Obeid and Atsushi Saito and Guangyu Song and Haoqin Tu and Cahya Wirawan and Stanisław Woźniak and Ruichong Zhang and Bingchen Zhao and Qihang Zhao and Peng Zhou and Jian Zhu and Rui-Jie Zhu},
      year={2024},
      eprint={2404.05892},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05892}, 
}

## Mamba: Linear-Time Sequence Modeling with Selective State Spaces

tag: Mamba | Princeton University | CMU

paper link: here

code link: here

follow-up work: here

citation:

@article{gu2023mamba,
  title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces},
  author={Gu, Albert and Dao, Tri},
  journal={arXiv preprint arXiv:2312.00752},
  year={2023}
}
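
Mamba's central change over S4-style models is making the SSM parameters functions of the input (a "selective" SSM), so the model can gate what enters the state; this trades the convolutional form for a scan. A toy per-channel sketch of that idea with a diagonal `A` (the projections and discretization below are simplified stand-ins, not the paper's fused scan kernel):

```python
import numpy as np

def selective_ssm(u, A_log, W_delta, W_B, W_C):
    """Toy selective SSM: delta, B, C all depend on the input u_t.

    u: (L, D) inputs; A_log: (D, N) parameterizes a negative diagonal A;
    W_delta: (D,), W_B / W_C: (D, N) are stand-in projection weights.
    """
    L, D = u.shape
    x = np.zeros((D, A_log.shape[1]))
    ys = np.zeros((L, D))
    for t in range(L):
        delta = np.log1p(np.exp(u[t] * W_delta))[:, None]  # softplus step size
        A_bar = np.exp(delta * -np.exp(A_log))             # discretized diagonal A
        B_t = u[t][:, None] * W_B                          # input-dependent B
        C_t = u[t][:, None] * W_C                          # input-dependent C
        x = A_bar * x + delta * B_t * u[t][:, None]        # selective state update
        ys[t] = (C_t * x).sum(-1)                          # per-channel readout
    return ys
```

Because `delta`, `B_t`, and `C_t` vary with the input, the system is no longer time-invariant, so S4's FFT-convolution trick does not apply; the paper instead computes this recurrence with a hardware-aware parallel scan.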

## Retentive Network: A Successor to Transformer for Large Language Models

tag: RetNet | Microsoft | Tsinghua University

paper link: here

code link: readme | impl

citation:

@article{sun2023retentive,
  title={Retentive network: A successor to transformer for large language models},
  author={Sun, Yutao and Dong, Li and Huang, Shaohan and Ma, Shuming and Xia, Yuqing and Xue, Jilong and Wang, Jianyong and Wei, Furu},
  journal={arXiv preprint arXiv:2307.08621},
  year={2023}
}
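
Retention is constructed to have both a parallel, attention-like form (queries and keys under an exponential decay mask) and an equivalent recurrent form, and the recurrent form is what gives O(1) per-token decoding. A minimal single-head sketch of the recurrent form (dimensions and the decay value are illustrative):

```python
import numpy as np

def recurrent_retention(q, k, v, gamma=0.9):
    """Single-head recurrent retention.

    S_t = gamma * S_{t-1} + k_t^T v_t,   o_t = q_t @ S_t
    q, k: (L, d_k); v: (L, d_v); returns outputs of shape (L, d_v).
    """
    S = np.zeros((q.shape[1], v.shape[1]))
    out = np.zeros_like(v)
    for t in range(len(q)):
        S = gamma * S + np.outer(k[t], v[t])  # decayed state accumulation
        out[t] = q[t] @ S                     # constant-cost readout
    return out
```

The paper's multi-scale retention uses a different `gamma` per head plus gating and normalization, omitted here for brevity.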

## RWKV: Reinventing RNNs for the Transformer Era

tag: RWKV | RWKV4 | EleutherAI

paper link: here

code link: here

follow-up work: here

citation:

@misc{peng2023rwkvreinventingrnnstransformer,
      title={RWKV: Reinventing RNNs for the Transformer Era}, 
      author={Bo Peng and Eric Alcaide and Quentin Anthony and Alon Albalak and Samuel Arcadinho and Stella Biderman and Huanqi Cao and Xin Cheng and Michael Chung and Matteo Grella and Kranthi Kiran GV and Xuzheng He and Haowen Hou and Jiaju Lin and Przemyslaw Kazienko and Jan Kocon and Jiaming Kong and Bartlomiej Koptyra and Hayden Lau and Krishna Sri Ipsit Mantri and Ferdinand Mom and Atsushi Saito and Guangyu Song and Xiangru Tang and Bolun Wang and Johan S. Wind and Stanislaw Wozniak and Ruichong Zhang and Zhenyuan Zhang and Qihang Zhao and Peng Zhou and Qinghua Zhou and Jian Zhu and Rui-Jie Zhu},
      year={2023},
      eprint={2305.13048},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2305.13048}, 
}

## Resurrecting Recurrent Neural Networks for Long Sequences

tag: LRU | Google DeepMind

paper link: here

code link: here

citation:

@article{orvieto2023resurrecting,
  title={Resurrecting recurrent neural networks for long sequences},
  author={Orvieto, Antonio and Smith, Samuel L and Gu, Albert and Fernando, Anushan and Gulcehre, Caglar and Pascanu, Razvan and De, Soham},
  journal={arXiv preprint arXiv:2303.06349},
  year={2023}
}

## Simplified State Space Layers for Sequence Modeling

tag: S5 | ICLR23 | Stanford University

paper link: here

code link: here

citation:

@article{smith2022simplified,
  title={Simplified state space layers for sequence modeling},
  author={Smith, Jimmy TH and Warrington, Andrew and Linderman, Scott W},
  journal={arXiv preprint arXiv:2208.04933},
  year={2022}
}

## Long Range Language Modeling via Gated State Spaces

tag: GSS | Google

paper link: here

citation:

@article{mehta2022long,
  title={Long range language modeling via gated state spaces},
  author={Mehta, Harsh and Gupta, Ankit and Cutkosky, Ashok and Neyshabur, Behnam},
  journal={arXiv preprint arXiv:2206.13947},
  year={2022}
}

## On the Parameterization and Initialization of Diagonal State Space Models

tag: DSS | S4D | IBM Research | Stanford University

paper link: here

citation:

@misc{gu2022parameterizationinitializationdiagonalstate,
      title={On the Parameterization and Initialization of Diagonal State Space Models}, 
      author={Albert Gu and Ankit Gupta and Karan Goel and Christopher Ré},
      year={2022},
      eprint={2206.11893},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2206.11893}, 
}

## Diagonal State Spaces are as Effective as Structured State Spaces

tag: DSS | IBM Research | Stanford University

paper link: here

citation:

@misc{gupta2022diagonalstatespaceseffective,
      title={Diagonal State Spaces are as Effective as Structured State Spaces}, 
      author={Ankit Gupta and Albert Gu and Jonathan Berant},
      year={2022},
      eprint={2203.14343},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2203.14343}, 
}

## Efficiently Modeling Long Sequences with Structured State Spaces

tag: S4 | Stanford University

paper link: here

citation:

@article{gu2021efficiently,
  title={Efficiently modeling long sequences with structured state spaces},
  author={Gu, Albert and Goel, Karan and R{\'e}, Christopher},
  journal={arXiv preprint arXiv:2111.00396},
  year={2021}
}
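
When `(A, B, C)` are fixed across time (unlike the selective models above), unrolling the recurrence shows the SSM output is a causal 1-D convolution with kernel `K = (CB, CAB, CA^2B, ...)`; S4's contribution is a structured, HiPPO-initialized `A` for which this kernel can be computed efficiently even at long sequence lengths. A naive reference version of the equivalence (materializing `K` through explicit matrix powers, exactly what S4 is designed to avoid):

```python
import numpy as np

def ssm_conv_kernel(A, B, C, L):
    """Naively materialize K_i = C @ A^i @ B for i < L."""
    K, x = [], B.copy()
    for _ in range(L):
        K.append((C @ x).item())
        x = A @ x
    return np.array(K)

def ssm_as_conv(A, B, C, u):
    """y_t = sum_i K_i * u_{t-i}: the causal convolution equals the recurrence."""
    K = ssm_conv_kernel(A, B, C, len(u))
    return np.convolve(u, K)[: len(u)]
```

On the same `(A, B, C, u)`, this matches the step-by-step scan sketched at the top of this page; the convolutional view is what lets S4 train in parallel with FFTs rather than stepping through the sequence.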