Here are some resources about Linear Attention modules in language modeling
Linear Attention Sequence Parallelism
tag: LASP
| OpenNLPLab
paper link: here
code link: here
citation:
@misc{sun2024linearattentionsequenceparallelism,
title={Linear Attention Sequence Parallelism},
author={Weigao Sun and Zhen Qin and Dong Li and Xuyang Shen and Yu Qiao and Yiran Zhong},
year={2024},
eprint={2404.02882},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2404.02882},
}
The Hedgehog & the Porcupine: Expressive Linear Attentions with Softmax Mimicry
tag: Hedgehog
| Softmax Mimicry
| ICLR24
| Stanford University
paper link: here
citation:
@misc{zhang2024hedgehogporcupineexpressive,
title={The Hedgehog \& the Porcupine: Expressive Linear Attentions with Softmax Mimicry},
author={Michael Zhang and Kush Bhatia and Hermann Kumbong and Christopher Ré},
year={2024},
eprint={2402.04347},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2402.04347},
}
Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models
tag: Lightning Attention 2
| OpenNLPLab
paper link: here
code link: here
follow-up work: here
citation:
@misc{qin2024lightning,
title={Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models},
author={Zhen Qin and Weigao Sun and Dong Li and Xuyang Shen and Weixuan Sun and Yiran Zhong},
year={2024},
eprint={2401.04658},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
TransNormerLLM: A Faster and Better Large Language Model with Improved TransNormer
tag: TransNormerLLM
| Lightning Attention
| OpenNLPLab
paper link: here
code link: here
follow-up work: here
citation:
@misc{qin2024transnormerllm,
title={TransNormerLLM: A Faster and Better Large Language Model with Improved TransNormer},
author={Zhen Qin and Dong Li and Weigao Sun and Weixuan Sun and Xuyang Shen and Xiaodong Han and Yunshen Wei and Baohong Lv and Xiao Luo and Yu Qiao and Yiran Zhong},
year={2024},
eprint={2307.14995},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
Primal-Attention: Self-attention through Asymmetric Kernel SVD in Primal Representation
tag: Primal Attention
| NIPS23
paper link: here
code link: here
citation:
@misc{chen2023primalattention,
title={Primal-Attention: Self-attention through Asymmetric Kernel SVD in Primal Representation},
author={Yingyi Chen and Qinghua Tao and Francesco Tonin and Johan A. K. Suykens},
year={2023},
eprint={2305.19798},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
FourierFormer: Transformer Meets Generalized Fourier Integral Theorem
tag: Fourierformer
| NIPS22
| UCLA
paper link: here
code link: here
citation:
@article{nguyen2022fourierformer,
title={Fourierformer: Transformer meets generalized fourier integral theorem},
author={Nguyen, Tan and Pham, Minh and Nguyen, Tam and Nguyen, Khai and Osher, Stanley and Ho, Nhat},
journal={Advances in Neural Information Processing Systems},
volume={35},
pages={29319--29335},
year={2022}
}
Scatterbrain: Unifying Sparse and Low-rank Attention Approximation
tag: Scatterbrain
| NIPS21
| Adobe
| Stanford University
overview: approximates softmax attention by combining a sparse component (the few high-similarity entries) with a low-rank component (the smooth remainder), so that each part corrects the other's approximation error
paper link: here
code link: here
citation:
@article{chen2021scatterbrain,
title={Scatterbrain: Unifying sparse and low-rank attention approximation},
author={Chen, Beidi and Dao, Tri and Winsor, Eric and Song, Zhao and Rudra, Atri and R{\'e}, Christopher},
journal={arXiv preprint arXiv:2110.15343},
year={2021}
}
Luna: Linear Unified Nested Attention
tag: Luna
| NIPS21
| Meta
| CMU
overview: decomposes full attention into two nested linear-complexity attention steps: the input is first packed into a fixed-length auxiliary sequence, and the original sequence then attends back to this packed representation
paper link: here
code link: here
citation:
@article{ma2021luna,
title={Luna: Linear unified nested attention},
author={Ma, Xuezhe and Kong, Xiang and Wang, Sinong and Zhou, Chunting and May, Jonathan and Ma, Hao and Zettlemoyer, Luke},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={2441--2453},
year={2021}
}
Random Feature Attention
tag: RFA
| ICLR21
| Google DeepMind
paper link: here
code link: here
citation:
@article{peng2021random,
title={Random feature attention},
author={Peng, Hao and Pappas, Nikolaos and Yogatama, Dani and Schwartz, Roy and Smith, Noah A and Kong, Lingpeng},
journal={arXiv preprint arXiv:2103.02143},
year={2021}
}
Rethinking Attention with Performers
tag: Performer
| ICLR21
| Google DeepMind
overview: approximates the softmax attention kernel with positive orthogonal random features (FAVOR+), yielding an unbiased estimate of the attention matrix with linear time and memory
paper link: here
code link: here
citation:
@article{choromanski2020rethinking,
title={Rethinking attention with performers},
author={Choromanski, Krzysztof and Likhosherstov, Valerii and Dohan, David and Song, Xingyou and Gane, Andreea and Sarlos, Tamas and Hawkins, Peter and Davis, Jared and Mohiuddin, Afroz and Kaiser, Lukasz and others},
journal={arXiv preprint arXiv:2009.14794},
year={2020}
}
Linformer: Self-Attention with Linear Complexity
tag: Linformer
| Meta
overview: projects the keys and values down to a fixed low dimension with learned linear maps, exploiting the empirically low-rank structure of the attention matrix to reduce self-attention to linear complexity
paper link: here
code link: here
citation:
@article{wang2020linformer,
title={Linformer: Self-attention with linear complexity},
author={Wang, Sinong and Li, Belinda Z and Khabsa, Madian and Fang, Han and Ma, Hao},
journal={arXiv preprint arXiv:2006.04768},
year={2020}
}
Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention
tag: Linear Transformer
| ICML20
| EPFL
overview:
$$ \begin{align} \mathcal{K}_{Li}(\mathbf q,\mathbf k) := \varphi_{Li}(\mathbf q)\, \varphi_{Li}(\mathbf k)^\mathrm{T}, \quad \text{where}\quad \varphi_{Li}(\mathbf x) = \mathrm{elu}(\mathbf x) + 1 \end{align} $$
paper link: here
code link: here
citation:
@inproceedings{katharopoulos2020transformers,
title={Transformers are rnns: Fast autoregressive transformers with linear attention},
author={Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, Fran{\c{c}}ois},
booktitle={International conference on machine learning},
pages={5156--5165},
year={2020},
organization={PMLR}
}
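As a reading aid for the feature-map formula in the Linear Transformer overview above, here is a minimal, non-causal PyTorch sketch of linear attention with the elu(x) + 1 feature map. The tensor layout, function names, and the small epsilon added to the normalizer are assumptions of this sketch, not the authors' reference implementation (linked above).
```python
import torch
import torch.nn.functional as F

def elu_feature_map(x):
    # phi_Li(x) = elu(x) + 1 from the overview formula (keeps features positive)
    return F.elu(x) + 1

def linear_attention(q, k, v, eps=1e-6):
    # q, k, v: (batch, seq_len, dim); eps is an illustrative numerical stabilizer
    q, k = elu_feature_map(q), elu_feature_map(k)
    # Compute phi(Q) (phi(K)^T V) right-to-left so the seq_len x seq_len attention
    # matrix is never materialized: cost is O(N d^2) instead of O(N^2 d).
    kv = torch.einsum("bnd,bne->bde", k, v)                        # sum_n phi(k_n) v_n^T
    z = 1.0 / (torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + eps)  # per-position normalizer
    return torch.einsum("bnd,bde,bn->bne", q, kv, z)

# quick shape check
q = k = v = torch.randn(2, 128, 64)
print(linear_attention(q, k, v).shape)  # torch.Size([2, 128, 64])
```
For causal decoding, the paper's RNN view keeps the same quantities but accumulates `kv` and the normalizer as running sums over positions instead of summing over the whole sequence at once.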