Here are some resources about Miscellaneous Architectures for language/sequence modeling
tag: BLT
| Meta
| University of Washington
paper link: here
code link: here
citation:
@misc{pagnoni2024bytelatenttransformerpatches,
title={Byte Latent Transformer: Patches Scale Better Than Tokens},
author={Artidoro Pagnoni and Ram Pasunuru and Pedro Rodriguez and John Nguyen and Benjamin Muller and Margaret Li and Chunting Zhou and Lili Yu and Jason Weston and Luke Zettlemoyer and Gargi Ghosh and Mike Lewis and Ari Holtzman and Srinivasan Iyer},
year={2024},
eprint={2412.09871},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.09871},
}
tag: LCMs
| Meta
paper link: here
code link: here
citation:
@misc{lcmteam2024largeconceptmodelslanguage,
title={Large Concept Models: Language Modeling in a Sentence Representation Space},
author={LCM team and Loïc Barrault and Paul-Ambroise Duquenne and Maha Elbayad and Artyom Kozhevnikov and Belen Alastruey and Pierre Andrews and Mariano Coria and Guillaume Couairon and Marta R. Costa-jussà and David Dale and Hady Elsahar and Kevin Heffernan and João Maria Janeiro and Tuan Tran and Christophe Ropers and Eduardo Sánchez and Robin San Roman and Alexandre Mourachko and Safiyyah Saleem and Holger Schwenk},
year={2024},
eprint={2412.08821},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.08821},
}
tag: FlashRNN
| NXAI Lab
| JKU
paper link: here
code link: here
citation:
@misc{pöppel2024flashrnnoptimizingtraditionalrnns,
title={FlashRNN: Optimizing Traditional RNNs on Modern Hardware},
author={Korbinian Pöppel and Maximilian Beck and Sepp Hochreiter},
year={2024},
eprint={2412.07752},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2412.07752},
}
tag: TokenFormer
| Pattention
| Google
| Peking University
paper link: here
code link: here
citation:
@misc{wang2024tokenformerrethinkingtransformerscaling,
title={TokenFormer: Rethinking Transformer Scaling with Tokenized Model Parameters},
author={Haiyang Wang and Yue Fan and Muhammad Ferjad Naeem and Yongqin Xian and Jan Eric Lenssen and Liwei Wang and Federico Tombari and Bernt Schiele},
year={2024},
eprint={2410.23168},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.23168},
}
tag: NAMMs
| EMA
| BAM
| Sakana AI
paper link: here
code link: here
citation:
@misc{cetin2024evolveduniversaltransformermemory,
title={An Evolved Universal Transformer Memory},
author={Edoardo Cetin and Qi Sun and Tianyu Zhao and Yujin Tang},
year={2024},
eprint={2410.13166},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.13166},
}
tag: LoLCATs
| Attention Transfer
| LoRA
| Together AI
| Stanford University
| MIT
paper link: here
code link: here
citation:
@misc{zhang2024lolcatslowranklinearizinglarge,
title={LoLCATs: On Low-Rank Linearizing of Large Language Models},
author={Michael Zhang and Simran Arora and Rahul Chalamala and Alan Wu and Benjamin Spector and Aaryan Singhal and Krithik Ramesh and Christopher Ré},
year={2024},
eprint={2410.10254},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.10254},
}
tag: FAN
| ByteDance
| Peking University
paper link: here
code link: here
citation:
@misc{dong2024fanfourieranalysisnetworks,
title={FAN: Fourier Analysis Networks},
author={Yihong Dong and Ge Li and Yongding Tao and Xue Jiang and Kechi Zhang and Jia Li and Jing Su and Jun Zhang and Jingjing Xu},
year={2024},
eprint={2410.02675},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.02675},
}
tag: XNet
| KAN
paper link: here
citation:
@misc{li2024modelcomparisonsxnetoutperforms,
title={Model Comparisons: XNet Outperforms KAN},
author={Xin Li and Zhihong Jeff Xia and Xiaotao Zheng},
year={2024},
eprint={2410.02033},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.02033},
}
tag: L-Mul
| BitEnergy AI
paper link: here
citation:
@misc{luo2024additionneedenergyefficientlanguage,
title={Addition is All You Need for Energy-efficient Language Models},
author={Hongyin Luo and Wei Sun},
year={2024},
eprint={2410.00907},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.00907},
}
tag: XNet
| Cauchy Activation Function
| Cauchy Integral Theorem
paper link: here
follow-up work: here
citation:
@misc{li2024cauchyactivationfunctionxnet,
title={Cauchy activation function and XNet},
author={Xin Li and Zhihong Xia and Hongkun Zhang},
year={2024},
eprint={2409.19221},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2409.19221},
}
tag: Matmul-free LM
| UCSC
paper link: here
code link: here
citation:
@misc{zhu2024scalablematmulfreelanguagemodeling,
title={Scalable MatMul-free Language Modeling},
author={Rui-Jie Zhu and Yu Zhang and Ethan Sifferman and Tyler Sheaves and Yiqiao Wang and Dustin Richmond and Peng Zhou and Jason K. Eshraghian},
year={2024},
eprint={2406.02528},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.02528},
}
tag: YOCO
| Decoder-Decoder
| NIPS24
| Microsoft
| Tsinghua University
paper link: here
code link: here
citation:
@misc{sun2024cacheoncedecoderdecoderarchitectures,
title={You Only Cache Once: Decoder-Decoder Architectures for Language Models},
author={Yutao Sun and Li Dong and Yi Zhu and Shaohan Huang and Wenhui Wang and Shuming Ma and Quanlu Zhang and Jianyong Wang and Furu Wei},
year={2024},
eprint={2405.05254},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.05254},
}
tag: xLSTM
| NIPS24
| LIT AI Lab
| NXAI Lab
| JKU
paper link: here
code link: here
citation:
@misc{beck2024xlstmextendedlongshortterm,
title={xLSTM: Extended Long Short-Term Memory},
author={Maximilian Beck and Korbinian Pöppel and Markus Spanring and Andreas Auer and Oleksandra Prudnikova and Michael Kopp and Günter Klambauer and Johannes Brandstetter and Sepp Hochreiter},
year={2024},
eprint={2405.04517},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.04517},
}
tag: M2
| Monarch Mixer
| NIPS23
| Stanford University
paper link: here
code link: here
citation:
@article{fu2023monarch,
title={Monarch Mixer: A simple sub-quadratic GEMM-based architecture},
author={Fu, Daniel Y and Arora, Simran and Grogan, Jessica and Johnson, Isys and Eyuboglu, Sabri and Thomas, Armin W and Spector, Benjamin and Poli, Michael and Rudra, Atri and R{\'e}, Christopher},
journal={arXiv preprint arXiv:2310.12109},
year={2023}
}
tag: AFT
| Apple
paper link: here
citation:
@misc{zhai2021attentionfreetransformer,
title={An Attention Free Transformer},
author={Shuangfei Zhai and Walter Talbott and Nitish Srivastava and Chen Huang and Hanlin Goh and Ruixiang Zhang and Josh Susskind},
year={2021},
eprint={2105.14103},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2105.14103},
}
tag: RevNet
| NIPS17
| University of Toronto
paper link: here
code link: here
citation:
@article{gomez2017reversible,
title={The reversible residual network: Backpropagation without storing activations},
author={Gomez, Aidan N and Ren, Mengye and Urtasun, Raquel and Grosse, Roger B},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}