Here are some resources about Quantization Strategies for LLM Inference
tag: BitNet a4.8
| Microsoft
paper link: here
code link: here
homepage link: here
citation:
@misc{wang2024bitneta484bitactivations,
title={BitNet a4.8: 4-bit Activations for 1-bit LLMs},
author={Hongyu Wang and Shuming Ma and Furu Wei},
year={2024},
eprint={2411.04965},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.04965},
}
tag: BitNet b1.58
| Microsoft
paper link: here
code link: here
homepage link: here
follow-up work: here
citation:
@misc{ma2024era1bitllmslarge,
title={The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits},
author={Shuming Ma and Hongyu Wang and Lingxiao Ma and Lei Wang and Wenhui Wang and Shaohan Huang and Li Dong and Ruiping Wang and Jilong Xue and Furu Wei},
year={2024},
eprint={2402.17764},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2402.17764},
}
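example (a minimal, hedged sketch of the absmean ternary weight quantization described in the b1.58 paper; the function name and shapes are illustrative, not taken from the released code):
import torch

def weight_quant_ternary(w: torch.Tensor, eps: float = 1e-5):
    # Scale by the mean absolute value, then round and clip to {-1, 0, +1}.
    scale = w.abs().mean().clamp(min=eps)
    w_q = (w / scale).round().clamp(-1, 1)
    return w_q, scale  # dequantize as w_q * scale

w = torch.randn(4096, 4096)
w_q, scale = weight_quant_ternary(w)
w_deq = w_q * scale  # stands in for w during the forward pass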
tag: HQQ+
| Mobius Labs
blog link: here
code link: here
citation:
@misc{badri2023hqq+,
title = {Towards 1-bit Machine Learning Models},
url = {https://mobiusml.github.io/1bit_blog/},
author = {Hicham Badri and Appu Shaji},
month = {March},
year = {2024}
}
tag: BitNet
| BitLinear
| W1A8
| Microsoft
paper link: here
code link: here
homepage link: here
follow-up work: here
citation:
@misc{wang2023bitnet,
title={BitNet: Scaling 1-bit Transformers for Large Language Models},
author={Hongyu Wang and Shuming Ma and Li Dong and Shaohan Huang and Huaijie Wang and Lingxiao Ma and Fan Yang and Ruiping Wang and Yi Wu and Furu Wei},
year={2023},
eprint={2310.11453},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
tag: LoftQ
| ICLR24
| Microsoft
paper link: here
github link: here
modelhub link: here
citation:
@article{li2023loftq,
title={LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models},
author={Li, Yixiao and Yu, Yifan and Liang, Chen and He, Pengcheng and Karampatziakis, Nikos and Chen, Weizhu and Zhao, Tuo},
journal={arXiv preprint arXiv:2310.08659},
year={2023}
}
tag: QFT
| UCAS
| UC Berkeley
paper link: here
citation:
@misc{li2023qft,
title={QFT: Quantized Full-parameter Tuning of LLMs with Affordable Resources},
author={Zhikai Li and Xiaoxuan Liu and Banghua Zhu and Zhen Dong and Qingyi Gu and Kurt Keutzer},
year={2023},
eprint={2310.07147},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
tag: QA-LoRA
| ICLR24
| Huawei
paper link: here
code link: here
citation:
@article{xu2023qa,
title={QA-LoRA: Quantization-Aware Low-Rank Adaptation of Large Language Models},
author={Xu, Yuhui and Xie, Lingxi and Gu, Xiaotao and Chen, Xin and Chang, Heng and Zhang, Hengheng and Chen, Zhensu and Zhang, Xiaopeng and Tian, Qi},
journal={arXiv preprint arXiv:2309.14717},
year={2023}
}
tag: SqueezeLLM
| ICML24
| UC Berkeley
paper link: here
code link: here
citation:
@article{kim2023squeezellm,
title={SqueezeLLM: Dense-and-Sparse Quantization},
author={Kim, Sehoon and Hooper, Coleman and Gholami, Amir and Dong, Zhen and Li, Xiuyu and Shen, Sheng and Mahoney, Michael W and Keutzer, Kurt},
journal={arXiv preprint arXiv:2306.07629},
year={2023}
}
tag: SpQR
| ICLR24
| University of Washington
paper link: here
code link: here
citation:
@article{dettmers2023spqr,
title={SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression},
author={Dettmers, Tim and Svirschevski, Ruslan and Egiazarian, Vage and Kuznedelev, Denis and Frantar, Elias and Ashkboos, Saleh and Borzunov, Alexander and Hoefler, Torsten and Alistarh, Dan},
journal={arXiv preprint arXiv:2306.03078},
year={2023}
}
tag: PEQA
| NIPS23
| NAVER Cloud
paper link: here
citation:
@article{kim2024memory,
title={Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization},
author={Kim, Jeonghoon and Lee, Jung Hyun and Kim, Sungdong and Park, Joonsuk and Yoo, Kang Min and Kwon, Se Jung and Lee, Dongsoo},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}
tag: Q8BERT
| NIPS19
| Intel AI Lab
paper link: here
code link: here
citation:
@inproceedings{zafrir2019q8bert,
author = "Zafrir, Ofir and Boudoukh, Guy and Izsak, Peter and Wasserblat, Moshe",
title = "Q8bert: Quantized 8bit bert",
booktitle = "2019 Fifth Workshop on Energy Efficient Machine Learning and Cognitive Computing-NeurIPS Edition (EMC2-NIPS)",
pages = "36--39",
year = "2019",
organization = "IEEE"
}
tag: DuQuant
| NIPS24
| UCAS
| Tsinghua University
paper link: here
code link: here
citation:
@misc{lin2024duquantdistributingoutliersdual,
title={DuQuant: Distributing Outliers via Dual Transformation Makes Stronger Quantized LLMs},
author={Haokun Lin and Haobo Xu and Yichen Wu and Jingzhi Cui and Yingtao Zhang and Linzhan Mou and Linqi Song and Zhenan Sun and Ying Wei},
year={2024},
eprint={2406.01721},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.01721},
}
tag: eXmY
| Google
paper link: here
citation:
@misc{agrawal2024exmydatatypetechnique,
title={eXmY: A Data Type and Technique for Arbitrary Bit Precision Quantization},
author={Aditya Agrawal and Matthew Hedlund and Blake Hechtman},
year={2024},
eprint={2405.13938},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.13938},
}
tag: HQQ
| Mobius Labs
blog link: here
code link: here
follow-up link: here
citation:
@misc{badri2023hqq,
title = {Half-Quadratic Quantization of Large Machine Learning Models},
url = {https://mobiusml.github.io/hqq_blog/},
author = {Hicham Badri and Appu Shaji},
month = {November},
year = {2023}
}
tag: ZeroQuant-FP
| W4A8
| NIPS23
| DeepSpeed
| Microsoft
paper link: here
code link: here
citation:
@article{wu2023zeroquant,
title={ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats},
author={Wu, Xiaoxia and Yao, Zhewei and He, Yuxiong},
journal={arXiv preprint arXiv:2307.09782},
year={2023}
}
tag: AWQ
| MLSys24
| Nvidia
| MIT
| Tsinghua University
paper link: here
code link: here
citation:
@misc{lin2023awq,
title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration},
author={Ji Lin and Jiaming Tang and Haotian Tang and Shang Yang and Xingyu Dang and Chuang Gan and Song Han},
year={2023},
eprint={2306.00978},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
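example (a rough sketch of the activation-aware scaling idea behind AWQ: salient input channels, identified by activation magnitude, are scaled up before weight-only quantization; the per-layer grid search over alpha and the folding of scales into the preceding operator are omitted, and the helper names are assumptions):
import torch

def quantize_weight_rtn(w, n_bits=4):
    # Round-to-nearest, symmetric, per-output-channel weight quantization.
    qmax = 2 ** (n_bits - 1) - 1
    scale = w.abs().amax(dim=1, keepdim=True) / qmax
    return (w / scale).round().clamp(-qmax - 1, qmax) * scale

def awq_like_scale(w, act_absmean, alpha=0.5):
    # Scale salient input channels (large average activation) up before
    # quantization; the inverse scale is absorbed by the layer input.
    s = act_absmean.pow(alpha).clamp(min=1e-4)
    w_q = quantize_weight_rtn(w * s.unsqueeze(0))
    return w_q / s.unsqueeze(0)  # equivalent to dividing the input by s

w = torch.randn(4096, 11008)            # [out_features, in_features]
act_absmean = torch.rand(11008) + 0.1   # per-input-channel |activation| mean
w_fake_quant = awq_like_scale(w, act_absmean)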
tag: ZeroQuant V2
| DeepSpeed
| Microsoft
paper link: here
code link: here
follow-up work: here
citation:
@misc{yao2023zeroquantv2exploringposttrainingquantization,
title={ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation},
author={Zhewei Yao and Xiaoxia Wu and Cheng Li and Stephen Youn and Yuxiong He},
year={2023},
eprint={2303.08302},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2303.08302},
}
tag: SmoothQuant
| ICML23
| Nvidia
paper link: here
code link: here
citation:
@inproceedings{xiao2023smoothquant,
title={SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models},
author={Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
booktitle={International Conference on Machine Learning},
pages={38087--38099},
year={2023},
organization={PMLR}
}
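example (a minimal sketch of the SmoothQuant smoothing factor s_j = max|X_j|^alpha / max|W_j|^(1-alpha); calibration statistics collection and the fusion of 1/s into the preceding LayerNorm are omitted):
import torch

def smooth_scales(act_absmax, w_absmax, alpha=0.5, eps=1e-5):
    # act_absmax, w_absmax: per-input-channel max magnitudes from calibration.
    s = act_absmax.clamp(min=eps).pow(alpha) / w_absmax.clamp(min=eps).pow(1 - alpha)
    return s.clamp(min=eps)

act_absmax = torch.rand(4096) * 20      # activations with outlier channels
w_absmax = torch.rand(4096) + 0.1
s = smooth_scales(act_absmax, w_absmax, alpha=0.5)

x = torch.randn(8, 4096)
w = torch.randn(11008, 4096)
y_ref = x @ w.T
y_smoothed = (x / s) @ (w * s).T        # same product, flatter activation outliers
assert torch.allclose(y_ref, y_smoothed, atol=1e-2, rtol=1e-3)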
tag: GPTQ
| ICLR23
| ISTA
paper link: here
code link: here
citation:
@misc{frantar2023gptq,
title={GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers},
author={Elias Frantar and Saleh Ashkboos and Torsten Hoefler and Dan Alistarh},
year={2023},
eprint={2210.17323},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
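example (a heavily simplified, unoptimized sketch of the column-by-column quantize-and-compensate loop at the core of GPTQ; the blocked updates, Cholesky factorization, and grouping of the real implementation are not reproduced, and the function names are illustrative):
import torch

def quantize_rtn(col, n_bits=4):
    qmax = 2 ** (n_bits - 1) - 1
    scale = col.abs().max() / qmax
    return (col / scale).round().clamp(-qmax - 1, qmax) * scale

def gptq_like(W, X, n_bits=4, damp=0.01):
    # W: [out_features, in_features], X: [in_features, n_samples] calibration activations.
    W = W.clone()
    H = 2 * X @ X.T                               # proxy Hessian of the layer-wise loss
    H += damp * H.diag().mean() * torch.eye(H.shape[0])
    Hinv = torch.linalg.inv(H)
    Q = torch.zeros_like(W)
    for i in range(W.shape[1]):                   # greedy, one input channel at a time
        q = quantize_rtn(W[:, i], n_bits)
        Q[:, i] = q
        err = (W[:, i] - q) / Hinv[i, i]
        # push the quantization error onto the not-yet-quantized columns
        W[:, i + 1:] -= err.unsqueeze(1) * Hinv[i, i + 1:].unsqueeze(0)
    return Q

W = torch.randn(256, 128)
X = torch.randn(128, 512)
Q = gptq_like(W, X)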
tag: BitsAndBytes
| Int8 Quantization
| NIPS22
| Meta
paper link: here
blog link: here
code link: here
citation:
@misc{dettmers2022llmint8,
title={LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale},
author={Tim Dettmers and Mike Lewis and Younes Belkada and Luke Zettlemoyer},
year={2022},
eprint={2208.07339},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
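example (a conceptual sketch of the LLM.int8() decomposition: vector-wise absmax int8 quantization for the bulk of the matmul, with the few outlier feature columns kept in higher precision; the threshold value is the paper's default and the bitsandbytes CUDA kernels are not reproduced):
import torch

def int8_matmul_with_outliers(x, w, threshold=6.0):
    # x: [tokens, in_features], w: [out_features, in_features]
    outlier_cols = (x.abs().amax(dim=0) > threshold)        # feature dims with outliers
    x_reg, x_out = x[:, ~outlier_cols], x[:, outlier_cols]
    w_reg, w_out = w[:, ~outlier_cols], w[:, outlier_cols]

    sx = x_reg.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0   # per token row
    sw = w_reg.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0   # per output channel
    xq = (x_reg / sx).round().clamp(-127, 127)
    wq = (w_reg / sw).round().clamp(-127, 127)

    y_int8 = (xq @ wq.T) * sx * sw.T          # dequantize the int8 part
    y_fp = x_out @ w_out.T                    # outlier columns stay in full precision
    return y_int8 + y_fp

x = torch.randn(8, 4096); x[:, 10] *= 50      # inject an outlier feature dimension
w = torch.randn(4096, 4096)
y = int8_matmul_with_outliers(x, w)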
tag: FP8 Quantization
| NIPS22
| Qualcomm AI
paper link: here
code link: here
citation:
@misc{kuzmin2024fp8,
title={FP8 Quantization: The Power of the Exponent},
author={Andrey Kuzmin and Mart Van Baalen and Yuwei Ren and Markus Nagel and Jorn Peters and Tijmen Blankevoort},
year={2024},
eprint={2208.09225},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: ZeroQuant
| NIPS22
| DeepSpeed
| Microsoft
paper link: here
code link: here
follow-up work: here
citation:
@misc{yao2022zeroquantefficientaffordableposttraining,
title={ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers},
author={Zhewei Yao and Reza Yazdani Aminabadi and Minjia Zhang and Xiaoxia Wu and Conglong Li and Yuxiong He},
year={2022},
eprint={2206.01861},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2206.01861},
}
tag: QSparse
| Joint Quantization and Pruning
| UCSD
paper link: here
code link: here
citation:
@misc{zhang2021trainingdeepneuralnetworks,
title={Training Deep Neural Networks with Joint Quantization and Pruning of Weights and Activations},
author={Xinyu Zhang and Ian Colbert and Ken Kreutz-Delgado and Srinjoy Das},
year={2021},
eprint={2110.08271},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2110.08271},
}
tag: Integer Quantization
| Nvidia
paper link: here
citation:
@misc{wu2020integerquantizationdeeplearning,
title={Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation},
author={Hao Wu and Patrick Judd and Xiaojie Zhang and Mikhail Isaev and Paulius Micikevicius},
year={2020},
eprint={2004.09602},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2004.09602},
}
tag: STE
| ICLR19
| UCLA
paper link: here
citation:
@misc{yin2019understandingstraightthroughestimatortraining,
title={Understanding Straight-Through Estimator in Training Activation Quantized Neural Nets},
author={Penghang Yin and Jiancheng Lyu and Shuai Zhang and Stanley Osher and Yingyong Qi and Jack Xin},
year={2019},
eprint={1903.05662},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1903.05662},
}
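example (a minimal sketch of the straight-through estimator: quantize in the forward pass, pass the gradient through unchanged in the backward pass; this is the generic trick, not the specific variants analyzed in the paper):
import torch

def fake_quantize_ste(x, n_bits=8):
    # Rounding has zero gradient almost everywhere, so the backward pass
    # pretends the quantizer was the identity function.
    qmax = 2 ** (n_bits - 1) - 1
    scale = x.abs().max().clamp(min=1e-8) / qmax
    x_q = (x / scale).round().clamp(-qmax - 1, qmax) * scale
    return x + (x_q - x).detach()

w = torch.randn(64, 64, requires_grad=True)
loss = fake_quantize_ste(w).sum()
loss.backward()
print(w.grad.unique())   # all ones: the gradient flowed straight through the rounding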
tag: Quantization Whitepaper
| Google
paper link: here
code link: here
citation:
@misc{krishnamoorthi2018quantizing,
title={Quantizing deep convolutional networks for efficient inference: A whitepaper},
author={Raghuraman Krishnamoorthi},
year={2018},
eprint={1806.08342},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
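example (a minimal sketch of the asymmetric affine quantization scheme covered in the whitepaper, q = clamp(round(x / scale) + zero_point, 0, 255); only the per-tensor variant is shown, and the per-channel and QAT recipes from the paper are omitted):
import torch

def affine_quantize(x, n_bits=8):
    qmin, qmax = 0, 2 ** n_bits - 1
    x_min, x_max = x.min().clamp(max=0), x.max().clamp(min=0)   # keep 0 exactly representable
    scale = (x_max - x_min).clamp(min=1e-8) / (qmax - qmin)
    zero_point = (qmin - x_min / scale).round().clamp(qmin, qmax)
    q = (x / scale + zero_point).round().clamp(qmin, qmax)
    return q, scale, zero_point

def affine_dequantize(q, scale, zero_point):
    return (q - zero_point) * scale

x = torch.randn(1024) * 3 + 1.5
q, s, z = affine_quantize(x)
x_hat = affine_dequantize(q, s, z)
print((x - x_hat).abs().max())   # reconstruction error on the order of scale / 2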